Commit 38c3e463 authored by Renán Sosa Guillen

tracker

parent dd06c629
...@@ -4,7 +4,6 @@ import sys ...@@ -4,7 +4,6 @@ import sys
import json import json
import os import os
import datetime import datetime
from collections import OrderedDict
""" """
Descarga las noticias de un sitio desde entre dos fechas especificas. Descarga las noticias de un sitio desde entre dos fechas especificas.
...@@ -12,62 +11,7 @@ USO: ...@@ -12,62 +11,7 @@ USO:
tracker.py data.json tracker.py data.json
""" """
# Fields copied into an output row, in the order they must appear.
_ROW_FIELDS = ("date", "topic", "title", "author", "location", "text", "url")


def dictRowGenerator(line):
    """Return the known fields of one record as ordered (key, value) pairs.

    Only the keys listed in ``_ROW_FIELDS`` are copied, always in that
    order; any other keys in ``line`` are ignored and missing keys are
    skipped silently.

    Args:
        line: A mapping-like record supporting ``line[key]`` lookups.

    Returns:
        A list of ``(key, value)`` tuples, suitable for building an
        ``OrderedDict``. The list is empty when no known key is present
        or when ``line`` does not support string-key lookups.
    """
    row = []
    for field in _ROW_FIELDS:
        try:
            row.append((field, line[field]))
        except (LookupError, TypeError):
            # Best-effort extraction: a missing key (or a record that is
            # not a mapping at all) just means the field is left out.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
    return row
def parse_json(mydir, media, filename):
    """Rewrite a JSON array file with a fixed key order and move it to mydir.

    Reads ``media/filename`` (expected to contain a JSON array), reduces
    each element to the fields selected by ``dictRowGenerator`` in their
    fixed order, writes the result to ``filename`` in the current working
    directory, and finally moves that file into ``mydir``.

    Args:
        mydir: Destination directory for the rewritten file.
        media: Directory (relative to the current working directory)
            that holds the input file.
        filename: Name of both the input file inside ``media`` and the
            output file.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import shutil

    with open(os.path.join(media, filename)) as inputFile:
        records = json.load(inputFile)

    serialized = [
        json.dumps(OrderedDict(dictRowGenerator(record)))
        for record in records
    ]

    # "w" rather than "a": appending to a leftover file from a previous run
    # would produce two concatenated arrays, which is not valid JSON.
    with open(filename, "w") as outputFile:
        outputFile.write("[" + ",\n".join(serialized) + "]")

    # Move only after the file is closed so all data is flushed. Naming the
    # full destination path keeps the overwrite-if-exists behaviour of `mv`
    # without going through a shell.
    shutil.move(filename, os.path.join(mydir, os.path.basename(filename)))
## INICIO
# today = datetime.datetime.now() # today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/" baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/" scrapyDir = "/home/geoint/crawlersNoticias/"
...@@ -122,7 +66,7 @@ with open(sys.argv[1]) as data_file: ...@@ -122,7 +66,7 @@ with open(sys.argv[1]) as data_file:
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if hasta.year!=y else hasta.timetuple().tm_yday)+1): for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if hasta.year!=y else hasta.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json" filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day) scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd() mydir = os.getcwd()
print mydir print mydir
os.chdir(scrapyDir+s['crawler']) os.chdir(scrapyDir+s['crawler'])
...@@ -130,13 +74,8 @@ with open(sys.argv[1]) as data_file: ...@@ -130,13 +74,8 @@ with open(sys.argv[1]) as data_file:
print scrapycommand print scrapycommand
os.system(scrapycommand) os.system(scrapycommand)
fileSize = os.stat(filename).st_size fileSize = os.stat(filename).st_size
if fileSize <= 3: if fileSize <= 3: os.system("rm " + filename)
os.system("rm " + filename) else: os.system("mv " + filename + " " + mydir)
else:
os.chdir("..")
parse_json(mydir, media, filename)
# os.system("mv " + filename + " " + mydir)
os.system("rm " + media + "/" + filename)
os.chdir(mydir) os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1) currentDate = currentDate + datetime.timedelta(days=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment