Commit 87ed4374 authored by Renán Sosa Guillen

crawl often

parent 6e0a0add
...@@ -36,11 +36,11 @@ def dictRowGenerator(line): ...@@ -36,11 +36,11 @@ def dictRowGenerator(line):
except: except:
pass pass
try: try:
row.append(("url", line['url'])) row.append(("text", line['text']))
except: except:
pass pass
try: try:
row.append(("text", line['text'])) row.append(("url", line['url']))
except: except:
pass pass
...@@ -50,7 +50,6 @@ def dictRowGenerator(line): ...@@ -50,7 +50,6 @@ def dictRowGenerator(line):
today = datetime.datetime.now() today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/" baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/" scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file: with open(sys.argv[1]) as data_file:
siteList = json.load(data_file) siteList = json.load(data_file)
os.chdir(baseDir) os.chdir(baseDir)
...@@ -112,7 +111,7 @@ with open(sys.argv[1]) as data_file: ...@@ -112,7 +111,7 @@ with open(sys.argv[1]) as data_file:
for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1): for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
YESTERDAY = False YESTERDAY = False
filename = currentDate.strftime('%Y-%m-%d') + ".json" filename = currentDate.strftime('%Y-%m-%d') + ".json"
scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd() mydir = os.getcwd()
print mydir print mydir
os.chdir(scrapyDir + s['crawler']) os.chdir(scrapyDir + s['crawler'])
...@@ -171,33 +170,46 @@ with open(sys.argv[1]) as data_file: ...@@ -171,33 +170,46 @@ with open(sys.argv[1]) as data_file:
f1 = mydir + '/' + filenameYesterday f1 = mydir + '/' + filenameYesterday
f2 = filename f2 = filename
f3 = baseDir + media + '/' + filenameYesterday f3 = baseDir + media + '/' + filenameYesterday
with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3: with open(f2) as infile2, open(f3, 'a') as infile3:
master = json.load(infile1) try:
infile1 = open(f1)
master = json.load(infile1)
yesterdayFlag = True
except:
yesterdayFlag = False
urlSet = set()
slave = json.load(infile2) slave = json.load(infile2)
urlSet = set([line['url'] for line in master])
counter = 0
infile3.write("[") infile3.write("[")
for line in master: if yesterdayFlag:
counter += 1 urlSet = set([line['url'] for line in master])
counter = 0
for line in master:
counter += 1
auxRow = dictRowGenerator(line) auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow) row = OrderedDict(auxRow)
if counter == 1: if counter == 1:
infile3.write(json.dumps(row)) infile3.write(json.dumps(row))
elif counter > 1: elif counter > 1:
infile3.write(",\n" + json.dumps(row)) infile3.write(",\n" + json.dumps(row))
counter = 0
for line in slave: for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date() lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate: if not line['url'] in urlSet and lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line) auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow) row = OrderedDict(auxRow)
infile3.write(",\n" + json.dumps(row)) if not yesterdayFlag and counter == 1:
infile3.write(json.dumps(row))
else:
infile3.write(",\n" + json.dumps(row))
infile3.write("]") infile3.write("]")
if yesterdayFlag: infile1.close()
os.system("mv " + f3 + " " + mydir) os.system("mv " + f3 + " " + mydir)
os.system("rm " + f2) os.system("rm " + f2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment