Commit a6839698 authored by Renán Sosa Guillen

crawl often

parent 95002f31
[{"nombre": "El Financiero", "crawler": "descarga_por_rss/elFinanciero", "url": "http://www.elfinanciero.com.mx/"},
{"nombre": "El Universal", "crawler": "descarga_por_rss/elUniversal", "url": "http://www.eluniversal.com.mx/"},
{"nombre": "El Sol de Mexico", "crawler": "descarga_por_rss/solDeMex", "url": "https://www.elsoldemexico.com.mx/"},
{"nombre": "Diario de Yucatan", "crawler": "descarga_hacia_atras/diarioYucatan", "url": "http://www.yucatan.com.mx/"}]
\ No newline at end of file
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para la descarga automatica de noticias por rss
"""
import sys
import json
import os
import datetime
from collections import OrderedDict
def dictRowGenerator(line):
    """Build an ordered list of (field, value) pairs from a crawled news item.

    Only the fields actually present in ``line`` are included, always in the
    same canonical order, so the resulting ``OrderedDict`` serializes to JSON
    with a stable column order.

    Args:
        line: dict with any subset of the known news-item keys.

    Returns:
        List of ``(key, value)`` tuples, ready for ``OrderedDict(...)``.
    """
    # Canonical column order for the output JSON records.
    fields = ("date", "topic", "title", "author", "location", "url", "text")
    # Membership test instead of seven copy-pasted bare try/except blocks:
    # same result for dict input, without silently swallowing unrelated
    # exceptions (e.g. a non-dict argument now fails loudly).
    return [(key, line[key]) for key in fields if key in line]
# -----------------------------------------------------------------------------
# Main script body.
#
# For every site listed in the JSON config passed as sys.argv[1], run that
# site's scrapy crawler for each pending day, merge the freshly scraped items
# with any previously archived JSON for the same date (deduplicating by URL),
# and store the merged file under baseDir/<media>/<year>/<YYYY-MM-DD>.json.
#
# NOTE(review): the indentation below was reconstructed from a whitespace-
# stripped copy of this file — verify nesting against the original source.
# -----------------------------------------------------------------------------
today = datetime.datetime.now()
# baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# scrapyDir = "/home/geoint/crawlersNoticias/"
baseDir = "/home/cna_service/rss_test/"  # root of the per-site news archive
scrapyDir = "/home/cna_service/crawler/crawlersNoticias/"  # root of the scrapy crawler projects

# sys.argv[1]: path to the site list; each entry has "nombre", "crawler", "url".
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)

os.chdir(baseDir)
for s in siteList:
    # desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
    # "desde" (Spanish: "since") — start date for this run; here always today.
    desde = today
    print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
    # Media name is the crawler path after the first slash,
    # e.g. "descarga_por_rss/elUniversal" -> "elUniversal".
    media = s['crawler'][s['crawler'].find("/") + 1:]
    try:
        os.makedirs(media)
    except:
        # Directory already exists (or makedirs failed) — continue anyway.
        print "ok"
    os.chdir(media)

    # Locate the most recent year directory, deleting any stray *.json files
    # left at the media level instead of inside a year directory.
    CORRECT_YEAR = False
    while not CORRECT_YEAR:
        lstYears = os.listdir(".")
        lstYears.sort()
        if len(lstYears) > 0:
            element = lstYears[len(lstYears) - 1]  # lexicographically last entry
            if element[-4:] == 'json':
                os.system('rm ' + element)  # stray dump file: remove and re-scan
            else:
                CORRECT_YEAR = True  # presumably a year directory — TODO confirm
        else:
            break  # empty media dir: fall back to desde.year below
    if CORRECT_YEAR: year = int(element)
    else: year = desde.year

    for y in range(year, today.year + 1):
        print y
        try:
            os.makedirs(str(y))
        except:
            # Year directory already exists.
            print "ok"
        os.chdir(str(y))
        # print os.getcwd()
        lstDays = os.listdir(".")
        lstDays = [l for l in lstDays if not l.startswith('.')]  # skip hidden files
        lstDays.sort()
        print lstDays
        day = desde.timetuple().tm_yday  # day-of-year to start crawling from
        print day
        currentDate = desde.date()
        # if len(lstDays) > 0:
        # strDate = lstDays[len(lstDays) - 1]
        # strDate = strDate[:strDate.find(".")]
        # currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
        # day = currentDate.timetuple().tm_yday
        # elif y != desde.year:
        # currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
        # day = 1

        # Iterate day-of-year from `day` through Dec 31 (past years) or
        # through today (current year), inclusive.
        for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
            YESTERDAY = False  # set when the scrape contains items dated one day earlier
            filename = currentDate.strftime('%Y-%m-%d') + ".json"
            scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename
            mydir = os.getcwd()  # remember the year dir so we can come back
            print mydir
            os.chdir(scrapyDir + s['crawler'])  # scrapy must run inside its project dir
            print media
            print scrapycommand
            os.system(scrapycommand)
            fileSize = os.stat(filename).st_size
            if fileSize <= 3:
                # Essentially empty dump ("[]" or less): nothing scraped.
                os.system("rm " + filename)
            else:
                # f1: previously archived file for this date (may not exist)
                # f2: the file scrapy just produced (cwd = crawler project dir)
                # f3: merge target, appended under baseDir/<media>/
                f1 = mydir + "/" + filename
                f2 = filename
                f3 = baseDir + media + "/" + filename
                try:
                    with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
                        master = json.load(infile1)  # existing archive
                        slave = json.load(infile2)  # fresh scrape
                        urlSet = set([line['url'] for line in master])  # dedupe key
                        counter = 0
                        infile3.write("[")
                        # Copy archived items belonging to currentDate.
                        for line in master:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
                            if lineDate == currentDate:
                                counter += 1
                                auxRow = dictRowGenerator(line)
                                row = OrderedDict(auxRow)
                                if counter == 1:
                                    infile3.write(json.dumps(row))
                                elif counter > 1:
                                    infile3.write(",\n" + json.dumps(row))
                        # Append fresh items not already archived (by URL).
                        for line in slave:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
                            if not line['url'] in urlSet and lineDate == currentDate:
                                auxRow = dictRowGenerator(line)
                                row = OrderedDict(auxRow)
                                infile3.write(",\n" + json.dumps(row))
                            elif lineDate != currentDate and (currentDate - lineDate).days == 1:
                                # Scrape also holds items from the previous
                                # day: merge those in the block below.
                                YESTERDAY = True
                        infile3.write("]")
                    os.system("mv " + f3 + " " + mydir)
                    # os.system("rm " + f2)
                except:
                    # Presumably no previous archive exists (f1 missing), so
                    # keep the fresh scrape as the archive — TODO confirm.
                    os.system("cp " + f2 + " " + mydir)
                if YESTERDAY:
                    # Re-run the merge against yesterday's archive for the
                    # items dated one day earlier.
                    currentDate -= datetime.timedelta(days=1)
                    filenameYesterday = currentDate.strftime('%Y-%m-%d') + ".json"
                    f1 = mydir + '/' + filenameYesterday
                    f2 = filename
                    f3 = baseDir + media + '/' + filenameYesterday
                    with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
                        master = json.load(infile1)
                        slave = json.load(infile2)
                        urlSet = set([line['url'] for line in master])
                        counter = 0
                        infile3.write("[")
                        # Yesterday's archive is copied wholesale (no date filter).
                        for line in master:
                            counter += 1
                            auxRow = dictRowGenerator(line)
                            row = OrderedDict(auxRow)
                            if counter == 1:
                                infile3.write(json.dumps(row))
                            elif counter > 1:
                                infile3.write(",\n" + json.dumps(row))
                        # Add scraped items dated yesterday and not yet archived.
                        for line in slave:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
                            if not line['url'] in urlSet and lineDate == currentDate:
                                auxRow = dictRowGenerator(line)
                                row = OrderedDict(auxRow)
                                infile3.write(",\n" + json.dumps(row))
                        infile3.write("]")
                    os.system("mv " + f3 + " " + mydir)
                os.system("rm " + f2)  # remove the scrapy dump once merged
            os.chdir(mydir)
            # Advance to the next calendar day (+2 compensates the -1 above).
            if YESTERDAY:
                currentDate += datetime.timedelta(days=2)
            else:
                currentDate += datetime.timedelta(days=1)
        os.chdir("..")  # leave the year directory
    os.chdir("..")  # leave the media directory
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the crawler that matches the site
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment