Commit be2e6424 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawl all

parent dafd9a05
......@@ -4,73 +4,137 @@ import sys
import json
import os
import datetime
from collections import OrderedDict
def dictRowGenerator(line):
    """Build an ordered list of (key, value) pairs from a crawled article record.

    Keys are emitted in a fixed canonical order (date, topic, title, author,
    location, text, url); keys missing from *line* are silently skipped so
    partial records are still serialized.

    Parameters:
        line -- mapping with the article's fields (typically a dict coming
                from json.load of a crawler output file).

    Returns:
        list of (key, value) tuples, ready to feed into OrderedDict().
    """
    row = []
    # The tuple below fixes the column order of the re-serialized JSON rows.
    for key in ("date", "topic", "title", "author", "location", "text", "url"):
        try:
            row.append((key, line[key]))
        except KeyError:
            # Field absent from this record -- skip it, same as the original
            # per-field try/except blocks did.
            pass
    return row
def parse_json(mydir, media, filename):
    """Re-serialize a crawled JSON file with canonical key order, then move it.

    Reads <media>/<filename> (a JSON array of article dicts), rewrites every
    record with its keys in the order produced by dictRowGenerator, writes the
    result to <filename> in the current directory, and finally moves that file
    into *mydir*.

    Parameters:
        mydir    -- destination directory for the normalized file.
        media    -- subdirectory holding the freshly crawled input file.
        filename -- name of the JSON file (e.g. "2017-01-31.json").
    """
    # "w" instead of the original "a": appending to a leftover file from a
    # previous run would produce concatenated, invalid JSON documents.
    with open(media + "/" + filename) as inputFile, open(filename, "w") as outputFile:
        jsonFile = json.load(inputFile)
        # One json.dumps per record keeps the original one-record-per-line
        # layout ("[r1,\nr2,\n...]") without the manual first/rest counter.
        rows = [json.dumps(OrderedDict(dictRowGenerator(line))) for line in jsonFile]
        outputFile.write("[" + ",\n".join(rows) + "]")
    # Move only after the 'with' block so the output file is flushed and
    # closed before it is relocated.
    os.system("mv " + filename + " " + mydir)
## INICIO (START) -- main driver: walk the configured site list and crawl
## every missing day for each site into baseDir/<media>/<year>/.
# NOTE(review): this chunk reads like a raw diff dump with indentation
# stripped -- several spans below (the second `siteList = json.load(...)`
# block and the second `lstDays` block) duplicate the ones right above them,
# which would be a syntax/logic error in a real file. Documented as-is;
# confirm against the actual committed version before running.
today = datetime.datetime.now()
# Archive root: one subdirectory per media outlet, then per year.
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# Root containing one scrapy project per news site.
scrapyDir = "/home/geoint/crawlersNoticias/"
# sys.argv[1]: JSON config listing the sites; keys used below:
# 'nombre' (display name), 'desde' (start date DD-MM-YYYY), 'crawler' (path).
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
# 'desde' = first date to crawl for this site, parsed from DD-MM-YYYY.
desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
# Media directory name = crawler path component after the first '/'.
media = s['crawler'][s['crawler'].find("/")+1:]
try:
os.makedirs(media)
except:
# Directory already exists -- treated as OK.
print "ok"
os.chdir(media)
# Resume from the most recent year directory already on disk, if any.
lstYears = os.listdir(".")
lstYears.sort()
year = desde.year
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
for y in range(year, today.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# NOTE(review): the block below repeats the site/year setup just above
# (diff artifact -- old and new hunk interleaved). Kept verbatim.
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].find("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
year = desde.year
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
for y in range(year, today.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
# Resume within the year: find the newest YYYY-MM-DD.json already saved
# and restart the crawl from that day-of-year.
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
if len(lstDays) > 0:
# Strip the ".json" suffix to recover the date of the newest file.
strDate = lstDays[len(lstDays)-1]
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
day = currentDate.timetuple().tm_yday
elif y != desde.year:
# Later year with no files yet: start on January 1st.
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
# Crawl every remaining day of year y (up to today if y is the current year).
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias -t json -o " + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
# Run the site's scrapy project from inside its own directory.
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
# Files of 3 bytes or fewer ("[]" + newline) mean no news that day.
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system('rm '+filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# NOTE(review): the block below repeats the per-day crawl loop just above,
# with two differences in the newer variant: scrapy runs with --nolog, and
# non-empty output is normalized via parse_json instead of a plain mv.
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
day = currentDate.timetuple().tm_yday
elif y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3:
# Empty result ("[]" or less) -- discard it.
os.system("rm " + filename)
else:
os.chdir("..")
# Normalize key order and place the file into mydir, then drop the raw copy.
parse_json(mydir, media, filename)
# os.system("mv " + filename + " " + mydir)
os.system("rm " + media + "/" + filename)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment