Commit f8eba08a authored by Renán Sosa Guillen

crawl backwards

parent 6d213ea2
#!/usr/bin/python
# -*- coding: utf-8 -*-
import datetime
import json
import os
import shutil
import subprocess
import sys
today = datetime.datetime.now()

# Former deployment paths, kept for reference:
# baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# scrapyDir = "/home/geoint/crawlersNoticias/"
baseDir = "/home/cna_service/noticias/"
scrapyDir = "/home/cna_service/crawler/crawlersNoticias/"

# Passed to every crawl as the ``filename`` setting and then looked up inside
# the crawler's own directory once the crawl has finished.
OUTPUT_FILENAME = "noticias.json"
# A crawl output of this many bytes or fewer is discarded without parsing
# (presumably an empty JSON document -- confirm against the crawlers).
EMPTY_OUTPUT_MAX_BYTES = 3


def ensure_dir(path):
    """Create *path* and any missing parents.

    Returns True when the directory was created and False when it already
    existed. Any other failure (permissions, a regular file in the way, ...)
    is re-raised instead of being silently ignored.
    """
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise
        return False
    return True


def visible_entries(path):
    """Return the sorted entry names of *path* that do not start with a dot."""
    return sorted(name for name in os.listdir(path) if not name.startswith("."))


def latest_year(names, default):
    """Return the highest all-digit name in *names* as an int.

    Names that are not purely numeric are skipped. *default* is returned when
    no numeric name is present, e.g. for a media directory that has just been
    created.
    """
    years = [int(name) for name in names if name.isdigit()]
    return max(years) if years else default


def latest_date(names):
    """Return the date encoded in the last (sorted) non-hidden name, or None.

    Names are expected to look like ``YYYY-MM-DD`` optionally followed by an
    extension; everything from the first dot on is ignored. A name without
    any dot is parsed as-is. Raises ValueError when the selected name is not
    a ``%Y-%m-%d`` date.
    """
    visible = sorted(name for name in names if not name.startswith("."))
    if not visible:
        return None
    stem = visible[-1].partition(".")[0]
    return datetime.datetime.strptime(stem, "%Y-%m-%d")


def build_scrapy_command(filename, since=None):
    """Return the scrapy invocation as an argument list.

    When *since* is a date, its year, month and day are forwarded to the
    spider as ``-a`` arguments; otherwise the spider is run without them.
    """
    command = ["scrapy", "crawl", "noticias", "--nolog", "-s", "filename=" + filename]
    if since is not None:
        command += [
            "-a", "year=" + str(since.year),
            "-a", "month=" + str(since.month),
            "-a", "day=" + str(since.day),
        ]
    return command


def collect_parsed_files(media):
    """Move ``scrapyDir/<media>/<year>/<file>`` into ``baseDir/<media>/<year>/``.

    The staging tree under scrapyDir is expected to exist at this point
    (presumably written by parse_date_files.py -- confirm) and is removed once
    its files have been moved. An existing destination file with the same
    name is replaced, as ``mv`` would do.
    """
    staging_dir = os.path.join(scrapyDir, media)
    for year_name in sorted(os.listdir(staging_dir)):
        src_year = os.path.join(staging_dir, year_name)
        if not os.path.isdir(src_year):
            # Stray files are cleaned up with the staging tree below.
            continue
        dst_year = os.path.join(baseDir, media, year_name)
        ensure_dir(dst_year)
        for day_name in visible_entries(src_year):
            shutil.move(
                os.path.join(src_year, day_name),
                os.path.join(dst_year, day_name),
            )
        shutil.rmtree(src_year)
    shutil.rmtree(staging_dir)


def crawl_site(site):
    """Run the crawler described by *site* and archive whatever it produced.

    *site* is a mapping whose ``'crawler'`` value is the crawler's path
    relative to scrapyDir; the last path component is used as the media name.
    Every path is absolute, so the result does not depend on the current
    working directory or on which site was processed before.
    """
    crawler = site['crawler']
    media = os.path.basename(crawler)

    media_dir = os.path.join(baseDir, media)
    if not ensure_dir(media_dir):
        print("ok")

    # A media directory without any year folder falls back to the current year.
    year = latest_year(os.listdir(media_dir), today.year)
    print(year)
    year_dir = os.path.join(media_dir, str(year))
    if not ensure_dir(year_dir):
        print("ok")

    day_names = visible_entries(year_dir)
    print(day_names)
    if day_names:
        print(day_names[-1])
    command = build_scrapy_command(OUTPUT_FILENAME, latest_date(day_names))

    crawler_dir = os.path.join(scrapyDir, crawler)
    print(year_dir)
    print(media)
    print(" ".join(command))
    # Argument lists plus ``cwd`` replace shell strings and chdir bookkeeping.
    subprocess.call(command, cwd=crawler_dir)

    output_path = os.path.join(crawler_dir, OUTPUT_FILENAME)
    if os.path.getsize(output_path) > EMPTY_OUTPUT_MAX_BYTES:
        subprocess.call(["python3", "parse_date_files.py", crawler], cwd=scrapyDir)
        collect_parsed_files(media)
    # The raw crawl output is removed whether or not it was parsed.
    os.remove(output_path)


def main():
    """Crawl every site listed in the JSON file named by the first argument."""
    if len(sys.argv) < 2:
        sys.exit("usage: " + os.path.basename(sys.argv[0]) + " SITES_JSON")
    with open(sys.argv[1]) as data_file:
        siteList = json.load(data_file)
    for site in siteList:
        crawl_site(site)


if __name__ == "__main__":
    main()

# Earlier invocation, kept for reference (runs the crawler that corresponds
# to the site):
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment