Commit 5f865616 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

backwards tracker

parent 2e1d39ff
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import json
import os
import datetime
"""
Script para la descarga de histórico de medios del tipo "descarga_hacia_atras".
"""
today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "noticias.json"
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# print strDate
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
#
# else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'])
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment