Commit a79008f2 authored by Mario Chirinos Colunga's avatar Mario Chirinos Colunga 💬

script

parent a2cb46ab
......@@ -2,11 +2,75 @@
# -*- coding: utf-8 -*-
"""Run the per-site Scrapy news crawlers, producing one JSON file per day.

Usage: python crawler_script.py <sites.json>

The JSON file is a list of objects.  Each object must provide:
  * 'nombre'  - display name, used only for logging;
  * 'crawler' - path of the Scrapy project relative to ``scrapyDir``; the
                part after the first '/' names the output folder;
  * 'desde'   - first date to crawl, formatted dd-mm-YYYY.

Output layout: <baseDir>/<media>/<year>/<YYYY-MM-DD>.json.  On every run the
script resumes from the newest file already present (that day is crawled
again) and stops at the current day.

``print`` is always called with a single argument so that the script behaves
the same under Python 2 and Python 3.
"""
import datetime
import errno
import json
import os
import shutil
import subprocess
import sys

today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# NOTE(review): spelled "cawlersNoticias" here, while the launcher script uses
# "crawlerNoticias" -- confirm which directory actually exists.
scrapyDir = "/home/geoint/cawlersNoticias/"


def media_name(crawler):
    """Return the part of *crawler* after its first '/' (all of it if none)."""
    return crawler[crawler.find("/") + 1:]


def ensure_dir(path):
    """Create *path* (and parents); print "ok" if it already exists.

    Any other OSError (permissions, read-only file system, ...) is re-raised
    instead of being silently swallowed.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
        print("ok")


def first_year(desde, entries):
    """Return the year to start from for one media folder.

    *entries* are the names found in the media folder.  The highest all-digit
    name wins; names that are not plain numbers (hidden files, notes, ...)
    are ignored.  Without any year folder the year of *desde* is used.
    """
    years = [int(name) for name in entries if name.isdigit()]
    return max(years) if years else desde.year


def resume_date(desde, year, filenames):
    """Return the first date to crawl inside the folder of *year*.

    *filenames* are the names found in that folder.  The newest
    ``YYYY-MM-DD.*`` name wins, so the last stored day is crawled again.
    Hidden files and names that do not start with a valid date are ignored.
    With no usable file the result is *desde* for the starting year and
    1 January for any other year.
    """
    found = []
    for name in filenames:
        if name.startswith("."):
            continue
        try:
            parsed = datetime.datetime.strptime(name.split(".")[0], "%Y-%m-%d")
        except ValueError:
            continue
        found.append(parsed.date())
    if found:
        return max(found)
    if year != desde.year:
        return datetime.date(year, 1, 1)
    return desde


def crawl_dates(start, year, last_day):
    """Yield every date from *start* to the end of *year*, capped at *last_day*.

    Using real calendar dates (rather than counting to day 365) includes
    31 December of leap years and never asks for days after *last_day*.
    """
    end = min(datetime.date(year, 12, 31), last_day)
    current = start
    while current <= end:
        yield current
        current += datetime.timedelta(days=1)


def crawl_day(crawler, day, dest_dir):
    """Crawl one *day* with the Scrapy project *crawler*; store it in *dest_dir*.

    Equivalent shell command, run inside ``scrapyDir + crawler``:
        scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
    """
    filename = day.strftime("%Y-%m-%d") + ".json"
    command = [
        "scrapy", "crawl", "noticias", "-t", "json", "-o", filename,
        "-a", "year=" + str(day.year),
        "-a", "month=" + str(day.month),
        "-a", "day=" + str(day.day),
    ]
    crawler_dir = scrapyDir + crawler
    print(dest_dir)
    print(" ".join(command))
    # An argument list (no shell) keeps odd characters in names from being
    # interpreted by a shell.
    subprocess.call(command, cwd=crawler_dir)
    produced = os.path.join(crawler_dir, filename)
    if os.path.exists(produced):
        # Give the full target name so an older file for the same day is
        # replaced, as `mv` would do.
        shutil.move(produced, os.path.join(dest_dir, filename))
    else:
        print("no output produced: " + filename)


def main(argv):
    """Crawl every site listed in the JSON file named by ``argv[1]``."""
    if len(argv) < 2:
        sys.exit("usage: " + argv[0] + " <sites.json>")
    with open(argv[1]) as data_file:
        siteList = json.load(data_file)
    print(siteList)
    last_day = today.date()
    for s in siteList:
        desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y').date()
        print(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
        media_dir = os.path.join(baseDir, media_name(s['crawler']))
        ensure_dir(media_dir)
        start_year = first_year(desde, os.listdir(media_dir))
        for y in range(start_year, today.year + 1):
            print(y)
            year_dir = os.path.join(media_dir, str(y))
            ensure_dir(year_dir)
            stored = sorted(n for n in os.listdir(year_dir) if not n.startswith('.'))
            print(stored)
            start = resume_date(desde, y, stored)
            print(start.timetuple().tm_yday)
            for day in crawl_dates(start, y, last_day):
                crawl_day(s['crawler'], day, year_dir)
    print(today.year)


if __name__ == "__main__":
    main(sys.argv)
#!/bin/bash
# Launch the news crawler driver with its site list; both files live in the
# same directory.
CRAWLER_HOME=/home/geoint/crawlerNoticias
python "${CRAWLER_HOME}/crawler_script.py" "${CRAWLER_HOME}/crawler_data.json"
[
{"nombre": "alChile", "crawler": "sitios_yucatan/alChile"},
{"nombre": "desdeElBalcon", "crawler": "sitios_yucatan/desdeElBalcon"},
{"nombre": "diarioYucatan", "crawler": "sitios_yucatan/diarioYucatan"},
{"nombre": "grilloPorteno", "crawler": "sitios_yucatan/grilloPorteno"},
{"nombre": "laJornadaMaya", "crawler": "sitios_yucatan/laJornadaMaya"},
{"nombre": "laVerdadYuc", "crawler": "sitios_yucatan/laVerdadYuc"},
{"nombre": "lectorMX", "crawler": "sitios_yucatan/lectorMX"},
{"nombre": "miPuntoDeVista", "crawler": "sitios_yucatan/miPuntoDeVista"},
{"nombre": "notirivas", "crawler": "sitios_yucatan/notirivas"},
{"nombre": "notisureste", "crawler": "sitios_yucatan/notisureste"},
{"nombre": "puntoMedio", "crawler": "sitios_yucatan/puntoMedio"},
{"nombre": "sona893", "crawler": "sitios_yucatan/sona893"},
{"nombre": "yucatanALaMano", "crawler": "sitios_yucatan/yucatanALaMano"},
{"nombre": "yucatanAlMinuto", "crawler": "sitios_yucatan/yucatanAlMinuto"},
{"nombre": "yucatanEnCorto", "crawler": "sitios_yucatan/yucatanEnCorto"},
{"nombre": "diarioYaqui", "crawler": "otros_sitios/diarioYaqui"},
{"nombre": "laJornada", "crawler": "otros_sitios/laJornada"},
{"nombre": "laJornadaAgs", "crawler": "otros_sitios/laJornadaAgs"},
{"nombre": "laJornadaBC", "crawler": "otros_sitios/laJornadaBC"},
{"nombre": "laJornadaGro", "crawler": "otros_sitios/laJornadaGro"},
{"nombre": "laJornadaOte", "crawler": "otros_sitios/laJornadaOte"},
{"nombre": "laJornadaSanLuis", "crawler": "otros_sitios/laJornadaSanLuis"},
{"nombre": "laJornadaVer", "crawler": "otros_sitios/laJornadaVer"},
{"nombre": "laJornadaZac", "crawler": "otros_sitios/laJornadaZac"}
]
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment