script

parent a2cb46ab
......@@ -2,11 +2,75 @@
# -*- coding: utf-8 -*-
"""Run the per-site Scrapy news crawlers incrementally, one day per run.

Usage:
    python crawler_script.py SITES.json

SITES.json is a JSON list of objects.  Each object must provide:

* ``nombre``  -- display name of the site (only used for logging),
* ``crawler`` -- path of the Scrapy project relative to ``scrapyDir``;
  the part after the first "/" names the output directory,
* ``desde``   -- first date to crawl, formatted ``dd-mm-YYYY``.

Output layout: ``baseDir/<media>/<year>/<YYYY-MM-DD>.json``.  On every
run the crawl resumes from the newest year directory and, inside each
year, from the newest dated file (that day is crawled again and its
file replaced).
"""
import datetime
import json
import os
import shutil
import subprocess
import sys

baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# NOTE(review): "cawlersNoticias" looks like a typo -- the launcher script
# invokes this file from /home/geoint/crawlerNoticias/.  Left unchanged
# because the real on-disk layout cannot be confirmed from here.
scrapyDir = "/home/geoint/cawlersNoticias/"

DATE_FORMAT = '%Y-%m-%d'


def ensure_dir(path):
    """Create *path* (and parents) unless it already is a directory."""
    if not os.path.isdir(path):
        os.makedirs(path)


def resume_year(year_names, since):
    """Return the year to resume from.

    *year_names* are the entries of a media directory.  The largest
    all-digit entry wins; with no such entry the year of *since* is used.
    Non-numeric entries (hidden files, stray folders) are ignored instead
    of crashing ``int()``.
    """
    years = [int(name) for name in year_names if name.isdigit()]
    if years:
        return max(years)
    return since.year


def last_crawled_date(file_names):
    """Return the newest date encoded in *file_names*, or None.

    Only names whose part before the first "." parses as ``YYYY-MM-DD``
    are considered, so hidden files and unrelated files are skipped.
    """
    dates = []
    for name in file_names:
        stem = name.split(".", 1)[0]
        try:
            dates.append(datetime.datetime.strptime(stem, DATE_FORMAT))
        except ValueError:
            continue
    if dates:
        return max(dates)
    return None


def first_date_for_year(year, since, last_crawled):
    """Return the first date to crawl inside the directory of *year*.

    Resume from *last_crawled* when there is one; otherwise start at
    *since* for its own year and at January 1st for any other year.
    """
    if last_crawled is not None:
        return last_crawled
    if year != since.year:
        return datetime.datetime(year, 1, 1)
    return since


def last_date_for_year(year, today):
    """Return the last date to crawl for *year*.

    That is December 31st of *year*, capped at *today* (time of day
    dropped) so that future dates are never requested.  Using the real
    calendar end also covers day 366 of leap years.
    """
    today_midnight = datetime.datetime(today.year, today.month, today.day)
    return min(datetime.datetime(year, 12, 31), today_midnight)


def crawl_day(crawler_dir, target_dir, date):
    """Run the site's crawler for *date* and store the result.

    Scrapy is started inside *crawler_dir* and writes
    ``<YYYY-MM-DD>.json`` there; the file is then moved into
    *target_dir*.  Failures are reported and skipped so that one bad day
    does not abort the whole run.
    """
    filename = date.strftime(DATE_FORMAT) + ".json"
    # Argument list instead of a shell string: no quoting problems.
    command = [
        "scrapy", "crawl", "noticias", "-t", "json", "-o", filename,
        "-a", "year=" + str(date.year),
        "-a", "month=" + str(date.month),
        "-a", "day=" + str(date.day),
    ]
    print(target_dir)
    print(" ".join(command))
    try:
        subprocess.call(command, cwd=crawler_dir)
    except OSError as error:
        print("could not run crawler in " + crawler_dir + ": " + str(error))
        return
    produced = os.path.join(crawler_dir, filename)
    if os.path.exists(produced):
        shutil.move(produced, os.path.join(target_dir, filename))
    else:
        print("no output produced: " + produced)


def crawl_site(site, today):
    """Crawl every pending day of one *site* entry up to *today*."""
    since = datetime.datetime.strptime(site['desde'], '%d-%m-%Y')
    print(str(site['nombre'] + ", desde:" + since.strftime("%Y-%m-%d")))

    crawler = site['crawler']
    # Output directory name: everything after the first "/".
    media_dir = os.path.join(baseDir, crawler.split("/", 1)[-1])
    crawler_dir = os.path.join(scrapyDir, crawler)
    ensure_dir(media_dir)

    start_year = resume_year(os.listdir(media_dir), since)
    for year in range(start_year, today.year + 1):
        print(year)
        year_dir = os.path.join(media_dir, str(year))
        ensure_dir(year_dir)

        current = first_date_for_year(
            year, since, last_crawled_date(os.listdir(year_dir)))
        end = last_date_for_year(year, today)
        while current <= end:
            crawl_day(crawler_dir, year_dir, current)
            current += datetime.timedelta(days=1)


def main(argv):
    """Load the site list named by ``argv[1]`` and crawl every site."""
    with open(argv[1]) as data_file:
        siteList = json.load(data_file)
    today = datetime.datetime.now()
    for site in siteList:
        crawl_site(site, today)
    print(today.year)
    print(siteList)


if __name__ == "__main__":
    main(sys.argv)
#!/bin/bash
# Launch the news crawler driver, passing it the JSON list of sites.
crawler_home=/home/geoint/crawlerNoticias
python "${crawler_home}/crawler_script.py" "${crawler_home}/crawler_data.json"
[
{"nombre": "alChile", "crawler": "sitios_yucatan/alChile"},
{"nombre": "desdeElBalcon", "crawler": "sitios_yucatan/desdeElBalcon"},
{"nombre": "diarioYucatan", "crawler": "sitios_yucatan/diarioYucatan"},
{"nombre": "grilloPorteno", "crawler": "sitios_yucatan/grilloPorteno"},
{"nombre": "laJornadaMaya", "crawler": "sitios_yucatan/laJornadaMaya"},
{"nombre": "laVerdadYuc", "crawler": "sitios_yucatan/laVerdadYuc"},
{"nombre": "lectorMX", "crawler": "sitios_yucatan/lectorMX"},
{"nombre": "miPuntoDeVista", "crawler": "sitios_yucatan/miPuntoDeVista"},
{"nombre": "notirivas", "crawler": "sitios_yucatan/notirivas"},
{"nombre": "notisureste", "crawler": "sitios_yucatan/notisureste"},
{"nombre": "puntoMedio", "crawler": "sitios_yucatan/puntoMedio"},
{"nombre": "sona893", "crawler": "sitios_yucatan/sona893"},
{"nombre": "yucatanALaMano", "crawler": "sitios_yucatan/yucatanALaMano"},
{"nombre": "yucatanAlMinuto", "crawler": "sitios_yucatan/yucatanAlMinuto"},
{"nombre": "yucatanEnCorto", "crawler": "sitios_yucatan/yucatanEnCorto"},
{"nombre": "diarioYaqui", "crawler": "otros_sitios/diarioYaqui"},
{"nombre": "laJornada", "crawler": "otros_sitios/laJornada"},
{"nombre": "laJornadaAgs", "crawler": "otros_sitios/laJornadaAgs"},
{"nombre": "laJornadaBC", "crawler": "otros_sitios/laJornadaBC"},
{"nombre": "laJornadaGro", "crawler": "otros_sitios/laJornadaGro"},
{"nombre": "laJornadaOte", "crawler": "otros_sitios/laJornadaOte"},
{"nombre": "laJornadaSanLuis", "crawler": "otros_sitios/laJornadaSanLuis"},
{"nombre": "laJornadaVer", "crawler": "otros_sitios/laJornadaVer"},
{"nombre": "laJornadaZac", "crawler": "otros_sitios/laJornadaZac"}
]
\ No newline at end of file
This diff is collapsed.
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment