Commit 0800309b authored by Renán Sosa Guillen

Script changes.

parent b85d6144
#!/bin/bash
## ------------------------------------------------------------------
## SCRIPT FOR THE AUTOMATIC DOWNLOAD OF NEWS WITH THE CRAWLER
## ------------------------------------------------------------------
site_section=( otros_sitios sitios_yucatan )
other_site_list=( diarioYaqui laJornada laJornadaAgs laJornadaBC laJornadaGro laJornadaMaya laJornadaOte laJornadaSanLuis laJornadaVer laJornadaZac )
yuc_site_list=( alChile desdeElBalcon diarioYucatan grilloPorteno laVerdadYuc lectorMX miPuntoDeVista notirivas notisureste puntoMedio sona893 yucatanALaMano yucatanAlMinuto yucatanEnCorto )
base_path=$HOME/crawler/
cd $base_path
source bin/activate # activate the crawler's venv (virtual environment)
PATH=$PATH:$HOME/crawler/bin # directory that holds the venv's python and scrapy binaries (PATH entries must be directories, not the binaries themselves)
export PATH
## COMPUTE THE NUMBER OF DAYS OF NEWS TO DOWNLOAD -----------------------------
function obtain_days() {
    local last_date=$1 # parameter 1
    local stop_date=$2 # parameter 2
    local day_date_1=`date -d "$last_date" '+%j'` # day of the year for "last_date"
    local y1=`date -d "$last_date" '+%Y'`
    local day_date_2=`date -d "$stop_date" '+%j'`
    local y2=`date -d "$stop_date" '+%Y'`
    if [ $y1 -eq $y2 ] # if $y1 equals $y2
    then
        local num_days=$(expr $day_date_2 - $day_date_1)
    elif [ $y1 -lt $y2 ] # the range spans more than one year
    then
        local days_date_1=0
        for year in `seq $y1 $y2`
        do
            if [ $year -eq $y1 ]
            then
                local days_date=$(expr `date -d "$y1-12-31" '+%j'` - $day_date_1)
            elif [ $year -eq $y2 ]
            then
                days_date=$day_date_2
            else
                days_date=`date -d "$year-12-31" '+%j'`
            fi
            days_date_1=$(expr $days_date_1 + $days_date)
        done
        local num_days=$(expr $days_date_1)
    fi
    return $num_days
}
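## NOTE (editorial sketch, not part of the original script): bash 'return' only
## carries exit statuses 0-255, so 'return $num_days' silently wraps once the
## gap exceeds 255 days. An equivalent computation with epoch seconds, echoed
## instead of returned, avoids both the wrap-around and the per-year loop.
## This hypothetical helper is not called anywhere in this script:
function obtain_days_epoch() {
    local last_date=$1
    local stop_date=$2
    local secs_1=`date -d "$last_date" '+%s'`
    local secs_2=`date -d "$stop_date" '+%s'`
    echo $(( (secs_2 - secs_1) / 86400 )) # whole days between the two dates
}
## usage: num_days=$(obtain_days_epoch "$last_date" "$stop_date")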
## ----------------------------------------------------------------------------
## NEWS DOWNLOAD SEQUENCE ------------------------------------------------------------------
for section in ${site_section[@]}
do
    if [ $section = otros_sitios ]
    then
        list=${other_site_list[@]}
    else
        list=${yuc_site_list[@]}
    fi
    for site in $list
    do
        ## FOR EACH SITE, FIND THE FILE WITH THE LAST DATE ON WHICH NEWS WAS DOWNLOADED ----
        cd crawledJsonFiles/$section/$site
        max=`ls | tail -1` # get the last directory
        cd $max
        json_file=`ls | tail -1` # get the last file inside the directory
        ## ----------------------------------------------------------------------------------------
        cd ~/crawler
        last_date=`date -d "${json_file%%.*}" '+%Y-%m-%d'`
        stop_date=`date -d "now" '+%Y-%m-%d'` # news is downloaded up to one day before this date
        ## NOTE: To download up to one day before, the stop date must be set one day later. That is why 'stop_date' is set to 'now'.
        if [ $last_date != $stop_date ]
        then
            last_date=`date -d "$last_date +1 days" '+%Y-%m-%d'`
            ## THE 'obtain_days' FUNCTION COMPUTES THE NUMBER OF DAYS BETWEEN THE LAST DOWNLOAD DATE AND THE STOP DATE
            obtain_days $last_date $stop_date # parameters passed to the function
            num_days=$? # value returned by the 'obtain_days' function
            for i in `seq $num_days -1 1`
            do
                y=`date -d "$stop_date - $i days" '+%Y'`
                m=`date -d "$stop_date - $i days" '+%m'`
                d=`date -d "$stop_date - $i days" '+%d'`
                cd cawlersNoticias/$section/$site/ # path where each site's crawler is located
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler that corresponds to the site
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # if the file contains no data, delete it
                if [ -e $y-$m-$d.json ] # check whether the json file with the news was generated
                then
                    destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated json files are stored
                    if [ ! -d $destination ] # create the destination path if it does not exist
                    then
                        mkdir -p $destination
                    fi
                    mv -f $y-$m-$d.json $destination # move the json file to its destination
                fi
                cd ~/crawler
            done
        fi
    done
done
deactivate
## ------------------------------------------------------------------------------------------------
\ No newline at end of file
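For the download to be automatic, something still has to run this script on a schedule, which the commit itself does not show. A hypothetical crontab entry (editorial assumption; the script name deploy.sh is invented for illustration) could look like:

# Editorial sketch: run the news crawler script every day at 06:00, logging output.
0 6 * * * /bin/bash $HOME/crawler/deploy.sh >> $HOME/crawler/cron.log 2>&1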
 import scrapy
+from datetime import datetime, date, timedelta
 """
 USAGE:
@@ -30,47 +31,95 @@ class QuotesSpider(scrapy.Spider):
         self.page = 0
         self.count = 0
         self.found = False
+        self.not_found = False
+        self.next_section = False
         self.year = getattr(self, 'year', None)
         self.month = getattr(self, 'month', None)
         self.day = getattr(self, 'day', None)
+        self.date = date(int(self.year), int(self.month), int(self.day))
+        self.data_list = []
         self.baseURL = 'http://jornadabc.mx'
-        section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo'
-                        'cultura', 'espectaculos', 'deportes']
+        # section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
+        section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
         for section in section_list:
             self.section = section
+            print self.section
             self.page = 0
             self.count = 0
+            self.found = False
+            self.not_found = False
+            self.next_section = False
+            self.data_list = []
             page = -1
             while True:
                 if ( self.found ):
                     self.found = False
                     break
+                if ( self.not_found ):
+                    self.not_found = False
+                    self.next_section = True
+                    break
                 page += 1
+                print 'page '+str(page)
                 yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(page), callback=self.parse, dont_filter=True)
-            self.page -= self.count
-            if ( self.page > 0 ):
-                self.page -= 1
-            for pag in range(self.page, self.page+25):
-                yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(pag), callback=self.parse_page, dont_filter=True)
+            # if not ( self.next_section ):
+            #     self.next_section = False
+            #     self.page -= self.count
+            #     if ( self.page > 0 ):
+            #         self.page -= 1
+            #     for pag in range(self.page, self.page+25):
+            #         yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(pag), callback=self.parse_page, dont_filter=True)
+            if ( len(self.data_list) > 0 ):
+                for link in self.data_list:
+                    # link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
+                    # if ( link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
+                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item, dont_filter=True)

     def parse(self, response):
+        print response.url
+        count = 0
+        this_page = int(response.url[response.url.rfind('=')+1:])
         if ( self.section == 'espectaculos' or self.section == 'deportes' ):
             path = '//*[@class="region region-soft-first"]'
         else:
             path = '//*[@class="region region-hard-first"]'
-        for link in response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract():
-            link_date = link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] # filter the date out of the link
-            if ( link_date == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
-                self.page = int(response.url[response.url.rfind('=')+1:])
-                self.count += 1
-                self.found = True
-                break
+        link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
+        if ( len(link_list) > 0 ):
+            if ( this_page > 0 ):
+                del link_list[0]
+            print link_list
+            for link in link_list:
+                link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
+                print self.section+' '+link_date.isoformat()+' ? '+self.date.isoformat()
+                if ( link_date.month == self.date.month and link_date.year == self.date.year ):
+                    # self.page = int(response.url[response.url.rfind('=')+1:])
+                    # self.count += 1
+                    # self.found = True
+                    # print '************founddddd********'
+                    # break
+                    self.data_list.append(link)
+                if ( this_page >= 300 ):
+                    self.found = True
+                    break
+        if ( this_page > 300 and len(link_list) == 0 ):
+            self.not_found = True
+        # if ( link_date < stop_date ):
+        #     # count += 1
+        #     # print count
+        #     # if ( count > 3 ):
+        #     self.not_found = True
+        #     break

     def parse_page(self, response):
@@ -79,10 +128,12 @@ class QuotesSpider(scrapy.Spider):
         else:
             path = '//*[@class="region region-hard-first"]'
-        for link in response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract():
-            link_date = link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] # filter the date out of the link
-            if ( link_date == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
-                yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
+        link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
+        if ( len(link_list) > 0 ):
+            for link in link_list:
+                link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
+                if ( link_date == self.date ):
+                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)

     def parse_item(self, response):
...
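The date filtering above leans on the nested slicing expression link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] to pull the dd-mm-YYYY segment out of each article URL. As an editorial aside (not part of this commit), an equivalent and more readable helper, assuming links of the form /seccion/<section>/<dd-mm-YYYY>/<slug>, could be:

from datetime import datetime

def extract_link_date(link):
    # The date sits in the second-to-last path segment,
    # e.g. '/seccion/baja-california/22-03-2017/alguna-nota'.
    parts = link.rstrip('/').split('/')
    return datetime.strptime(parts[-2], '%d-%m-%Y').date()

# extract_link_date('/seccion/baja-california/22-03-2017/alguna-nota')
# -> datetime.date(2017, 3, 22)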
@@ -28,7 +28,7 @@ class QuotesSpider(scrapy.Spider):
     def start_requests(self):
         self.section = ''
         self.baseURL = 'http://jornadabc.mx'
-        section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo'
+        section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo',
                         'cultura', 'espectaculos', 'deportes']
         for section in section_list:
             self.section = section
...
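The added comma in this hunk is a real bug fix: without it, Python's implicit string-literal concatenation silently fuses the two adjacent strings, so the spider would request the nonexistent section 'mundocultura' and skip both real ones. A quick editorial illustration (not from the commit):

# Adjacent string literals are concatenated at compile time.
broken = ['mundo'
          'cultura']
fixed = ['mundo',
         'cultura']
print(broken)                    # ['mundocultura']
print(len(broken), len(fixed))   # 1 2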
 import scrapy
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
 import re
...
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
@@ -7,24 +7,27 @@ do
         mkdir -p $y;
     fi
     cd $y
-    for m in $(seq -f "%02g" 1 3)
+    for m in $(seq -f "%02g" 1 4)
     do
         if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
             for d in $(seq -f "%02g" 1 31)
             do
                 scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
+                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # if the file contains no data, delete it
             done
         fi
         if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
-            for d in $(seq -f "%02g" 1 17)
+            for d in $(seq -f "%02g" 1 30)
             do
                 scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
+                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
             done
         fi
         if [ $m -eq 2 ]; then
             for d in $(seq -f "%02g" 1 28)
             do
                 scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
+                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
             done
         fi
     done
...
@@ -13,18 +13,21 @@ do
             for d in $(seq -f "%02g" 1 31)
             do
                 scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
+                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # if the file contains no data, delete it
             done
         fi
         if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
-            for d in $(seq -f "%02g" 1 20)
+            for d in $(seq -f "%02g" 1 30)
             do
                 scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
+                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
             done
         fi
         if [ $m -eq 2 ]; then
             for d in $(seq -f "%02g" 1 28)
             do
                 scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
+                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
             done
         fi
     done
...
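Both crawl-by-date scripts above hardcode the number of days per month (and would skip February 29th in leap years). As a hedged editorial sketch, assuming GNU date(1) and a hypothetical start_date/end_date range, a single day-by-day loop could replace the three month-length branches:

# Editorial sketch, not part of the commit: iterate calendar days with GNU date,
# so month lengths and leap years come out right automatically.
start_date="2017-01-01" # hypothetical range
end_date="2017-04-30"
current="$start_date"
while [ $(date -d "$current" '+%s') -le $(date -d "$end_date" '+%s') ]
do
    y=$(date -d "$current" '+%Y'); m=$(date -d "$current" '+%m'); d=$(date -d "$current" '+%d')
    scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
    [ -s $y-$m-$d.json ] || rm -f $y-$m-$d.json # drop empty output files
    current=$(date -d "$current +1 day" '+%Y-%m-%d')
done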