Commit 0800309b authored by Renán Sosa Guillen

Script changes.

parent b85d6144
#!/bin/bash
## ------------------------------------------------------------------
## SCRIPT FOR THE AUTOMATIC DOWNLOAD OF NEWS WITH THE CRAWLER
## ------------------------------------------------------------------
site_section=( otros_sitios sitios_yucatan )
other_site_list=( diarioYaqui laJornada laJornadaAgs laJornadaBC laJornadaGro laJornadaMaya laJornadaOte laJornadaSanLuis laJornadaVer laJornadaZac )
yuc_site_list=( alChile desdeElBalcon diarioYucatan grilloPorteno laVerdadYuc lectorMX miPuntoDeVista notirivas notisureste puntoMedio sona893 yucatanALaMano yucatanAlMinuto yucatanEnCorto )
base_path=$HOME/crawler/
cd $base_path # activate the crawler's venv (virtual environment)
source bin/activate
PATH=$PATH:$HOME/crawler/bin # directory that contains the scrapy and python binaries
export PATH
## CALCULATION OF THE NUMBER OF DAYS OF NEWS TO DOWNLOAD -------------------------
function obtain_days() {
local last_date=$1 # parameter 1
local stop_date=$2 # parameter 2
local day_date_1=`date -d "$last_date" '+%j'` # day of the year for the date in "last_date"
local y1=`date -d "$last_date" '+%Y'`
local day_date_2=`date -d "$stop_date" '+%j'`
local y2=`date -d "$stop_date" '+%Y'`
if [ $y1 -eq $y2 ] # if $y1 equals $y2
then
local num_days=$(expr $day_date_2 - $day_date_1)
elif [ $y1 -lt $y2 ]
then
local days_date_1=0
for year in `seq $y1 $y2`
do
if [ $year -eq $y1 ]
then
local days_date=$(expr `date -d "$y1-12-31" '+%j'` - $day_date_1)
elif [ $year -eq $y2 ]
then
days_date=$day_date_2
else
days_date=`date -d "$year-12-31" '+%j'`
fi
days_date_1=$(expr $days_date_1 + $days_date)
done
local num_days=$(expr $days_date_1)
fi
return $num_days
}
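## Usage sketch (hypothetical dates): "obtain_days 2017-12-30 2018-01-02" yields 3.
## Note: the result travels through the function's exit status, which bash caps
## at 255, so gaps longer than 255 days would overflow silently.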
## ----------------------------------------------------------------------------
## NEWS DOWNLOAD SEQUENCE --------------------------------------------------------------
for section in ${site_section[@]}
do
if [ $section = otros_sitios ]
then
list=${other_site_list[@]}
else
list=${yuc_site_list[@]}
fi
for site in $list
do
## FOR EACH SITE, FIND THE FILE WITH THE LAST DATE ON WHICH NEWS WAS DOWNLOADED ----
cd crawledJsonFiles/$section/$site
max=`ls | tail -1` # gets the latest directory
cd $max
json_file=`ls | tail -1` # gets the latest file inside the directory
## ----------------------------------------------------------------------------------------
cd ~/crawler
last_date=`date -d "${json_file%%.*}" '+%Y-%m-%d'`
stop_date=`date -d "now" '+%Y-%m-%d'` # downloads up to one day before this date
## NOTE: For the download to stop one day earlier, the stop date must be set one day later. That is why 'stop_date' is set to 'now'.
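## Example (hypothetical files): if the newest file is 2017-03-20.json and today
## is 2017-03-23, then last_date becomes 2017-03-21 and the loop below downloads
## news for 2017-03-21 and 2017-03-22.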
if [ $last_date != $stop_date ]
then
last_date=`date -d "$last_date +1 days" '+%Y-%m-%d'`
## THE 'obtain_days' FUNCTION CALCULATES THE NUMBER OF DAYS BETWEEN THE LAST DOWNLOAD DATE AND THE STOP DATE
obtain_days $last_date $stop_date # parameters passed to the function
num_days=$? # value returned by the 'obtain_days' function
for i in `seq $num_days -1 1`
do
y=`date -d "$stop_date - $i days" '+%Y'`
m=`date -d "$stop_date - $i days" '+%m'`
d=`date -d "$stop_date - $i days" '+%d'`
cd crawlersNoticias/$section/$site/ # path where each site's crawlers are located
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the crawler corresponding to the site
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # checks whether the file contains data; deletes it if not
if [ -e $y-$m-$d.json ] # checks whether the JSON file with the news was generated
then
destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated JSON files will be stored
if [ ! -d $destination ] # creates the destination path if it does not exist
then
mkdir -p $destination
fi
mv -f $y-$m-$d.json $destination # moves the JSON file to the destination path
fi
cd ~/crawler
done
fi
done
done
deactivate
## ------------------------------------------------------------------------------------------------
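For reference only (not part of this commit), the date window that 'obtain_days' and the seq loop derive can be sketched in a few lines of Python; the dates below are hypothetical:
from datetime import date, timedelta

last_date = date(2017, 3, 20)  # hypothetical: parsed from the newest file, e.g. "2017-03-20.json"
stop_date = date(2017, 3, 23)  # hypothetical: today's date; the crawl stops one day before it
current = last_date + timedelta(days=1)  # start the day after the last download, like "+1 days"
while current < stop_date:  # mirrors the seq-driven loop in the script
    print(current.strftime('%Y-%m-%d'))  # prints 2017-03-21 and 2017-03-22
    current += timedelta(days=1)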
import scrapy
from datetime import datetime, date, timedelta
"""
USAGE:
......@@ -30,58 +31,108 @@ class QuotesSpider(scrapy.Spider):
self.page = 0
self.count = 0
self.found = False
self.not_found = False
self.next_section = False
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.date = date(int(self.year), int(self.month), int(self.day))
self.data_list = []
self.baseURL = 'http://jornadabc.mx'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo'
'cultura', 'espectaculos', 'deportes']
# section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
for section in section_list:
self.section = section
print self.section
self.page = 0
self.count = 0
self.found = False
self.not_found = False
self.next_section = False
self.data_list = []
page = -1
while True:
if ( self.found ):
self.found = False
break
if ( self.not_found ):
self.not_found = False
self.next_section = True
break
page += 1
print 'page '+str(page)
yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(page), callback=self.parse, dont_filter=True)
self.page -= self.count
if ( self.page > 0 ):
self.page -= 1
for pag in range(self.page, self.page+25):
yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(pag), callback=self.parse_page, dont_filter=True)
# if not ( self.next_section ):
# self.next_section = False
# self.page -= self.count
# if ( self.page > 0 ):
# self.page -= 1
# for pag in range(self.page, self.page+25):
# yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(pag), callback=self.parse_page, dont_filter=True)
if ( len(self.data_list) > 0 ):
for link in self.data_list:
# link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
# if ( link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item, dont_filter=True)
def parse(self, response):
print response.url
count = 0
this_page = int(response.url[response.url.rfind('=')+1:])
if ( self.section == 'espectaculos' or self.section == 'deportes' ):
path = '//*[@class="region region-soft-first"]'
else:
path = '//*[@class="region region-hard-first"]'
for link in response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract():
link_date = link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] # extracts the date from the link
if ( link_date == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
self.page = int(response.url[response.url.rfind('=')+1:])
self.count += 1
link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
if ( len(link_list) > 0 ):
if ( this_page > 0 ):
del link_list[0]
print link_list
for link in link_list:
link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
print self.section+' '+link_date.isoformat()+' ? '+self.date.isoformat()
if ( link_date.month == self.date.month and link_date.year == self.date.year ):
# self.page = int(response.url[response.url.rfind('=')+1:])
# self.count += 1
# self.found = True
# print '************founddddd********'
# break
self.data_list.append(link)
if ( this_page >= 300 ):
self.found = True
break
if ( this_page > 300 and len(link_list) == 0 ):
self.not_found = True
# if ( link_date < stop_date ):
# # count += 1
# # print count
# # if ( count > 3 ):
# self.not_found = True
# break
def parse_page(self, response):
if ( self.section == 'espectaculos' or self.section == 'deportes' ):
path = '//*[@class="region region-soft-first"]'
else:
path = '//*[@class="region region-hard-first"]'
for link in response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract():
link_date = link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] # extracts the date from the link
if ( link_date == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
if ( len(link_list) > 0 ):
for link in link_list:
link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
if ( link_date == self.date ):
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
......
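As a side note, the double rfind('/') slicing used in parse and parse_page pulls the date segment out of an article path; a minimal sketch with a hypothetical link:
from datetime import datetime

link = '/espectaculos/22-03-2017/alguna-nota'  # hypothetical article path in the dd-mm-yyyy layout the spider expects
trimmed = link[:link.rfind('/')]  # drops the article slug -> '/espectaculos/22-03-2017'
link_date = trimmed[trimmed.rfind('/') + 1:]  # keeps the last remaining segment -> '22-03-2017'
print(datetime.strptime(link_date, '%d-%m-%Y').date())  # 2017-03-22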
......@@ -28,7 +28,7 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self):
self.section = ''
self.baseURL = 'http://jornadabc.mx'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo',
'cultura', 'espectaculos', 'deportes']
for section in section_list:
self.section = section
......
import scrapy
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
......
......@@ -7,24 +7,27 @@ do
mkdir -p $y;
fi
cd $y
for m in $(seq -f "%02g" 1 3)
for m in $(seq -f "%02g" 1 4)
do
if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
for d in $(seq -f "%02g" 1 31)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # checks whether the file contains data; deletes it if not
done
fi
if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
for d in $(seq -f "%02g" 1 17)
for d in $(seq -f "%02g" 1 30)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
if [ $m -eq 2 ]; then
for d in $(seq -f "%02g" 1 28)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
done
......
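The hunk above hard-codes 31, 30, and 28 days per month; as a sketch only, the month length (including leap-year Februaries) could instead come from Python's standard library:
import calendar

for m in range(1, 5):  # months 01..04, matching the seq range above
    days = calendar.monthrange(2017, m)[1]  # number of days in the month: 31, 28, 31, 30
    print('%02d -> %d days' % (m, days))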
......@@ -13,18 +13,21 @@ do
for d in $(seq -f "%02g" 1 31)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # checks whether the file contains data; deletes it if not
done
fi
if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
for d in $(seq -f "%02g" 1 20)
for d in $(seq -f "%02g" 1 30)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
if [ $m -eq 2 ]; then
for d in $(seq -f "%02g" 1 28)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
done
......