Commit 0800309b authored by Renán Sosa Guillen

Script changes.

parent b85d6144
#!/bin/bash
## ------------------------------------------------------------------
## SCRIPT FOR THE AUTOMATIC DOWNLOAD OF NEWS WITH THE CRAWLER
## ------------------------------------------------------------------
site_section=( otros_sitios sitios_yucatan )
other_site_list=( diarioYaqui laJornada laJornadaAgs laJornadaBC laJornadaGro laJornadaMaya laJornadaOte laJornadaSanLuis laJornadaVer laJornadaZac )
yuc_site_list=( alChile desdeElBalcon diarioYucatan grilloPorteno laVerdadYuc lectorMX miPuntoDeVista notirivas notisureste puntoMedio sona893 yucatanALaMano yucatanAlMinuto yucatanEnCorto )
base_path=$HOME/crawler/
cd $base_path # activate the crawler's venv (virtual environment)
source bin/activate
PATH=$PATH:$HOME/crawler/bin # directory that contains the scrapy and python binaries
export PATH
## CALCULATION OF THE NUMBER OF DAYS OF NEWS TO DOWNLOAD -------------------------
function obtain_days() {
local last_date=$1 # parameter 1
local stop_date=$2 # parameter 2
local day_date_1=`date -d "$last_date" '+%j'` # day of the year for the date in "last_date"
local y1=`date -d "$last_date" '+%Y'`
local day_date_2=`date -d "$stop_date" '+%j'`
local y2=`date -d "$stop_date" '+%Y'`
if [ $y1 -eq $y2 ] # if $y1 equals $y2
then
local num_days=$(expr $day_date_2 - $day_date_1)
elif [ $y1 -lt $y2 ]
then
local days_date_1=0
for year in `seq $y1 $y2`
do
if [ $year -eq $y1 ]
then
local days_date=$(expr `date -d "$y1-12-31" '+%j'` - $day_date_1)
elif [ $year -eq $y2 ]
then
days_date=$day_date_2
else
days_date=`date -d "$year-12-31" '+%j'`
fi
days_date_1=$(expr $days_date_1 + $days_date)
done
local num_days=$(expr $days_date_1)
fi
return $num_days
}
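## Usage sketch (hypothetical dates): "obtain_days 2017-12-30 2018-01-02" yields 3.
## Note: the result travels through the function's exit status, which bash caps
## at 255, so gaps longer than 255 days would overflow silently.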
## ----------------------------------------------------------------------------
## NEWS DOWNLOAD SEQUENCE --------------------------------------------------------------
for section in ${site_section[@]}
do
if [ $section = otros_sitios ]
then
list=${other_site_list[@]}
else
list=${yuc_site_list[@]}
fi
for site in $list
do
## FOR EACH SITE, FIND THE FILE WITH THE LAST DATE ON WHICH NEWS WAS DOWNLOADED ----
cd crawledJsonFiles/$section/$site
max=`ls | tail -1` # gets the latest directory
cd $max
json_file=`ls | tail -1` # gets the latest file inside the directory
## ----------------------------------------------------------------------------------------
cd ~/crawler
last_date=`date -d "${json_file%%.*}" '+%Y-%m-%d'`
stop_date=`date -d "now" '+%Y-%m-%d'` # downloads up to one day before this date
## NOTE: For the download to stop one day earlier, the stop date must be set one day later. That is why 'stop_date' is set to 'now'.
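## Example (hypothetical files): if the newest file is 2017-03-20.json and today
## is 2017-03-23, then last_date becomes 2017-03-21 and the loop below downloads
## news for 2017-03-21 and 2017-03-22.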
if [ $last_date != $stop_date ]
then
last_date=`date -d "$last_date +1 days" '+%Y-%m-%d'`
## THE 'obtain_days' FUNCTION CALCULATES THE NUMBER OF DAYS BETWEEN THE LAST DOWNLOAD DATE AND THE STOP DATE
obtain_days $last_date $stop_date # parameters passed to the function
num_days=$? # value returned by the 'obtain_days' function
for i in `seq $num_days -1 1`
do
y=`date -d "$stop_date - $i days" '+%Y'`
m=`date -d "$stop_date - $i days" '+%m'`
d=`date -d "$stop_date - $i days" '+%d'`
cd crawlersNoticias/$section/$site/ # path where each site's crawlers are located
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the crawler corresponding to the site
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # checks whether the file contains data; deletes it if not
if [ -e $y-$m-$d.json ] # checks whether the JSON file with the news was generated
then
destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated JSON files will be stored
if [ ! -d $destination ] # creates the destination path if it does not exist
then
mkdir -p $destination
fi
mv -f $y-$m-$d.json $destination # moves the JSON file to the destination path
fi
cd ~/crawler
done
fi
done
done
deactivate
## ------------------------------------------------------------------------------------------------
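For reference only (not part of this commit), the date window that 'obtain_days' and the seq loop derive can be sketched in a few lines of Python; the dates below are hypothetical:
from datetime import date, timedelta

last_date = date(2017, 3, 20)  # hypothetical: parsed from the newest file, e.g. "2017-03-20.json"
stop_date = date(2017, 3, 23)  # hypothetical: today's date; the crawl stops one day before it
current = last_date + timedelta(days=1)  # start the day after the last download, like "+1 days"
while current < stop_date:  # mirrors the seq-driven loop in the script
    print(current.strftime('%Y-%m-%d'))  # prints 2017-03-21 and 2017-03-22
    current += timedelta(days=1)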
import scrapy
from datetime import datetime, date, timedelta
"""
USAGE:
......@@ -30,58 +31,108 @@ class QuotesSpider(scrapy.Spider):
self.page = 0
self.count = 0
self.found = False
self.not_found = False
self.next_section = False
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.date = date(int(self.year), int(self.month), int(self.day))
self.data_list = []
self.baseURL = 'http://jornadabc.mx'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo'
'cultura', 'espectaculos', 'deportes']
# section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
for section in section_list:
self.section = section
print self.section
self.page = 0
self.count = 0
self.found = False
self.not_found = False
self.next_section = False
self.data_list = []
page = -1
while True:
if ( self.found ):
self.found = False
break
if ( self.not_found ):
self.not_found = False
self.next_section = True
break
page += 1
print 'page '+str(page)
yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(page), callback=self.parse, dont_filter=True)
self.page -= self.count
if ( self.page > 0 ):
self.page -= 1
for pag in range(self.page, self.page+25):
yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(pag), callback=self.parse_page, dont_filter=True)
# if not ( self.next_section ):
# self.next_section = False
# self.page -= self.count
# if ( self.page > 0 ):
# self.page -= 1
# for pag in range(self.page, self.page+25):
# yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(pag), callback=self.parse_page, dont_filter=True)
if ( len(self.data_list) > 0 ):
for link in self.data_list:
# link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
# if ( link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item, dont_filter=True)
def parse(self, response):
print response.url
count = 0
this_page = int(response.url[response.url.rfind('=')+1:])
if ( self.section == 'espectaculos' or self.section == 'deportes' ):
path = '//*[@class="region region-soft-first"]'
else:
path = '//*[@class="region region-hard-first"]'
for link in response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract():
link_date = link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] # extracts the date from the link
if ( link_date == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
self.page = int(response.url[response.url.rfind('=')+1:])
self.count += 1
link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
if ( len(link_list) > 0 ):
if ( this_page > 0 ):
del link_list[0]
print link_list
for link in link_list:
link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
print self.section+' '+link_date.isoformat()+' ? '+self.date.isoformat()
if ( link_date.month == self.date.month and link_date.year == self.date.year ):
# self.page = int(response.url[response.url.rfind('=')+1:])
# self.count += 1
# self.found = True
# print '************founddddd********'
# break
self.data_list.append(link)
if ( this_page >= 300 ):
self.found = True
break
if ( this_page > 300 and len(link_list) == 0 ):
self.not_found = True
# if ( link_date < stop_date ):
# # count += 1
# # print count
# # if ( count > 3 ):
# self.not_found = True
# break
def parse_page(self, response):
if ( self.section == 'espectaculos' or self.section == 'deportes' ):
path = '//*[@class="region region-soft-first"]'
else:
path = '//*[@class="region region-hard-first"]'
for link in response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract():
link_date = link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:] # extracts the date from the link
if ( link_date == self.day.zfill(2)+'-'+self.month.zfill(2)+'-'+self.year ):
link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
if ( len(link_list) > 0 ):
for link in link_list:
link_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
if ( link_date == self.date ):
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
......
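As a side note, the double rfind('/') slicing used in parse and parse_page pulls the date segment out of an article path; a minimal sketch with a hypothetical link:
from datetime import datetime

link = '/espectaculos/22-03-2017/alguna-nota'  # hypothetical article path in the dd-mm-yyyy layout the spider expects
trimmed = link[:link.rfind('/')]  # drops the article slug -> '/espectaculos/22-03-2017'
link_date = trimmed[trimmed.rfind('/') + 1:]  # keeps the last remaining segment -> '22-03-2017'
print(datetime.strptime(link_date, '%d-%m-%Y').date())  # 2017-03-22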
......@@ -28,7 +28,7 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self):
self.section = ''
self.baseURL = 'http://jornadabc.mx'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo',
'cultura', 'espectaculos', 'deportes']
for section in section_list:
self.section = section
......
import scrapy
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
......
......@@ -7,24 +7,27 @@ do
mkdir -p $y;
fi
cd $y
for m in $(seq -f "%02g" 1 3)
for m in $(seq -f "%02g" 1 4)
do
if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
for d in $(seq -f "%02g" 1 31)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # checks whether the file contains data; deletes it if not
done
fi
if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
for d in $(seq -f "%02g" 1 17)
for d in $(seq -f "%02g" 1 30)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
if [ $m -eq 2 ]; then
for d in $(seq -f "%02g" 1 28)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
done
......
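The hunk above hard-codes 31, 30, and 28 days per month; as a sketch only, the month length (including leap-year Februaries) could instead come from Python's standard library:
import calendar

for m in range(1, 5):  # months 01..04, matching the seq range above
    days = calendar.monthrange(2017, m)[1]  # number of days in the month: 31, 28, 31, 30
    print('%02d -> %d days' % (m, days))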
......@@ -13,18 +13,21 @@ do
for d in $(seq -f "%02g" 1 31)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # checks whether the file contains data; deletes it if not
done
fi
if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
for d in $(seq -f "%02g" 1 20)
for d in $(seq -f "%02g" 1 30)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
if [ $m -eq 2 ]; then
for d in $(seq -f "%02g" 1 28)
do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json
done
fi
done
......