crawlers

392e0d4e · Renán Sosa Guillen · 77fa0911
Commit 392e0d4e authored Nov 25, 2017 by Renán Sosa Guillen
18 changed files
--- a/descarga_hacia_atras/diarioYucatan/diarioYucatan/spiders/noticias.py
+++ b/descarga_hacia_atras/diarioYucatan/diarioYucatan/spiders/noticias.py
@@ -104,7 +104,7 @@ class QuotesSpider(scrapy.Spider):
 		for paragraph in response.css('div.entry-content').css('p').extract():
 			text += remove_tags(paragraph) + '\n'		
 		item['text'] = text
-		item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
+		item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
 		item['url'] = response.url
 		# print item['title']
 		yield item

--- a/descarga_hacia_atras/diarioYucatan/diarioYucatan/spiders/noticias.pyc
+++ b/descarga_hacia_atras/diarioYucatan/diarioYucatan/spiders/noticias.pyc
--- a/descarga_hacia_atras/diarioYucatan2/diarioYucatan2/spiders/noticias.py
+++ b/descarga_hacia_atras/diarioYucatan2/diarioYucatan2/spiders/noticias.py
@@ -108,7 +108,7 @@ class QuotesSpider(scrapy.Spider):
 		for paragraph in response.css('div.entry-content').css('p').extract():
 			text += remove_tags(paragraph) + '\n'		
 		item['text'] = text
-		item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
+		item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
 		item['url'] = response.url
 		# print item['title']
 		yield item

--- a/descarga_hacia_atras/diarioYucatan2/diarioYucatan2/spiders/noticias.pyc
+++ b/descarga_hacia_atras/diarioYucatan2/diarioYucatan2/spiders/noticias.pyc
--- a/descarga_hacia_atras/laJornadaBC2/laJornadaBC2/spiders/noticias.pyc
+++ b/descarga_hacia_atras/laJornadaBC2/laJornadaBC2/spiders/noticias.pyc
--- a/descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.py
+++ b/descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.py
@@ -73,7 +73,7 @@ class QuotesSpider(scrapy.Spider):
 		text = ''
 		item['date'] = self.date
 		item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
-		item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
+		item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
 		
 		for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
 			text += remove_tags(paragraph) + '\n'

--- a/descarga_por_dia/laJornadaOte/laJornadaOte/spiders/noticias.py
+++ b/descarga_por_dia/laJornadaOte/laJornadaOte/spiders/noticias.py
@@ -43,7 +43,6 @@ class QuotesSpider(scrapy.Spider):
 		## la fecha de la noticia ya incluye la zona horaria
 		item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
 		item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
-		item['author'] = response.xpath('//span[@itemprop="name"]/text()').extract_first()
 		
 		for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
 			text += remove_tags(paragraph) + '\n'

--- a/descarga_por_dia/laJornadaSanLuis/laJornadaSanLuis/spiders/noticias.py
+++ b/descarga_por_dia/laJornadaSanLuis/laJornadaSanLuis/spiders/noticias.py
@@ -54,9 +54,15 @@ class QuotesSpider(scrapy.Spider):
 		item = NoticiasItem()
 		text = ''
 		## la fecha de la noticia ya incluye la zona horaria
-		item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		if d is None:
+			d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
+		item['date'] = d
+
 		item['title'] = response.css('h1.entry-title::text').extract_first()
+		
 		item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
+		
 		for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
 			text += remove_tags(paragraph) + '\n'
 		item['text'] = text

--- a/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.py
+++ b/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.py
@@ -54,7 +54,7 @@ class QuotesSpider(scrapy.Spider):

 		d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
 		## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico)
-		if d[-6:] != '-05:00' and d[-6:] != '-06:00' :
+		if d[-6:] != '-06:00':
 			d = d[:-6] + '-06:00'
 		item['date'] = d


--- a/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.pyc
+++ b/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.pyc
--- a/descarga_por_dia/lectorMX/lectorMX/spiders/noticias.py
+++ b/descarga_por_dia/lectorMX/lectorMX/spiders/noticias.py
@@ -71,7 +71,6 @@ class QuotesSpider(scrapy.Spider):
 		item['date'] = self.date
 		item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
 		item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
-		item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first()
 		for paragraph in response.css('div.post-single-content').css('p').extract():
 			text += remove_tags(paragraph) + '\n'
 		item['text'] = text

--- a/descarga_por_dia/miPuntoDeVista/miPuntoDeVista/spiders/noticias.py
+++ b/descarga_por_dia/miPuntoDeVista/miPuntoDeVista/spiders/noticias.py
@@ -65,8 +65,7 @@ class QuotesSpider(scrapy.Spider):
 			d = d[:-6] + '-06:00'
 		item['date'] = d
 		
-		item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
-		item['author'] = response.css('div.td-post-author-name').css('a::text').extract_first()
+		item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
 		
 		for paragraph in response.css('div.td-post-content').css('p').extract():
 			text += remove_tags(paragraph) + '\n'

--- a/descarga_por_dia/notisureste/notisureste/spiders/noticias.py
+++ b/descarga_por_dia/notisureste/notisureste/spiders/noticias.py
@@ -65,7 +65,7 @@ class QuotesSpider(scrapy.Spider):
 		item['date'] = d
 		
 		item['url'] = response.url
-		item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()
+		item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
 		
 		for paragraph in response.css('div.td-post-content').css('p').extract():
 			text += remove_tags(paragraph) + '\n'

--- a/descarga_por_dia/notisureste/notisureste/spiders/noticias.pyc
+++ b/descarga_por_dia/notisureste/notisureste/spiders/noticias.pyc
--- a/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.py
+++ b/descarga_por_dia/puntoMedio/puntoMedio/spiders/noticias.py
@@ -49,8 +49,7 @@ class QuotesSpider(scrapy.Spider):
 			d = d[:-6] + '-06:00'
 		item['date'] = d

-		item['author'] = response.css('span.author').css('a::text').extract_first()
-		item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract()
+		item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract_first()
 		
 		for paragraph in response.css('div.post-entry').css('p').extract():
 			text += remove_tags(paragraph)

--- a/descarga_por_dia/sona893/sona893/spiders/noticias.py
+++ b/descarga_por_dia/sona893/sona893/spiders/noticias.py
@@ -50,7 +50,7 @@ class QuotesSpider(scrapy.Spider):
 	def parse_page(self, response):
 		for post in response.css('div.mosaicflow').css('div.post'):
 			item = NoticiasItem()
-			item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract()
+			item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract_first()
 			item['title'] = post.xpath('./h1/a/@title').extract_first()
 			request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
 			request.meta['item'] = item

--- a/descarga_por_dia/yucatanEnCorto/yucatanEnCorto/spiders/noticias.py
+++ b/descarga_por_dia/yucatanEnCorto/yucatanEnCorto/spiders/noticias.py
@@ -3,6 +3,9 @@ import scrapy, re

 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22

+"""
+Crawler descarga noticias desde el 2017.10.18
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -24,16 +27,16 @@ class QuotesSpider(scrapy.Spider):
 		year = getattr(self, 'year', None)
 		month = getattr(self, 'month', None)
 		day = getattr(self, 'day', None)
-		self.baseURL='http://florcastillo.mx/noticias/'+year+'/'+month+'/'+day
+		self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
 		
 		yield scrapy.Request(url=self.baseURL, callback=self.parse)


 	def parse(self, response):
-		pagination = response.css('div.pagination').css('a::attr(href)').extract()
+		pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
 		
 		if ( len(pagination) > 0 ):
-			pagination = pagination[-1]
+			pagination = pagination[-2].strip('/')
 			pages = int(pagination[pagination.rfind('/')+1:])
 			
 			for page in range(0, pages):
@@ -48,25 +51,32 @@ class QuotesSpider(scrapy.Spider):


 	def parse_page(self, response):
-		for link in response.css('div.list-block').xpath('./h3/a/@href').extract():
+		for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
 			yield scrapy.Request(url=link, callback=self.parse_item)

 	def parse_item(self, response):
 		item = NoticiasItem()
 		text = ''
-		item['title'] = response.css('div.post-title').css('h1.entry-title::text').extract_first()
+		title = response.xpath('//*[@class="td-post-header-holder"]/header/h1/text()').extract_first()
+		if title is None:
+			title = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
+		item['title'] = title
 		
-		d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
 		## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
 		if d[-6:] != '-06:00':
 			d = d[:-6] + '-06:00'
 		item['date'] = d
 		
-		item['topic'] = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()[:-2]
+		item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a/text()').extract()[1]
 		
-		for paragraph in response.css('div.post_content').css('p').extract():
-			text += remove_tags(paragraph) + '\n'
+		paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
+		if len(paragraphs) <= 2:
+			paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
+		for p in paragraphs:
+			text += remove_tags(p) + '\n'
 		item['text'] = text
+		
 		item['url'] = response.url
 		
 		# print item['title']

--- a/descarga_por_dia/yucatanEnCorto/yucatanEnCorto/spiders/noticias.pyc
+++ b/descarga_por_dia/yucatanEnCorto/yucatanEnCorto/spiders/noticias.pyc