crawlers

2d809dad · Renán Sosa Guillen · 11c4aa01 · 2d809dad · 2d809dad
Commit 2d809dad authored Aug 29, 2017 by Renán Sosa Guillen
Show whitespace changes
Inline Side-by-side

Showing with 15 additions and 10 deletions

noticias.py ...rga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.py +15 -10

noticias.pyc ...ga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.pyc +0 -0

No files found.
--- a/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.py
+++ b/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.py
@@ -8,6 +8,7 @@ TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
 	"""Return text with every TAG_RE match (a '<...>' markup tag) stripped out."""
 	stripped = TAG_RE.sub('', text)
 	return stripped
 class NoticiasItem(scrapy.Item):
 	title = scrapy.Field()
 	text = scrapy.Field()
@@ -17,6 +18,7 @@ class NoticiasItem(scrapy.Item):
 	topic = scrapy.Field()
 	url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
@@ -30,6 +32,7 @@ class QuotesSpider(scrapy.Spider):
 		for url in urls:
 			yield scrapy.Request(url=url, callback=self.parse)
 	def parse(self, response):
 		pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
 		if ( len(pagination) > 0 ):
@@ -43,14 +46,16 @@ class QuotesSpider(scrapy.Spider):
 		else:
 			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
 	def parse_page(self, response):
 		"""Yield one request per article link on a listing page, handled by parse_item."""
 		# Article links are the anchors inside each <h2 class="cat-list-title"> heading.
 		article_links = response.xpath('//h2[@class="cat-list-title"]/a/@href').extract()
 		for article_url in article_links:
 			yield scrapy.Request(url=article_url, callback=self.parse_item)
 	def parse_item(self, response):
 		item = NoticiasItem()
 		text = ''
-		item['date'] = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
+		item['date'] = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
 		item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
 		item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()

--- a/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.pyc
+++ b/descarga_por_dia/laJornadaZac/laJornadaZac/spiders/noticias.pyc