Commit c6e78aaa authored by Mario Chirinos's avatar Mario Chirinos

chiapas hoy

parent ee0af704
# -*- coding: utf-8 -*-
import scrapy, re
from chiapasHoy.items import NoticiasItem
import datetime
"""
MEDIO:
Chiapas Hoy, Chiapas
......@@ -22,61 +22,43 @@ class QuotesSpider(scrapy.Spider):
# Spider identifier — NOTE(review): presumably invoked as `scrapy crawl noticias`.
name = "noticias"
def start_requests(self):
    """Build the day-archive URL for the requested date and issue the first request.

    Expects ``year``, ``month`` and ``day`` to be supplied as spider
    arguments (e.g. ``scrapy crawl noticias -a year=2020 -a month=1 -a day=5``).
    The parsed ``datetime.date`` is forwarded to the callbacks via ``cb_kwargs``.
    """
    year = getattr(self, "year", None)
    month = getattr(self, "month", None)
    day = getattr(self, "day", None)
    # Validates the arguments early: raises if missing or not a real calendar date.
    date = datetime.date(int(year), int(month), int(day))
    self.baseURL = ("http://www.chiapashoy.com.mx/notashoy/"
                    + year + "/" + month.zfill(2) + "/" + day.zfill(2))
    # Single request only — the diff residue previously duplicated this yield,
    # which would have crawled the same archive page twice.
    yield scrapy.Request(url=self.baseURL, callback=self.parse, cb_kwargs={"date": date})
def parse(self, response, **kwargs):
    """Follow every article link on the archive listing page, then paginate.

    ``kwargs`` carries the crawl date injected by ``start_requests`` and is
    propagated unchanged to ``parse_item`` and to the next listing page.
    """
    # Article links live under <article> ... <h3> <a href=...>.
    links = response.css('article').css('h3').css('a::attr(href)').extract()
    for link in links:
        yield scrapy.Request(url=link, callback=self.parse_item, cb_kwargs=kwargs)
    # Follow the "next page" link of the paginated archive, when present.
    nextPage = response.xpath('//*[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
    if nextPage:
        yield scrapy.Request(url=nextPage, callback=self.parse, cb_kwargs=kwargs)
def parse_item(self, response, **kwargs):
    """Extract a single news article into a NoticiasItem.

    ``kwargs["date"]`` is the requested crawl date (a ``datetime.date``)
    injected by ``start_requests`` via ``cb_kwargs``.
    """
    item = NoticiasItem()
    # Use the requested crawl date rather than the page's own timestamp.
    item['date'] = kwargs["date"].strftime('%Y/%m/%d')
    item['title'] = response.css("h1.entry-title::text").extract_first()
    # Guard: the category element may be absent; the previous code called
    # .replace() directly on extract_first(), raising AttributeError on None.
    topic = response.css('li.meta-category').css('a::text').extract_first()
    if topic is not None:
        item['topic'] = topic.replace(" ", "").replace("\n", "")
    paragraphs = response.css("article").css("div.entry-content").css("p").extract()
    if paragraphs:
        # NOTE(review): the last paragraph is assumed to be the author byline
        # — confirm against the site's article layout.
        item['author'] = remove_tags(paragraphs[-1])
    # Body text: all paragraphs, tags stripped, one per line.
    text = ""
    for p in paragraphs:
        text += remove_tags(p) + "\n"
    item['text'] = text.strip()
    item['url'] = response.url
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment