chiapas hoy

c6e78aaa · Mario Chirinos · ee0af704 · c6e78aaa
Commit c6e78aaa authored Sep 20, 2019 by Mario Chirinos
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 33 additions and 51 deletions

noticias.py descarga_por_dia/chiapasHoy/chiapasHoy/spiders/noticias.py +33 -51

No files found.
--- a/descarga_por_dia/chiapasHoy/chiapasHoy/spiders/noticias.py
+++ b/descarga_por_dia/chiapasHoy/chiapasHoy/spiders/noticias.py
 # -*- coding: utf-8 -*-
 import scrapy, re
 from chiapasHoy.items import NoticiasItem
+import datetime
 """
 MEDIO:
 Chiapas Hoy, Chiapas
@@ -19,64 +19,46 @@ DAT_RE = re.compile(r'[,;]?(\sa?\s?\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}\s?)?\.\s?-\
 class QuotesSpider(scrapy.Spider):
-    name = "noticias"
+	name = "noticias"
-    def start_requests(self):
-        year = getattr(self, "year", None)
-        month = getattr(self, "month", None)
-        day = getattr(self, "day", None)
-        self.baseURL = "http://www.chiapashoy.com.mx/notashoy/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-    def parse(self, response):
+	def start_requests(self):
-        for link in response.xpath('//main[@class="site-main"]/article/header/h2/a/@href').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
+		year = getattr(self, "year", None)
+		month = getattr(self, "month", None)
+		day = getattr(self, "day", None)
-        nextPage = response.xpath('//*[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
+		date = datetime.date(int(year), int(month), int(day))
-        if nextPage is not None and nextPage != '':
+		self.baseURL = "http://www.chiapashoy.com.mx/notashoy/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
-            yield scrapy.Request(url=nextPage, callback=self.parse)
+		yield scrapy.Request(url=self.baseURL, callback=self.parse, cb_kwargs={"date":date})
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        "La fecha obtenida ya incluye formato y zona horaria"
+	def parse(self, response, **kwargs):
-        item['date'] = response.xpath('//span[@class="meta-date"]/a/time/@datetime').extract_first()
+		links =  response.css('article').css('h3').css('a::attr(href)').extract()
-        item['title'] = remove_tags(response.xpath('//header[@class="entry-header"]/h1').extract_first())
+		print(links)
-        try:
+		for link in links:
-            topic = response.xpath('//span[@class="meta-category"]/a/text()').extract()[1]
+			yield scrapy.Request(url=link, callback=self.parse_item, cb_kwargs=kwargs)
-        except:
-            topic = response.xpath('//span[@class="meta-category"]/a/text()').extract_first()
-        item['topic'] = topic
-        author = response.xpath('//span[@class="author vcard"]/a/text()').extract_first()
+		nextPage = response.xpath('//*[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
-        if author is not None and author != '':
+		if nextPage is not None and nextPage != '':
-            item['author'] = author
+			yield scrapy.Request(url=nextPage, callback=self.parse, cb_kwargs=kwargs)
-        # bodyText = response.xpath('//*[@class="entry-content clearfix"]/p').extract()
-        # for i in range(0, len(bodyText)):
-        #     p = remove_tags(bodyText[i])
-        #     if i <= 2:
-        for p in response.xpath('//*[@class="entry-content clearfix"]/p').extract():
-            p = remove_tags(p)
-            p = p.lstrip().replace(u'\u2013', "-")
-            result = LOC_RE.match(p)
-            if result:
-                location = DAT_RE.sub('', result.group(0))
-                if location is not None: item['location'] = location
-                p = LOC_RE.sub('', p)
-            text += p + "\n"
+	def parse_item(self, response, **kwargs):
-        item['text'] = text.strip()
+		item = NoticiasItem()
+		text = ''
-        # for p in response.xpath('//*[@class="entry-content clearfix"]/p').extract():
+		item['date'] = kwargs["date"].strftime('%Y/%m/%d') #response.xpath('//span[@class="meta-date"]/a/time/@datetime').extract_first()
-        #     text += remove_tags(p) + "\n"
+		item['title'] = response.css("h1.entry-title::text").extract_first()
-        # item['text'] = text
+		item['topic'] = response.css('li.meta-category').css('a::text').extract_first().replace(" ", "").replace("\n", "")
-        item['url'] = response.url
+		paragraphs = response.css("article").css("div.entry-content").css("p").extract()
+		item['author'] = remove_tags(paragraphs[-1])
+		text = ""
+		for p in paragraphs:
+			text += remove_tags(p) + "\n"
-        yield item
+		item['text'] = text.strip()
+		item['url'] = response.url
+		print(item)
+		yield item