merge with dev

a16827a4 · Renán Sosa Guillen · 3edb1097 · 5c86d2f7 · a16827a4 · a16827a4
Commit a16827a4 authored Nov 22, 2018 by Renán Sosa Guillen
Show whitespace changes
Inline Side-by-side

Showing with 69 additions and 34 deletions

README.md README.md +1 -1

noticias.py descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py +68 -33

No files found.
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Se incluyen los siguientes medios nacionales:
 	Acceso por día:
 	```bash
-	http://www.cuartopoder.mx/archivo/portada/listado/30-08-2018/30-08-2018/
+	http://www.cuartopoder.mx/archivo/portada/listado/08-30-2018/08-30-2018/
 	```
 	Uso:

--- a/descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py
@@ -13,7 +13,7 @@ USAGE:
 import scrapy, re
 from cuartoPoder.items import NoticiasItem
-from datetime import datetime, timedelta, tzinfo
+from datetime import datetime, date, timedelta, tzinfo
 TAG_RE = re.compile(r'<[^>]+>')
@@ -40,7 +40,9 @@ class ImportantData(scrapy.Item):
    Useful data for the flow of the implementation
    """
    to_next_page = scrapy.Field()
+    is_last_link = scrapy.Field()
    next_page    = scrapy.Field()
+    return_url   = scrapy.Field()
@@ -57,10 +59,17 @@ class QuotesSpider(scrapy.Spider):
        self.month = getattr(self, "month", None)
        self.day   = getattr(self, "day", None)
+        self.stop_date = date(int(self.year), int(self.month), int(self.day))
        self.baseURL = "http://www.cuartopoder.mx"
        first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
        self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"
+        self.month_parser = {"Enero"   : 1,  "Mayo"   : 5,  "Septiembre" : 9,
+                             "Febrero" : 2,  "Junio"  : 6,  "Octubre"    : 10,
+                             "Marzo"   : 3,  "Julio"  : 7,  "Noviembre"  : 11,
+                             "Abril"   : 4,  "Agosto" : 8,  "Diciembre"  : 12}
        flow_info = ImportantData()
        flow_info['to_next_page'] = False
        flow_info['next_page'] = 2
@@ -74,15 +83,26 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
        flow_info = response.meta['item']
+        page = flow_info['next_page']
+        if not flow_info['to_next_page']:
+            link_list = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
+            for link in link_list:
+                flow_info = ImportantData()
+                flow_info['next_page']  = page
+                flow_info['return_url'] = response.url
+                if link == link_list[-1] : flow_info['is_last_link'] = True
+                else                     : flow_info['is_last_link'] = False 
-        for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract():
-            flow_info['to_next_page'] = True
                news_link = self.baseURL + link
+                request = scrapy.Request(url=news_link, callback=self.parse_item)
+                request.meta['item'] = flow_info
-            yield scrapy.Request(url=news_link, callback=self.parse_item)
+                yield request
-        if flow_info['to_next_page']:
+        else:
-            page = flow_info['next_page']
            page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))        
            flow_info['to_next_page'] = False
@@ -94,8 +114,15 @@ class QuotesSpider(scrapy.Spider):
            yield request
    def parse_item(self, response):
+        news_date = response.css('ul.metas-list > li > p').extract_first()
+        news_date = remove_tags(news_date)
+        news_date = news_date.split(u'\xa0')
+        news_date[1] = news_date[1].strip().replace(",", '')
+        news_date = date(int(self.year), self.month_parser[news_date[0]], int(news_date[1]))
+        if news_date == self.stop_date:
+            flow_info = response.meta['item']
            item = NoticiasItem()
            text = ''
@@ -113,9 +140,17 @@ class QuotesSpider(scrapy.Spider):
            ## News item info ##
            item['date']  = news_date
-        item['title'] = title
+            item['title'] = title.strip()
            item['topic'] = topic
            item['text']  = text.strip()
            item['url']   = response.url
            yield item
+            if flow_info['is_last_link']:
+                flow_info['to_next_page'] = True
+                request = scrapy.Request(url=flow_info['return_url'], callback=self.parse, dont_filter=True)
+                request.meta['item'] = flow_info
+                yield request
\ No newline at end of file