Commit 5c86d2f7 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 58b75087
...@@ -13,7 +13,7 @@ USAGE: ...@@ -13,7 +13,7 @@ USAGE:
import scrapy, re import scrapy, re
from cuartoPoder.items import NoticiasItem from cuartoPoder.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, date, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
...@@ -40,7 +40,9 @@ class ImportantData(scrapy.Item): ...@@ -40,7 +40,9 @@ class ImportantData(scrapy.Item):
Useful data for the flow of the implementation Useful data for the flow of the implementation
""" """
to_next_page = scrapy.Field() to_next_page = scrapy.Field()
next_page = scrapy.Field() is_last_link = scrapy.Field()
next_page = scrapy.Field()
return_url = scrapy.Field()
...@@ -57,10 +59,17 @@ class QuotesSpider(scrapy.Spider): ...@@ -57,10 +59,17 @@ class QuotesSpider(scrapy.Spider):
self.month = getattr(self, "month", None) self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None) self.day = getattr(self, "day", None)
self.stop_date = date(int(self.year), int(self.month), int(self.day))
self.baseURL = "http://www.cuartopoder.mx" self.baseURL = "http://www.cuartopoder.mx"
first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2)) first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&" self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"
self.month_parser = {"Enero" : 1, "Mayo" : 5, "Septiembre" : 9,
"Febrero" : 2, "Junio" : 6, "Octubre" : 10,
"Marzo" : 3, "Julio" : 7, "Noviembre" : 11,
"Abril" : 4, "Agosto" : 8, "Diciembre" : 12}
flow_info = ImportantData() flow_info = ImportantData()
flow_info['to_next_page'] = False flow_info['to_next_page'] = False
flow_info['next_page'] = 2 flow_info['next_page'] = 2
...@@ -74,15 +83,26 @@ class QuotesSpider(scrapy.Spider): ...@@ -74,15 +83,26 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
flow_info = response.meta['item'] flow_info = response.meta['item']
page = flow_info['next_page']
for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract(): if not flow_info['to_next_page']:
flow_info['to_next_page'] = True link_list = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
news_link = self.baseURL + link
for link in link_list:
yield scrapy.Request(url=news_link, callback=self.parse_item) flow_info = ImportantData()
flow_info['next_page'] = page
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
news_link = self.baseURL + link
request = scrapy.Request(url=news_link, callback=self.parse_item)
request.meta['item'] = flow_info
yield request
if flow_info['to_next_page']: else:
page = flow_info['next_page']
page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page)) page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))
flow_info['to_next_page'] = False flow_info['to_next_page'] = False
...@@ -94,28 +114,43 @@ class QuotesSpider(scrapy.Spider): ...@@ -94,28 +114,43 @@ class QuotesSpider(scrapy.Spider):
yield request yield request
def parse_item(self, response):
    """
    Parse a single news-article page.

    Extracts the article's publication date from the metadata list and,
    only when it matches ``self.stop_date`` (the date the spider was asked
    to crawl), builds and yields a ``NoticiasItem`` with date, title,
    topic, body text and URL.  After the last article of a listing page
    (``is_last_link``), re-enqueues the listing URL so ``parse`` advances
    to the next results page.

    Parameters
    ----------
    response : scrapy.http.Response
        Article page; ``response.meta['item']`` carries the
        ``ImportantData`` flow-control item set by ``parse``.

    Yields
    ------
    NoticiasItem
        The scraped article, when its date matches ``stop_date``.
    scrapy.Request
        A follow-up request back to ``parse`` for pagination, when this
        was the last article link of the listing page.
    """
    # Date appears as e.g. "Mayo\xa012, 2018": Spanish month name, then
    # day and year separated by a non-breaking space.
    news_date = response.css('ul.metas-list > li > p').extract_first()
    news_date = remove_tags(news_date)
    news_date = news_date.split(u'\xa0')
    news_date[1] = news_date[1].strip().replace(",", '')
    # month_parser maps the Spanish month name to its number (1-12).
    news_date = date(int(self.year), self.month_parser[news_date[0]], int(news_date[1]))

    # Only articles published exactly on the requested date are scraped.
    if news_date == self.stop_date:
        flow_info = response.meta['item']

        item = NoticiasItem()
        text = ''
        # Store the date as a timezone-aware ISO-8601 string.
        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        title = response.css('div.post-title').css('h1').extract_first()
        if title is not None : title = remove_tags(title)

        topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
        if topic is not None : topic = remove_tags(topic)

        for p in response.css('div.post-content').css('p').extract():
            p = remove_tags(p)
            text += p + "\n"

        ## News item info ##
        item['date'] = news_date
        # BUG FIX: title may legitimately be None (guarded above); calling
        # .strip() unconditionally raised AttributeError on such pages.
        item['title'] = title.strip() if title is not None else title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item

        # Pagination hand-off: after the last article of the listing page,
        # return to `parse` (dont_filter because the URL was already seen)
        # with to_next_page set so it requests the next results page.
        if flow_info['is_last_link']:
            flow_info['to_next_page'] = True
            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse, dont_filter=True)
            request.meta['item'] = flow_info
            yield request
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment