crawlers

dd06c629 · Renán Sosa Guillen · d835cdfc
Commit dd06c629 authored Jan 29, 2018 by Renán Sosa Guillen
4 changed files
--- a/descarga_por_dia/heraldoAgs/heraldoAgs/pipelines.pyc
+++ b/descarga_por_dia/heraldoAgs/heraldoAgs/pipelines.pyc
--- a/descarga_por_dia/heraldoAgs/heraldoAgs/spiders/noticias.pyc
+++ b/descarga_por_dia/heraldoAgs/heraldoAgs/spiders/noticias.pyc
--- a/descarga_por_dia/laRazon/laRazon/spiders/noticias.py
+++ b/descarga_por_dia/laRazon/laRazon/spiders/noticias.py
@@ -31,11 +31,13 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
+        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
-        pagination = pagination.strip('/')
+        if len(pagination) > 0:
-        pages = int(pagination[pagination.rfind('/')+1:])
+            pagination = pagination[-2].strip('/')
-        for page in range(1, pages):
+            pages = int(pagination[pagination.rfind('/')+1:])
-            yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
    def parse_page(self, response):
@@ -53,12 +55,24 @@ class QuotesSpider(scrapy.Spider):
            d = d[:-6] + '-06:00'
        item['date'] = d
-        item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
+        try:
+            topic = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
+        except:
+            try:
+                topic = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[1]
+            except:
+                topic = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract_first()
+        item['topic'] = topic
        ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
        if ti is None:
            ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first()
        item['title'] = ti
+        author = response.xpath('//div[@class="td-post-author-name"]/a/text()').extract_first()
+        if author is not None:
+            item['author'] = author
        paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
        if len(paragraphs) <= 0:
            paragraphs = response.xpath('//*[@dir="auto"]').extract()

--- a/descarga_por_dia/laRazon/laRazon/spiders/noticias.pyc
+++ b/descarga_por_dia/laRazon/laRazon/spiders/noticias.pyc