crawlers

80a29bca · Renán Sosa Guillen · a4a62474
Commit 80a29bca authored Feb 01, 2018 by Renán Sosa Guillen
4 changed files
--- a/descarga_por_dia/edoMexDia/edoMexDia/spiders/noticias.py
+++ b/descarga_por_dia/edoMexDia/edoMexDia/spiders/noticias.py
@@ -45,12 +45,25 @@ class QuotesSpider(scrapy.Spider):


    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        lastPage = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-last"]/@href').extract_first()
+        if lastPage is None:
+            lastPage = response.xpath('//div[@class="numbered-pagination"]/a/@href').extract()[-1]
+        if lastPage is not None and lastPage != '':
+            lastPage = lastPage.strip('/')
+            lastPage = int(lastPage[lastPage.rfind('/')+1:])
+
+            for page in range(1, lastPage):
+                yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)
+
+
+    def parse_page(self, response):
        for link in response.xpath('//div[@id="main"]/div/h2[@class="entry_title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

        nextPage = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-next"]/@href').extract_first()
        if nextPage is not None and nextPage != '':
-            print nextPage
            yield scrapy.Request(url=nextPage, callback=self.parse)



--- a/descarga_por_dia/edoMexDia/edoMexDia/spiders/noticias.pyc
+++ b/descarga_por_dia/edoMexDia/edoMexDia/spiders/noticias.pyc
--- a/descarga_por_dia/heraldoLeon/heraldoLeon/spiders/noticias.py
+++ b/descarga_por_dia/heraldoLeon/heraldoLeon/spiders/noticias.py
@@ -34,11 +34,14 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

-        lastPage = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/text()').extract_first()
+        lastPage = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
+        if lastPage is None:
+            lastPage = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()[-1]
        if lastPage is not None and lastPage != '':
-            lastPage = int(lastPage)
+            lastPage = lastPage.strip('/')
+            lastPage = int(lastPage[lastPage.rfind('/')+1:])

-            for page in range(1,lastPage):
+            for page in range(1, lastPage):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)



--- a/descarga_por_dia/heraldoLeon/heraldoLeon/spiders/noticias.pyc
+++ b/descarga_por_dia/heraldoLeon/heraldoLeon/spiders/noticias.pyc