crawlers

15cf2c16 · Renán Sosa Guillen · 9c843009 · 15cf2c16
Commit 15cf2c16 authored Apr 06, 2018 by Renán Sosa Guillen
Show whitespace changes
Inline Side-by-side

Showing with 11 additions and 9 deletions

noticias.py ...ia_atras/foraneos/heraldoHn/heraldoHn/spiders/noticias.py +11 -9

No files found.
--- a/descarga_hacia_atras/foraneos/heraldoHn/heraldoHn/spiders/noticias.py
+++ b/descarga_hacia_atras/foraneos/heraldoHn/heraldoHn/spiders/noticias.py
@@ -53,7 +53,7 @@ class QuotesSpider(scrapy.Spider):
            for s in sectionList:
                info = ImportantData()
                info['page'] = 1
-                request = scrapy.Request(url=baseURL + s, callback=self.parse)
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
                request.meta['item'] = info
                yield request

@@ -62,7 +62,7 @@ class QuotesSpider(scrapy.Spider):
                info = ImportantData()
                info['page'] = 1
                info['CONTINUE_SEARCHING'] = False
-                request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
                request.meta['item'] = info
                yield request

@@ -70,16 +70,18 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
        searchData = response.meta['item']
        CONTINUE_SEARCHING = True
+        linkSet = set()
        if searchData['page'] == 1:
-            searchData['section_url'] = response.url + "/"
-            entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
-            entrySet.remove(searchData['section_url'])
-            linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
+            searchData['section_url'] = response.url
+            linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
            linkSet.remove(searchData['section_url'])
-            linkSet.union(entrySet)

        else:
-            linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
+            linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
            try:
                linkSet.remove(searchData['section_url'])
            except KeyError:
@@ -103,7 +105,7 @@ class QuotesSpider(scrapy.Spider):

        if not CONTINUE_SEARCHING:
            if searchData['page'] == 1:
-                searchData['section_url'] = response.url + "/"
+                searchData['section_url'] = response.url
                entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
                entrySet.remove(searchData['section_url'])
                linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())