Commit 15cf2c16 authored by Renán Sosa Guillen

crawlers

parent 9c843009
...@@ -53,7 +53,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -53,7 +53,7 @@ class QuotesSpider(scrapy.Spider):
for s in sectionList: for s in sectionList:
info = ImportantData() info = ImportantData()
info['page'] = 1 info['page'] = 1
request = scrapy.Request(url=baseURL + s, callback=self.parse) request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = info request.meta['item'] = info
yield request yield request
...@@ -62,7 +62,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -62,7 +62,7 @@ class QuotesSpider(scrapy.Spider):
info = ImportantData() info = ImportantData()
info['page'] = 1 info['page'] = 1
info['CONTINUE_SEARCHING'] = False info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date) request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = info request.meta['item'] = info
yield request yield request
...@@ -70,16 +70,18 @@ class QuotesSpider(scrapy.Spider): ...@@ -70,16 +70,18 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
searchData = response.meta['item'] searchData = response.meta['item']
CONTINUE_SEARCHING = True CONTINUE_SEARCHING = True
linkSet = set()
if searchData['page'] == 1: if searchData['page'] == 1:
searchData['section_url'] = response.url + "/" searchData['section_url'] = response.url
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract()) linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
entrySet.remove(searchData['section_url']) linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract()) linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
linkSet.remove(searchData['section_url']) linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
else: else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract()) linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
try: try:
linkSet.remove(searchData['section_url']) linkSet.remove(searchData['section_url'])
except KeyError: except KeyError:
...@@ -103,7 +105,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -103,7 +105,7 @@ class QuotesSpider(scrapy.Spider):
if not CONTINUE_SEARCHING: if not CONTINUE_SEARCHING:
if searchData['page'] == 1: if searchData['page'] == 1:
searchData['section_url'] = response.url + "/" searchData['section_url'] = response.url
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract()) entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url']) entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract()) linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment