Commit ad9b4024 authored by Renán Sosa Guillen

crawlers

parent dbdb00c3
......@@ -37,6 +37,7 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self):
self.tz = UTC()
self.counter = 0
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -150,6 +151,7 @@ class QuotesSpider(scrapy.Spider):
yield request
elif self.date > self.comparison_date_6:
# print 'first filter'
section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
......@@ -159,6 +161,7 @@ class QuotesSpider(scrapy.Spider):
if self.date <= self.comparison_date_7:
yield scrapy.Request(url=self.baseURL+s, callback=self.parse_5)
elif self.date > self.comparison_date_7:
# print 'second filter in ' + self.baseURL + s
yield scrapy.Request(url=self.baseURL+s, callback=self.parse_6)
......@@ -264,12 +267,15 @@ class QuotesSpider(scrapy.Spider):
def parse_6(self, response):
if ( response.url[:response.url.rfind('/')+1] == self.baseURL ):
linkSet = set()
path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
'//*[@class="item"]/div/a/@href']
for path in path_list:
for link in response.xpath(path).extract():
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
if link not in linkSet:
linkSet.add(link)
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment