crawlers

ad9b4024 · Renán Sosa Guillen · dbdb00c3 · ad9b4024 · ad9b4024
Commit ad9b4024 authored Dec 04, 2017 by Renán Sosa Guillen
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 1 deletion

noticias.py descarga_por_dia/laJornada/laJornada/spiders/noticias.py +7 -1

noticias.pyc descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc +0 -0

No files found.
--- a/descarga_por_dia/laJornada/laJornada/spiders/noticias.py
+++ b/descarga_por_dia/laJornada/laJornada/spiders/noticias.py
@@ -37,6 +37,7 @@ class QuotesSpider(scrapy.Spider):
 	def start_requests(self):
 		self.tz = UTC()
+		self.counter = 0
 		year = getattr(self, 'year', None)
 		month = getattr(self, 'month', None)
 		day = getattr(self, 'day', None)
@@ -150,6 +151,7 @@ class QuotesSpider(scrapy.Spider):
 				yield request
 		elif self.date > self.comparison_date_6:
+			# print 'first filter'
 			section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
 											'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
@@ -159,6 +161,7 @@ class QuotesSpider(scrapy.Spider):
 				if self.date <= self.comparison_date_7:
 					yield scrapy.Request(url=self.baseURL+s, callback=self.parse_5)
 				elif self.date > self.comparison_date_7:
+					# print 'second filter in ' + self.baseURL + s
 					yield scrapy.Request(url=self.baseURL+s, callback=self.parse_6)
@@ -264,12 +267,15 @@ class QuotesSpider(scrapy.Spider):
 	def parse_6(self, response):
 		if ( response.url[:response.url.rfind('/')+1] == self.baseURL ):
+			linkSet = set()
 			path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
 						 			 '//*[@class="item"]/div/a/@href']
 			for path in path_list:
 				for link in response.xpath(path).extract():
-					yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
+					if link not in linkSet:
+						linkSet.add(link)
+						yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)

--- a/descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc
+++ b/descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc