Commit 3885bd5c authored by Renán Sosa Guillen

crawlers

parent 52d6be74
-from scrapy.spidermiddlewares.httperror import HttpError
 import scrapy, re
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+'''
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+'''
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -72,9 +72,9 @@ class QuotesSpider(scrapy.Spider):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://laverdadnoticias.com/'+year+'/'+month+'/'+day
-        self.stop = False
-        page = 0
+        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
 # while not self.stop:
 # # for page in range(0, 50):
@@ -84,15 +84,6 @@ class QuotesSpider(scrapy.Spider):
 # yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
 # page += 1
-        while not self.stop:
-            if page == 0:
-                yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
-            elif page > 0:
-                yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
-            page += 1
     def parse(self, response):
@@ -100,19 +91,15 @@ class QuotesSpider(scrapy.Spider):
 # print('**********hey, 404! TRUE!!!')
 # self.stop = True
 # else:
-        link_list = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
-        link_list.extend(response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract())
-        link_list.extend(response.xpath('//*[@class="post_header_title one"]/h5/a/@href').extract())
-        for link in link_list:
-            yield scrapy.Request(url=link, callback=self.parse_item)
+        linkList = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
+        linkList.extend(response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract())
+        linkList.extend(response.xpath('//*[@class="post_header_title one"]/h5/a/@href').extract())
+        for link in linkList:
+            yield scrapy.Request(url=link, callback=self.parse_item)
-    def errback_http(self, failure):
-        if failure.check(HttpError):
-            response = failure.value.response
-            self.logger.error('HttpError on %s', response.url)
-            self.stop = True
+        nextPage = response.xpath('//*[@class="pagination"]/a/@href').extract()[-1]
+        yield scrapy.Request(url=nextPage, callback=self.parse)
     def parse_item(self, response):
@@ -129,9 +116,13 @@ class QuotesSpider(scrapy.Spider):
         item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
         item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()
-        for paragraph in response.xpath('//*[@class="post_content_wrapper"]/p').extract():
-            text += remove_tags(paragraph) + '\n'
+        paragraph = response.xpath('//*[@class="post_content_wrapper"]/p').extract()
+        paragraph.extend(response.xpath('//*[@title="Page 1"]/div/p').extract())
+        paragraph.extend(response.xpath('//*[@class="text_exposed_root text_exposed"]/p').extract())
+        for p in paragraph:
+            text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
...
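
The substantive change in this commit is the crawl strategy: the old spider counted pages upward from the date URL and relied on errback_http to flip self.stop on the first HTTP error, while the new parse simply follows the last anchor in the pagination block back into itself. One caveat in the new code is that extract()[-1] raises IndexError when the pagination block is absent. A minimal guarded sketch of the same pattern (an illustration, not part of the commit; the variable name pagination is made up here):

    def parse(self, response):
        # ... yield one scrapy.Request(url=link, callback=self.parse_item)
        #     per article link, as in the hunk above ...
        pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
        if pagination:
            # follow the last pagination anchor back into parse
            yield scrapy.Request(url=pagination[-1], callback=self.parse)
        # when the block is missing (last page), nothing more is yielded
        # and the crawl ends cleanly

With the unguarded version as committed, the IndexError on the final page is caught by Scrapy, so article requests already yielded are still processed and the crawl still terminates, but it shows up in the log as a spider error rather than a clean stop.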