Commit 3885bd5c authored by Renán Sosa Guillen

crawlers

parent 52d6be74
-from scrapy.spidermiddlewares.httperror import HttpError
 import scrapy, re
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+'''
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+'''
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -72,9 +72,9 @@ class QuotesSpider(scrapy.Spider):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://laverdadnoticias.com/'+year+'/'+month+'/'+day
-        self.stop = False
-        page = 0
+        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
 # while not self.stop:
 # # for page in range(0, 50):
@@ -84,15 +84,6 @@ class QuotesSpider(scrapy.Spider):
 # yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
 # page += 1
-        while not self.stop:
-            if page == 0:
-                yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
-            elif page > 0:
-                yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
-            page += 1
     def parse(self, response):
@@ -100,19 +91,15 @@ class QuotesSpider(scrapy.Spider):
 # print('**********hey, 404! TRUE!!!')
 # self.stop = True
 # else:
-        link_list = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
-        link_list.extend(response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract())
-        link_list.extend(response.xpath('//*[@class="post_header_title one"]/h5/a/@href').extract())
-        for link in link_list:
-            yield scrapy.Request(url=link, callback=self.parse_item)
+        linkList = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
+        linkList.extend(response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract())
+        linkList.extend(response.xpath('//*[@class="post_header_title one"]/h5/a/@href').extract())
+        for link in linkList:
+            yield scrapy.Request(url=link, callback=self.parse_item)
-    def errback_http(self, failure):
-        if failure.check(HttpError):
-            response = failure.value.response
-            self.logger.error('HttpError on %s', response.url)
-            self.stop = True
+        nextPage = response.xpath('//*[@class="pagination"]/a/@href').extract()[-1]
+        yield scrapy.Request(url=nextPage, callback=self.parse)
     def parse_item(self, response):
@@ -129,9 +116,13 @@ class QuotesSpider(scrapy.Spider):
         item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
         item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()
-        for paragraph in response.xpath('//*[@class="post_content_wrapper"]/p').extract():
-            text += remove_tags(paragraph) + '\n'
+        paragraph = response.xpath('//*[@class="post_content_wrapper"]/p').extract()
+        paragraph.extend(response.xpath('//*[@title="Page 1"]/div/p').extract())
+        paragraph.extend(response.xpath('//*[@class="text_exposed_root text_exposed"]/p').extract())
+        for p in paragraph:
+            text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
...
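
The substantive change in this commit is the crawl strategy: the old spider counted pages upward from the date URL and relied on errback_http to flip self.stop on the first HTTP error, while the new parse simply follows the last anchor in the pagination block back into itself. One caveat in the new code is that extract()[-1] raises IndexError when the pagination block is absent. A minimal guarded sketch of the same pattern (an illustration, not part of the commit; the variable name pagination is made up here):

    def parse(self, response):
        # ... yield one scrapy.Request(url=link, callback=self.parse_item)
        #     per article link, as in the hunk above ...
        pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
        if pagination:
            # follow the last pagination anchor back into parse
            yield scrapy.Request(url=pagination[-1], callback=self.parse)
        # when the block is missing (last page), nothing more is yielded
        # and the crawl ends cleanly

With the unguarded version as committed, the IndexError on the final page is caught by Scrapy, so article requests already yielded are still processed and the crawl still terminates, but it shows up in the log as a spider error rather than a clean stop.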