Commit dd06c629 authored by Renán Sosa Guillen

crawlers

parent d835cdfc
...@@ -31,11 +31,13 @@ class QuotesSpider(scrapy.Spider): ...@@ -31,11 +31,13 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first() pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
pagination = pagination.strip('/') if len(pagination) > 0:
pages = int(pagination[pagination.rfind('/')+1:]) pagination = pagination[-2].strip('/')
for page in range(1, pages): pages = int(pagination[pagination.rfind('/')+1:])
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
...@@ -53,12 +55,24 @@ class QuotesSpider(scrapy.Spider): ...@@ -53,12 +55,24 @@ class QuotesSpider(scrapy.Spider):
d = d[:-6] + '-06:00' d = d[:-6] + '-06:00'
item['date'] = d item['date'] = d
item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2] try:
topic = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
except:
try:
topic = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[1]
except:
topic = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract_first()
item['topic'] = topic
ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first() ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
if ti is None: if ti is None:
ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first() ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first()
item['title'] = ti item['title'] = ti
author = response.xpath('//div[@class="td-post-author-name"]/a/text()').extract_first()
if author is not None:
item['author'] = author
paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract() paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
if len(paragraphs) <= 0: if len(paragraphs) <= 0:
paragraphs = response.xpath('//*[@dir="auto"]').extract() paragraphs = response.xpath('//*[@dir="auto"]').extract()
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment