Commit d87ceed5 authored by Renán Sosa Guillen

crawlers

parent 831874a7
#backup files generated by text editors # # backup files generated by text editors #
*~ *~
# python generated files # # python generated files #
......
...@@ -30,9 +30,9 @@ class QuotesSpider(scrapy.Spider): ...@@ -30,9 +30,9 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract() pagination = response.css('div.vw-page-navigation-pagination').css('a::attr(href)').extract()
if len(pagination) > 0: if len(pagination) > 0:
pagination = pagination[-1].strip('/') pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages): for page in range(1, pages):
...@@ -40,7 +40,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -40,7 +40,7 @@ class QuotesSpider(scrapy.Spider):
def parse_page(self, response): def parse_page(self, response):
for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract(): for link in response.css('div.vw-post-loop-inner').css('div.vw-post-box-inner').xpath('./h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
...@@ -48,12 +48,22 @@ class QuotesSpider(scrapy.Spider): ...@@ -48,12 +48,22 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
## la fecha de la noticia ya incluye la zona horaria ## la fecha de la noticia ya incluye la zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['date'] = response.xpath('//time[@itemprop="datePublished"]/@datetime').extract_first()
item['title'] = response.css('h1.story-title::text').extract_first() item['title'] = remove_tags(response.xpath('//div[@class="vw-page-content"]/article/h1[@class="entry-title"]').extract_first())
item['topic'] = response.css('h3.story-cat::text').extract_first()
item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first() topic = response.xpath('//div[@class="vw-page-content"]/article/div[@class="vw-post-categories"]/a').extract_first()
if topic is not None:
for paragraph in response.xpath('//div[@id="content-area"]/p').extract(): item['topic'] = remove_tags(topic)
else:
item['topic'] = topic
author = response.xpath('//span[@itemprop="author"]/a[@class="author-name"]').extract_first()
if author is not None:
item['author'] = remove_tags(author)
else:
item['author'] = author
for paragraph in response.xpath('//div[@itemprop="articleBody"]/p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment