Commit eac90030 authored by Mario Chirinos

el comentario colima

parent c6e78aaa
@@ -40,37 +40,37 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
    """Schedule one request per listing page of the crawled section.

    The number of listing pages is read from the href of the
    ``li.mkd-pagination-last-page`` link, which is expected to contain
    ``page/<number>``. A request for ``<response.url>/page/<k>`` is yielded
    for every k from 1 up to that number, each handled by ``parse_page``.

    When the last-page link is missing or carries no page number, the
    current response is handed to ``parse_page`` directly instead of
    failing on a ``None`` href.

    Args:
        response: the downloaded start page.

    Yields:
        scrapy.Request objects for the listing pages (or whatever
        ``parse_page`` yields in the fallback case).
    """
    # Function-scope import so the block does not depend on the file header.
    import re

    print("parse", response.url)

    last_href = response.css("li.mkd-pagination-last-page").css("a::attr(href)").extract_first()
    match = re.search(r"page/(\d+)", last_href) if last_href else None

    if match is None:
        # No usable pagination: treat this response as the only listing page.
        for request in self.parse_page(response):
            yield request
        return

    # Strip a trailing slash so the built URL never contains "//page/".
    base_url = response.url.rstrip("/")
    for page_number in range(1, int(match.group(1)) + 1):
        next_page = base_url + "/page/" + str(page_number)
        yield scrapy.Request(url=next_page, callback=self.parse_page)
def parse_page(self, response):
    """Follow every article link found on one listing page.

    Article links are the ``href`` attributes of anchors located under
    ``h5`` elements. Each href is resolved against the page URL before the
    request is built, so relative links do not make ``scrapy.Request``
    fail for lack of a scheme.

    Args:
        response: a downloaded listing page.

    Yields:
        scrapy.Request objects whose callback is ``parse_item``.
    """
    print("parse_page", response.url)

    for href in response.css("h5").css("a::attr(href)").extract():
        # urljoin leaves absolute URLs unchanged and completes relative ones.
        yield scrapy.Request(url=response.urljoin(href), callback=self.parse_item)
def parse_item(self, response):
    """Build a NoticiasItem from a single article page.

    Fields filled in:
        date  -- content of the ``article:published_time`` meta tag
        title -- content of the ``og:title`` meta tag
        topic -- text of the first link inside ``div.mkd-post-info-category``
        text  -- every selected ``<p>`` passed through ``remove_tags``,
                 joined with newlines and stripped
        url   -- the URL of the response

    The meta/topic lookups use ``extract_first()``, so a missing element
    leaves the corresponding field as ``None``.

    Args:
        response: the downloaded article page.

    Yields:
        The populated NoticiasItem.
    """
    print("parse_item", response.url)

    item = NoticiasItem()
    item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
    item["topic"] = response.css("div.mkd-post-info-category").css("a::text").extract_first()

    # NOTE(review): the container selector is an empty string in the visible
    # source; confirm which element is meant to wrap the article body.
    paragraphs = response.css("").css("p").extract()

    # One join instead of repeated "+=": same text once the ends are stripped.
    item['text'] = "\n".join(remove_tags(p) for p in paragraphs).strip()
    item['url'] = response.url

    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment