Commit c8bfa686 authored by Mario Chirinos

el financiero

parent 21e0de1b
...@@ -33,15 +33,15 @@ class NoticiasSpider(scrapy.Spider): ...@@ -33,15 +33,15 @@ class NoticiasSpider(scrapy.Spider):
#----------------------------------------------------------------------- #-----------------------------------------------------------------------
def parse_item(self, response): def parse_item(self, response):
print(response.url) # print(response.url)
item = DiariopuntualItem() item = DiariopuntualItem()
item["date"] = self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2) item["date"] = self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2)
item["title"] = response.xpath('//title/text()').extract_first().replace("| Diario Puntual","").strip() item["title"] = response.xpath('//title/text()').extract_first().replace("| Diario Puntual","").strip()
item["topic"] = "" item["topic"] = ""
text="" text=""
for p in response.xpath('//article/p').extract(): for p in response.xpath('//article/p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n "
item["text"]=text item["text"]=text
item["url"]=response.url item["url"]=response.url
print(item["title"]) print(item)
yield(item) yield(item)
...@@ -48,11 +48,12 @@ class NoticiasSpider(scrapy.Spider): ...@@ -48,11 +48,12 @@ class NoticiasSpider(scrapy.Spider):
# #
item['title'] = response.xpath('//meta[@property="og:title"]/@content').extract_first() item['title'] = response.xpath('//meta[@property="og:title"]/@content').extract_first()
item['date'] = self.date item['date'] = self.date
item['topic'] = response.xpath('//meta[@name="keywords"]/@content').extract() item['topic'] = response.xpath('//meta[@name="keywords"]/@content').extract_first().split(",")
# item['author'] = response.xpath('//span[contains(@class, "sc__author--name")]/text()').extract_first()[0:-2].strip() # item['author'] = response.xpath('//span[contains(@class, "sc__author--name")]/text()').extract_first()[0:-2].strip()
item['text']=text item['text']=text
item['url']= response.xpath('//link[@rel="canonical"]/@href').extract_first() item['url']= response.xpath('//link[@rel="canonical"]/@href').extract_first()
print(item['date'], item['title']) print(item['date'], item['title'])
# print(item)
yield(item) yield(item)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment