animalPolitico

parent 131e7b80
""" """
Spider for jornada.com.mx Spider for animalpolitico.com
Author: Mario Chirinos Coluga Author: Ulises Morales Ramirez
Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23 Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
""" """
...@@ -106,10 +106,21 @@ class NoticiasSpider(scrapy.Spider): ...@@ -106,10 +106,21 @@ class NoticiasSpider(scrapy.Spider):
for edge in data.get("data", {}).get("allPostTypes", {}).get("edges", []): for edge in data.get("data", {}).get("allPostTypes", {}).get("edges", []):
node = edge.get("node", {}) node = edge.get("node", {})
item['date'] = node.get("date")
item['title'] = remove_tags(node.get("title")) date = node.get("date")
item['topic'] = remove_tags(node.get("contentTypeName")) title = node.get("title")
item['text'] = remove_tags(node.get("contentRendered")) topic = node.get("contentTypeName")
item['url'] = "https://"+self.allowed_domains[0]+"/" + node.get("uri") text = node.get("contentRendered")
item['author'] = node.get("author", {}).get("node", {}).get("name") uri = node.get("uri")
author_name = node.get("author", {}).get("node", {}).get("name")
if all([date, title, topic, text, uri, author_name]): # Verificar que todos los campos existen
item = {
'date': date,
'title': remove_tags(title),
'topic': remove_tags(topic),
'text': remove_tags(text),
'url': "https://"+self.allowed_domains[0]+ uri,
'author': author_name
}
yield item yield item
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment