animalPolitico

parent 131e7b80
"""
Spider for jornada.com.mx
Author: Mario Chirinos Coluga
Spider for animalpolitico.com
Author: Ulises Morales Ramirez
Usage: scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
"""
......@@ -106,10 +106,21 @@ class NoticiasSpider(scrapy.Spider):
for edge in data.get("data", {}).get("allPostTypes", {}).get("edges", []):
node = edge.get("node", {})
item['date'] = node.get("date")
item['title'] = remove_tags(node.get("title"))
item['topic'] = remove_tags(node.get("contentTypeName"))
item['text'] = remove_tags(node.get("contentRendered"))
item['url'] = "https://"+self.allowed_domains[0]+"/" + node.get("uri")
item['author'] = node.get("author", {}).get("node", {}).get("name")
yield item
date = node.get("date")
title = node.get("title")
topic = node.get("contentTypeName")
text = node.get("contentRendered")
uri = node.get("uri")
author_name = node.get("author", {}).get("node", {}).get("name")
if all([date, title, topic, text, uri, author_name]): # Verificar que todos los campos existen
item = {
'date': date,
'title': remove_tags(title),
'topic': remove_tags(topic),
'text': remove_tags(text),
'url': "https://"+self.allowed_domains[0]+ uri,
'author': author_name
}
yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment