animal politico

parent b22533c0
......@@ -7,6 +7,10 @@ import scrapy
class AnimalpoliticoItem(scrapy.Item):
    """Container for one Animal Político news article scraped by the
    ``noticias`` spider.

    The stray ``pass`` left over from the Scrapy project template has been
    removed — it was dead code once the fields below were added.
    """

    date = scrapy.Field()      # publication date exactly as reported by the API
    title = scrapy.Field()     # headline (HTML tags stripped by the spider)
    text = scrapy.Field()      # article body (HTML tags stripped by the spider)
    location = scrapy.Field()  # NOTE(review): never populated in this file — confirm other pipelines use it
    author = scrapy.Field()    # author name taken from the GraphQL author node
    topic = scrapy.Field()     # content type name, used as the article topic
    url = scrapy.Field()       # article URL
"""
Spider for jornada.com.mx
Author: Mario Chirinos Coluga
Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
"""
import scrapy
import json
import re
from animalPolitico.payload_manager import PayloadManager
from animalPolitico.items import AnimalpoliticoItem
# Pre-compiled once at import time so repeated calls don't re-parse the pattern.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip HTML tags from *text* using a regular expression.

    Parameters
    ----------
    text : str
        String that may contain HTML tags.

    Returns
    -------
    str
        The input with every ``<...>`` tag removed. Non-string inputs
        (e.g. ``None`` from a missing API field) are returned unchanged.
    """
    if isinstance(text, str):
        return TAG_RE.sub('', text)
    # Pass anything that is not a string through untouched.
    return text
class NoticiasSpider(scrapy.Spider):
    """Spider that downloads news articles from Animal Político's GraphQL API.

    The site publishes its article index through a GraphQL endpoint, so
    instead of crawling HTML pages this spider POSTs one ``FetchAllPostTypes``
    query restricted to the date passed on the command line
    (``-a year=Y -a month=M -a day=D``) and converts each returned node into
    an :class:`AnimalpoliticoItem`.
    """

    name = "noticias"
    allowed_domains = ['animalpolitico.com']
    start_urls = ["https://panel.animalpolitico.com/graphql"]

    # NOTE(review): these class attributes run at import time and their result
    # is not used by start_requests() below — presumably leftover scaffolding;
    # confirm nothing else reads them before removing.
    payload_manager = PayloadManager()
    payload = payload_manager.get_payload()

    # GraphQL document sent with every request; kept verbatim from the API.
    _QUERY = """query FetchAllPostTypes($first: Int, $where: RootQueryToAllPostTypesConnectionWhereArgs) {
      allPostTypes(first: $first, where: $where) {
        edges {
          node {
            databaseId
            id
            title
            slug
            titleHome
            contentTypeName
            uri
            image
            excerpt
            categoryPrimarySlug
            blogSlug
            blogIsOrganization
            sabuesoType
            date
            dateGmt
            isContentSponsored
            contentRendered
            blogId
            readingTime
            postExcerpt
            blogAuthor
            authorNote
            videoUrl
            authorName
            terms { edges { node { id slug name __typename } __typename } __typename }
            relatedPosts {
              id
              databaseId
              title
              slug
              titleHome
              contentTypeName
              uri
              image
              sabuesoType
              terms { edges { node { id slug __typename } __typename } __typename }
              author { node { name lastName firstName __typename } __typename }
              featuredImage { node { id sourceUrl imageFooter __typename } __typename }
              __typename
            }
            featuredImage { node { id sourceUrl(size: LARGE) imageFooter __typename } __typename }
            author { node { name __typename } __typename }
            __typename
          }
          __typename
        }
        pageInfo { offsetPagination { total __typename } __typename }
        __typename
      }
    }"""

    def _build_payload(self):
        """Return the GraphQL request body limited to the spider's date.

        Both the ``after`` and ``before`` bounds use the same day with
        ``inclusive: True``, so exactly one calendar day of articles is
        requested. Raises ``TypeError``/``ValueError`` if the ``year``,
        ``month`` or ``day`` arguments were not supplied or are not numeric.
        """
        date = {
            "day": int(self.day),
            "month": int(self.month),
            "year": int(self.year),
        }
        return {
            "operationName": "FetchAllPostTypes",
            "variables": {
                "where": {
                    "search": "",
                    "orderby": [
                        {"field": "DATE", "order": "DESC"}
                    ],
                    "dateQuery": {
                        "inclusive": True,
                        # Separate copies so a later mutation of one bound
                        # cannot silently change the other.
                        "after": dict(date),
                        "before": dict(date),
                    },
                    # Single page of up to 1000 articles; no follow-up
                    # pagination requests are issued.
                    "offsetPagination": {"size": 1000, "offset": 0},
                }
            },
            "query": self._QUERY,
        }

    def start_requests(self):
        """Send one POST request with the GraphQL query for the target date.

        The ``year``/``month``/``day`` attributes are bound by Scrapy from the
        ``-a`` command-line arguments; ``getattr`` defaults them to ``None`` so
        a missing argument fails with a clear error in ``_build_payload``.
        The JSON response is handled by :meth:`parse`.
        """
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        headers = {'Content-Type': 'application/json'}
        # NOTE(review): the original request kwargs were truncated in a merge;
        # body/callback reconstructed from context — verify against git history.
        yield scrapy.Request(
            url=self.start_urls[0],
            method="POST",
            headers=headers,
            body=json.dumps(self._build_payload()),
            callback=self.parse,
        )

    def parse(self, response):
        """Parse the GraphQL JSON response and yield one item per article.

        Fixes a bug in the previous version, which created a single
        ``AnimalpoliticoItem`` outside the loop and yielded the same mutated
        instance for every article — all yielded references shared state.
        """
        data = json.loads(response.text)
        edges = data.get("data", {}).get("allPostTypes", {}).get("edges", [])
        for edge in edges:
            node = edge.get("node", {})
            item = AnimalpoliticoItem()
            item['date'] = node.get("date")
            item['title'] = remove_tags(node.get("title"))
            item['topic'] = remove_tags(node.get("contentTypeName"))
            item['text'] = remove_tags(node.get("contentRendered"))
            # Guard against a missing uri so concatenation cannot raise.
            # NOTE(review): the base is the GraphQL panel URL, not the public
            # site — presumably intentional; confirm downstream consumers.
            item['url'] = self.start_urls[0] + (node.get("uri") or "")
            item['author'] = node.get("author", {}).get("node", {}).get("name")
            yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment