animal politico

parent b22533c0
......@@ -7,6 +7,10 @@ import scrapy
class AnimalpoliticoItem(scrapy.Item):
    """Container for one Animal Político news article scraped by the
    ``noticias`` spider.

    The stray ``pass`` left over from the Scrapy project template has been
    removed — it was dead code once the fields below were added.
    """

    date = scrapy.Field()      # publication date exactly as reported by the API
    title = scrapy.Field()     # headline (HTML tags stripped by the spider)
    text = scrapy.Field()      # article body (HTML tags stripped by the spider)
    location = scrapy.Field()  # NOTE(review): never populated in this file — confirm other pipelines use it
    author = scrapy.Field()    # author name taken from the GraphQL author node
    topic = scrapy.Field()     # content type name, used as the article topic
    url = scrapy.Field()       # article URL
"""
Spider for jornada.com.mx
Author: Mario Chirinos Coluga
Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
"""
import scrapy
import json
import re
from animalPolitico.payload_manager import PayloadManager
from animalPolitico.items import AnimalpoliticoItem
# Pre-compiled once at import time so repeated calls don't re-parse the pattern.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip HTML tags from *text* using a regular expression.

    Parameters
    ----------
    text : str
        String that may contain HTML tags.

    Returns
    -------
    str
        The input with every ``<...>`` tag removed. Non-string inputs
        (e.g. ``None`` from a missing API field) are returned unchanged.
    """
    if isinstance(text, str):
        return TAG_RE.sub('', text)
    # Pass anything that is not a string through untouched.
    return text
class NoticiasSpider(scrapy.Spider):
    """Spider that downloads news articles from Animal Político's GraphQL API.

    The site publishes its article index through a GraphQL endpoint, so
    instead of crawling HTML pages this spider POSTs one ``FetchAllPostTypes``
    query restricted to the date passed on the command line
    (``-a year=Y -a month=M -a day=D``) and converts each returned node into
    an :class:`AnimalpoliticoItem`.
    """

    name = "noticias"
    allowed_domains = ['animalpolitico.com']
    start_urls = ["https://panel.animalpolitico.com/graphql"]

    # NOTE(review): these class attributes run at import time and their result
    # is not used by start_requests() below — presumably leftover scaffolding;
    # confirm nothing else reads them before removing.
    payload_manager = PayloadManager()
    payload = payload_manager.get_payload()

    # GraphQL document sent with every request; kept verbatim from the API.
    _QUERY = """query FetchAllPostTypes($first: Int, $where: RootQueryToAllPostTypesConnectionWhereArgs) {
      allPostTypes(first: $first, where: $where) {
        edges {
          node {
            databaseId
            id
            title
            slug
            titleHome
            contentTypeName
            uri
            image
            excerpt
            categoryPrimarySlug
            blogSlug
            blogIsOrganization
            sabuesoType
            date
            dateGmt
            isContentSponsored
            contentRendered
            blogId
            readingTime
            postExcerpt
            blogAuthor
            authorNote
            videoUrl
            authorName
            terms { edges { node { id slug name __typename } __typename } __typename }
            relatedPosts {
              id
              databaseId
              title
              slug
              titleHome
              contentTypeName
              uri
              image
              sabuesoType
              terms { edges { node { id slug __typename } __typename } __typename }
              author { node { name lastName firstName __typename } __typename }
              featuredImage { node { id sourceUrl imageFooter __typename } __typename }
              __typename
            }
            featuredImage { node { id sourceUrl(size: LARGE) imageFooter __typename } __typename }
            author { node { name __typename } __typename }
            __typename
          }
          __typename
        }
        pageInfo { offsetPagination { total __typename } __typename }
        __typename
      }
    }"""

    def _build_payload(self):
        """Return the GraphQL request body limited to the spider's date.

        Both the ``after`` and ``before`` bounds use the same day with
        ``inclusive: True``, so exactly one calendar day of articles is
        requested. Raises ``TypeError``/``ValueError`` if the ``year``,
        ``month`` or ``day`` arguments were not supplied or are not numeric.
        """
        date = {
            "day": int(self.day),
            "month": int(self.month),
            "year": int(self.year),
        }
        return {
            "operationName": "FetchAllPostTypes",
            "variables": {
                "where": {
                    "search": "",
                    "orderby": [
                        {"field": "DATE", "order": "DESC"}
                    ],
                    "dateQuery": {
                        "inclusive": True,
                        # Separate copies so a later mutation of one bound
                        # cannot silently change the other.
                        "after": dict(date),
                        "before": dict(date),
                    },
                    # Single page of up to 1000 articles; no follow-up
                    # pagination requests are issued.
                    "offsetPagination": {"size": 1000, "offset": 0},
                }
            },
            "query": self._QUERY,
        }

    def start_requests(self):
        """Send one POST request with the GraphQL query for the target date.

        The ``year``/``month``/``day`` attributes are bound by Scrapy from the
        ``-a`` command-line arguments; ``getattr`` defaults them to ``None`` so
        a missing argument fails with a clear error in ``_build_payload``.
        The JSON response is handled by :meth:`parse`.
        """
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        headers = {'Content-Type': 'application/json'}
        # NOTE(review): the original request kwargs were truncated in a merge;
        # body/callback reconstructed from context — verify against git history.
        yield scrapy.Request(
            url=self.start_urls[0],
            method="POST",
            headers=headers,
            body=json.dumps(self._build_payload()),
            callback=self.parse,
        )

    def parse(self, response):
        """Parse the GraphQL JSON response and yield one item per article.

        Fixes a bug in the previous version, which created a single
        ``AnimalpoliticoItem`` outside the loop and yielded the same mutated
        instance for every article — all yielded references shared state.
        """
        data = json.loads(response.text)
        edges = data.get("data", {}).get("allPostTypes", {}).get("edges", [])
        for edge in edges:
            node = edge.get("node", {})
            item = AnimalpoliticoItem()
            item['date'] = node.get("date")
            item['title'] = remove_tags(node.get("title"))
            item['topic'] = remove_tags(node.get("contentTypeName"))
            item['text'] = remove_tags(node.get("contentRendered"))
            # Guard against a missing uri so concatenation cannot raise.
            # NOTE(review): the base is the GraphQL panel URL, not the public
            # site — presumably intentional; confirm downstream consumers.
            item['url'] = self.start_urls[0] + (node.get("uri") or "")
            item['author'] = node.get("author", {}).get("node", {}).get("name")
            yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment