m3 / crawlersNoticias / Commits / c14dec2a

Commit c14dec2a authored Feb 17, 2025 by umorales
capitalEdomex

parent 1f690aab
Showing 1 changed file with 12 additions and 5 deletions
spiders/daily/capitalEstadoDeMexico/capitalEstadoDeMexico/spiders/noticias.py · view file @ c14dec2a
@@ -28,6 +28,7 @@ def remove_tags(text):
 class NoticiasSpider(scrapy.Spider):
     name = "noticias"
     allowed_domains = ["www.capitaledomex.com.mx"]
     def __init__(self, year=None, month=None, day=None, *args, **kwargs):
         super(NoticiasSpider, self).__init__(*args, **kwargs)
         self.year = year
@@ -35,7 +36,7 @@ class NoticiasSpider(scrapy.Spider):
         self.day = day.zfill(2) if day else None
         if self.year and self.month and self.day:
             self.start_urls = [
-                f"{allowed_domains[0]}/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
+                f"https://{self.allowed_domains[0]}/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
             ]

     def parse(self, response):
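The visible change in this hunk adds the https:// scheme (and the self. prefix on allowed_domains) to the generated start URL; without a scheme, Scrapy cannot turn the string into a valid request. As a worked example, with year=2025, month=02 and day=17 (illustrative values, not taken from the commit) the f-string above produces:

https://www.capitaledomex.com.mx/wp-json/wp/v2/posts?after=2025-02-17T00:00:00&before=2025-02-17T23:59:59&per_page=100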
@@ -43,14 +44,20 @@ class NoticiasSpider(scrapy.Spider):
         Procesa una respuesta de la API de WordPress y devuelve los posts que
         contienen contenido no vacío. Crea un item con los campos 'date', 'title',
         'text', 'author', 'topic' y 'url' y lo devuelve como un objeto de tipo
-        capitalestadodemexicoItem.
+        CapitalestadodemexicoItem.
         """
-        data = json.loads(response.text)
+        try:
+            data = json.loads(response.text)
+        except json.JSONDecodeError as e:
+            self.logger.error(f"Error al decodificar JSON: {e}")
+            self.logger.error(f"Contenido recibido: {response.text[:500]}")  # Muestra los primeros 500 caracteres
+            return

         for post in data:
             # Validar que el contenido no esté vacío
             content = post.get('content', {}).get('rendered', '').strip()
             if not content:
-                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
+                self.logger.info(f"Post {post.get('id')} omitido debido a contenido vacío.")
                 continue
             # Crear un item con los campos requeridos
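For reference, the /wp-json/wp/v2/posts endpoint queried above returns a JSON array of post objects, which is why parse() iterates over data directly. A minimal sketch of the access pattern, using a made-up post object (field values are illustrative, not taken from the site):

post = {
    "id": 12345,                                        # hypothetical post ID
    "date": "2025-02-17T10:30:00",
    "link": "https://www.capitaledomex.com.mx/ejemplo/",
    "title": {"rendered": "Example title"},
    "content": {"rendered": "<p>Example body</p>", "protected": False},
}

# Same chained lookups as the spider: the {} and '' defaults avoid a KeyError
# when 'content' or 'rendered' is missing, and strip() turns a whitespace-only
# body into an empty string so that post is skipped.
content = post.get("content", {}).get("rendered", "").strip()
print(bool(content))  # True here; an empty body would hit the `continue` branch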
@@ -84,4 +91,4 @@ class NoticiasSpider(scrapy.Spider):
         article_section = schema_graph[5].get('articleSection', [])
         if isinstance(article_section, list) and article_section:
             return article_section[0]  # Devuelve el primer elemento si existe
         return "Sin tema"
\ No newline at end of file
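For completeness, a hypothetical way to run the spider with the date arguments consumed by __init__ above. The import path is assumed from the file location shown in this commit and the output feed is illustrative; the project may use a different runner.

# Assumed to run from inside the Scrapy project; names mirror the file path above.
from scrapy.crawler import CrawlerProcess
from capitalEstadoDeMexico.spiders.noticias import NoticiasSpider

process = CrawlerProcess(settings={"FEEDS": {"noticias.json": {"format": "json"}}})
# year/month/day end up in the WordPress API query built in __init__
process.crawl(NoticiasSpider, year="2025", month="02", day="17")
process.start()

The same arguments can also be passed on the command line with: scrapy crawl noticias -a year=2025 -a month=02 -a day=17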