m3 / crawlersNoticias · Commits

Commit 9fded341
authored 3 months ago by umorales
Commit message: "Changes to cuestionDePolemica"
Parent: 8b9f5b5f

Showing 1 changed file with 54 additions and 34 deletions:

spiders/daily/cuestionDePolemica/cuestionDePolemica/spiders/noticias.py (+54, -34)
@@ -7,7 +7,7 @@ from cuestionDePolemica.items import CuestiondepolemicaItem
 
 TAG_RE = re.compile(r'<[^>]+>')
 
 def remove_tags(text):
     """Strip HTML tags from the text."""
     if not isinstance(text, str):
         return text  # Return the original value if it is not a string
     return TAG_RE.sub('', text)
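As a quick sanity check on the helper in the hunk above, the snippet below exercises TAG_RE and remove_tags in isolation; the sample strings are made up for illustration and are not taken from the site.

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Strip HTML tags; pass non-string values through unchanged."""
    if not isinstance(text, str):
        return text
    return TAG_RE.sub('', text)

print(remove_tags('<p>Breaking <b>news</b></p>'))  # -> 'Breaking news'
print(remove_tags(None))                           # -> None (non-strings are returned as-is)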
@@ -15,50 +15,70 @@ def remove_tags(text):
 
 class NoticiasSpider(scrapy.Spider):
     name = "noticias"
     allowed_domains = ["www.cuestiondepolemica.com"]
 
     def __init__(self, year=None, month=None, day=None, *args, **kwargs):
         super(NoticiasSpider, self).__init__(*args, **kwargs)
         self.year = year
         self.month = month.zfill(2) if month else None
         self.day = day.zfill(2) if day else None
 
         if self.year and self.month and self.day:
             self.start_urls = [
-                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
+                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?"
+                f"after={self.year}-{self.month}-{self.day}T00:00:00&"
+                f"before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
             ]
         else:
             self.logger.error("Year, month, and day must be provided to generate start_urls.")
             self.start_urls = []
 
     def parse(self, response):
-        data = json.loads(response.text)
+        try:
+            data = json.loads(response.text)
+            self.logger.info(f"Received {len(data)} posts from API.")
+        except json.JSONDecodeError as e:
+            self.logger.error(f"Failed to parse JSON: {e}")
+            return
 
         for post in data:
-            # Validate that the content is not empty
-            content = post.get('content', {}).get('rendered', '').strip()
-            if not content:
-                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
-                continue
-
-            # Take the seventh element of class_list if it exists
-            class_list = post.get('class_list', [])
-            topic = class_list['7'] if len(class_list) > 7 else None
-
-            # Get the author link
-            author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
-
-            # Build an item with the required fields
-            item = CuestiondepolemicaItem()
-            item['date'] = post.get('date')
-            item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
-            item['text'] = remove_tags(content)
-            item['topic'] = topic.split("category-")[1]
-            item['url'] = post.get('link')
-
-            if author_link:
-                # Make an extra request to fetch the author's name
-                yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
-            else:
-                yield item  # If there is no author URL, yield the item without an author
-
-    def parse_author(self, response):
-        item = response.meta['item']  # Retrieve the item passed through meta
-        author_data = json.loads(response.text)
-        item['author'] = author_data.get('name', 'Unknown')  # Assign the author's name, or 'Unknown' if unavailable
-        print(item["title"])
-        yield item  # Yield the complete item with the author's name included
+            try:
+                # Validate the content
+                content = post.get('content', {}).get('rendered', '').strip()
+                if not remove_tags(content):
+                    self.logger.warning(f"Skipped post {post.get('id')}: No meaningful content.")
+                    continue
+
+                # Get the article category
+                class_list = post.get('class_list', {})
+                topic = None
+                if isinstance(class_list, dict):
+                    topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
+
+                # Prepare the item
+                item = CuestiondepolemicaItem()
+                item['date'] = post.get('date')
+                item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
+                item['text'] = remove_tags(content)
+                item['topic'] = topic
+                item['url'] = post.get('link')
+
+                # Author link
+                author_link = post.get('_links', {}).get('author', [{}])[0].get('href', None)
+
+                if author_link:
+                    yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
+                else:
+                    item['author'] = 'Unknown'
+                    yield item
+            except Exception as e:
+                self.logger.error(f"Error processing post {post.get('id')}: {e}")
+                continue
+
+    def parse_author(self, response):
+        """Process the author information for an article."""
+        try:
+            item = response.meta['item']  # Retrieve the item passed through meta
+            author_data = json.loads(response.text)
+            item['author'] = author_data.get('name', 'Unknown')
+            yield item  # Yield the complete item with the author's name included
+        except Exception as e:
+            self.logger.error(f"Failed to parse author data: {e}")
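For context on how the spider's date arguments are supplied: they are normally passed on the command line (e.g. scrapy crawl noticias -a year=2025 -a month=3 -a day=7, with placeholder dates). The sketch below is the equivalent programmatic launch; it assumes it runs from inside the Scrapy project so get_project_settings() can locate the spider, and it is an illustration rather than part of the commit.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Placeholder date values; single-digit month/day are fine because the
# spider zero-pads them with zfill(2) before building the WP REST API URL.
process = CrawlerProcess(get_project_settings())
process.crawl("noticias", year="2025", month="3", day="7")
process.start()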