Commit 9fded341 authored by umorales

Cambios en Cuestión de Polémica

parent 8b9f5b5f
......@@ -7,7 +7,7 @@ from cuestionDePolemica.items import CuestiondepolemicaItem
# Matches any single HTML/XML tag such as <p>, </div> or <img src="...">.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip HTML tags from *text*.

    Every substring matching ``TAG_RE`` (an opening ``<``, one or more
    non-``>`` characters, and a closing ``>``) is removed. HTML entities
    such as ``&amp;`` are left untouched.

    Args:
        text: The value to clean. Expected to be a string.

    Returns:
        The string without tags, or *text* itself, unchanged, when it is
        not a string (for example ``None``).
    """
    if not isinstance(text, str):
        return text  # Pass non-string values through untouched.
    return TAG_RE.sub('', text)
......@@ -15,50 +15,70 @@ def remove_tags(text):
class NoticiasSpider(scrapy.Spider):
    """Collect the posts published on one given day from the site's REST API.

    The spider queries the WordPress ``wp-json/wp/v2/posts`` endpoint for the
    day selected through the ``year``, ``month`` and ``day`` spider arguments,
    builds one ``CuestiondepolemicaItem`` per post with non-empty content and,
    when the post links to an author resource, performs a second request to
    resolve the author's display name.
    """

    name = "noticias"
    allowed_domains = ["www.cuestiondepolemica.com"]

    # Prefix that marks a category entry inside a post's ``class_list``.
    CATEGORY_PREFIX = "category-"

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        """Store the requested date and derive the single start URL from it.

        Args:
            year: Year of the day to crawl (e.g. ``"2024"``).
            month: Month of the day to crawl; zero-padded to two digits.
            day: Day of the month to crawl; zero-padded to two digits.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super(NoticiasSpider, self).__init__(*args, **kwargs)
        self.year = year
        # str() lets callers pass ints as well as the strings Scrapy's -a gives.
        self.month = str(month).zfill(2) if month else None
        self.day = str(day).zfill(2) if day else None

        if self.year and self.month and self.day:
            date = f"{self.year}-{self.month}-{self.day}"
            # One URL only: adjacent string literals are concatenated on purpose.
            # NOTE(review): only the first page (up to 100 posts) is requested;
            # confirm that no day ever exceeds that, otherwise add pagination.
            self.start_urls = [
                "https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?"
                f"after={date}T00:00:00&"
                f"before={date}T23:59:59&per_page=100"
            ]
        else:
            self.logger.error("Year, month, and day must be provided to generate start_urls.")
            self.start_urls = []

    def parse(self, response):
        """Turn the post listing into items or follow-up author requests.

        Yields:
            A ``scrapy.Request`` to the author resource (carrying the partially
            filled item in ``meta['item']``) when the post exposes an author
            link, otherwise the item itself with ``author`` set to
            ``'Unknown'``.
        """
        try:
            data = json.loads(response.text)
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse JSON: {e}")
            return

        # An API error is delivered as a JSON object, not as a list of posts.
        if not isinstance(data, list):
            self.logger.error(
                "Unexpected API payload: expected a list of posts, got %s.",
                type(data).__name__,
            )
            return
        self.logger.info(f"Received {len(data)} posts from API.")

        for post in data:
            if not isinstance(post, dict):
                self.logger.error(f"Error processing post: unexpected entry {post!r}")
                continue

            post_id = post.get('id')
            try:
                item = self._build_item(post)
                author_link = self._author_link(post)
            except (AttributeError, IndexError, KeyError, TypeError) as e:
                # Malformed post structure: report it and keep crawling.
                self.logger.error(f"Error processing post {post_id}: {e}")
                continue

            if item is None:
                self.logger.warning(f"Skipped post {post_id}: No meaningful content.")
                continue

            if author_link:
                # dont_filter: many posts share the same author URL, and the
                # duplicate filter would otherwise drop all but the first one.
                yield scrapy.Request(
                    url=author_link,
                    callback=self.parse_author,
                    errback=self._author_failed,
                    meta={'item': item},
                    dont_filter=True,
                )
            else:
                item['author'] = 'Unknown'
                yield item

    def parse_author(self, response):
        """Complete the pending item with the author's name and emit it.

        The item is always yielded; when the author payload cannot be read the
        ``author`` field falls back to ``'Unknown'``.
        """
        item = response.meta['item']  # Item handed over by parse() through meta.
        try:
            author_data = json.loads(response.text)
            item['author'] = author_data.get('name', 'Unknown')
        except (json.JSONDecodeError, AttributeError) as e:
            self.logger.error(f"Failed to parse author data: {e}")
            item['author'] = 'Unknown'
        yield item

    def _author_failed(self, failure):
        """Emit the pending item with an unknown author when its request fails."""
        request = failure.request
        self.logger.error(f"Failed to fetch author data from {request.url}: {failure.value!r}")
        item = request.meta['item']
        item['author'] = 'Unknown'
        yield item

    def _build_item(self, post):
        """Build the item for *post*, or return ``None`` if it has no text."""
        content = post.get('content', {}).get('rendered', '').strip()
        text = remove_tags(content)
        if not text.strip():
            return None

        item = CuestiondepolemicaItem()
        item['date'] = post.get('date')
        item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
        item['text'] = text
        item['topic'] = self._extract_topic(post.get('class_list'))
        item['url'] = post.get('link')
        return item

    @classmethod
    def _extract_topic(cls, class_list):
        """Return the first category slug found in *class_list*, else ``None``.

        ``class_list`` may arrive either as a list of class names or as a
        mapping keyed by position, so both shapes are accepted and the
        category is located by its prefix instead of a fixed index.
        """
        if isinstance(class_list, dict):
            entries = class_list.values()
        elif isinstance(class_list, (list, tuple)):
            entries = class_list
        else:
            return None

        for entry in entries:
            if isinstance(entry, str) and entry.startswith(cls.CATEGORY_PREFIX):
                return entry[len(cls.CATEGORY_PREFIX):]
        return None

    @staticmethod
    def _author_link(post):
        """Return the URL of the post's author resource, or ``None``."""
        authors = post.get('_links', {}).get('author') or [{}]
        return authors[0].get('href')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment