diario amanecer

parent 1d65b97e
......@@ -28,41 +28,32 @@ class NoticiasSpider(scrapy.Spider):
]
# Scrapy callback: decodes the JSON body of `response` and, for each post in
# it, yields either an item or a follow-up request for the post's author.
#
# NOTE(review): this span looks like a diff hunk whose +/- markers and
# indentation were stripped, so two variants of the body are interleaved:
# the JSON is decoded twice, class_list is read both as a dict and as a list,
# and both CuestiondepolemicaItem and DiarioamanecerItem are built.
# Confirm which variant is current before relying on it.
def parse(self, response):
data = json.loads(response.text)
# Variant with error handling: log the number of posts, or log the decode
# error and stop without yielding anything.
try:
data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON: {e}")
return
for post in data:
# Validate that the content is not empty
content = post.get('content', {}).get('rendered', '').strip()
if not content:
self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
try:
content = post.get('content', {}).get('rendered', '').strip()
if content:
class_list = post.get('class_list', {})
topic = None
if isinstance(class_list, dict):
# NOTE(review): split("category-")[1] raises IndexError when the value under
# key '7' does not contain "category-"; the except below logs it.
topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
# Prepare the item
item = CuestiondepolemicaItem()
item['date'] = post.get('date')
# remove_tags is applied to the rendered title and content before storing.
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['topic'] = topic
item['url'] = post.get('link')
# Prints the title to stdout.
print(item['title'])
yield item
except Exception as e:
self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue
# Take class_list[7] if the list is long enough (the original comment calls
# it the seventh element), otherwise None
class_list = post.get('class_list', [])
topic = class_list[7] if len(class_list) > 7 else None
# Get the author link
# NOTE(review): [0] raises IndexError if '_links' -> 'author' is an empty list.
author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
# Create an item with the required fields
item = DiarioamanecerItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
# NOTE(review): topic is None when class_list has fewer than 8 entries, in
# which case .split raises AttributeError here.
item['topic'] = topic.split("category-")[1]
item['url'] = post.get('link')
#item['author'] = author_link
# Prints the author link to stdout.
print(author_link)
if author_link:
# Issue an additional request to fetch the author's name; the partially
# filled item travels with the request in meta['item'].
yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
else:
yield item # No author URL: emit the item without an author
def parse_author(self, response):
    """Fill in the author's name on a pending item and emit it.

    The item is read from ``response.meta['item']`` and the response body is
    decoded as JSON. The item's ``author`` field is set to the ``name`` value
    of that JSON object.

    The item is always yielded. If the body is not valid JSON, is not a JSON
    object, or carries no usable ``name``, the author falls back to
    ``'Unknown'`` so that a bad author response does not lose the item.

    Args:
        response: Response whose ``meta`` holds the item under ``'item'``
            and whose ``text`` is the author payload.

    Yields:
        The same item with ``item['author']`` set.
    """
    item = response.meta['item']  # Item handed over through the request meta

    try:
        author_data = json.loads(response.text)
    except json.JSONDecodeError as e:
        # Keep the item: a broken author response should only cost the name.
        self.logger.error("Failed to parse author JSON: %s", e)
        author_data = None

    author_name = None
    if isinstance(author_data, dict):
        author_name = author_data.get('name')

    # A missing, null or empty name all map to the same placeholder.
    item['author'] = author_name or 'Unknown'

    # Logged instead of printed; .get avoids a KeyError on items without title.
    self.logger.debug("Author resolved for item: %s", item.get('title'))
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment