diario amanecer

parent 1d65b97e
...@@ -28,41 +28,32 @@ class NoticiasSpider(scrapy.Spider): ...@@ -28,41 +28,32 @@ class NoticiasSpider(scrapy.Spider):
] ]
def parse(self, response): def parse(self, response):
try:
data = json.loads(response.text) data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON: {e}")
return
for post in data: for post in data:
# Validar que el contenido no esté vacío try:
content = post.get('content', {}).get('rendered', '').strip() content = post.get('content', {}).get('rendered', '').strip()
if not content: if content:
self.logger.info(f"Post {post.get('id')} skipped due to empty content.") class_list = post.get('class_list', {})
continue topic = None
if isinstance(class_list, dict):
# Obtener el séptimo elemento de class_list si existe topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
class_list = post.get('class_list', [])
topic = class_list[7] if len(class_list) > 7 else None # Preparar item
item = CuestiondepolemicaItem()
# Obtener el enlace del autor
author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
# Crear un item con los campos requeridos
item = DiarioamanecerItem()
item['date'] = post.get('date') item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', '')) item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content) item['text'] = remove_tags(content)
item['topic'] = topic.split("category-")[1] item['topic'] = topic
item['url'] = post.get('link') item['url'] = post.get('link')
#item['author'] = author_link print(item['title'])
print(author_link) yield item
if author_link: except Exception as e:
# Hacer una solicitud adicional para obtener el nombre del autor self.logger.error(f"Error processing post {post.get('id')}: {e}")
yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item}) continue
else:
yield item # Si no hay URL del autor, se devuelve el item sin autor
def parse_author(self, response):
item = response.meta['item'] # Recupera el item pasado a través de meta
author_data = json.loads(response.text)
item['author'] = author_data.get('name', 'Unknown') # Asigna el nombre del autor o 'Unknown' si no está disponible
print(item["title"])
yield item # Devuelve el item completo con el nombre del autor incluido
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment