Commit c14dec2a authored by umorales

capitalEdomex

parent 1f690aab
......@@ -28,6 +28,7 @@ def remove_tags(text):
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.capitaledomex.com.mx"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
......@@ -35,7 +36,7 @@ class NoticiasSpider(scrapy.Spider):
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"{allowed_domains[0]}/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
f"https://{self.allowed_domains[0]}/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
def parse(self, response):
......@@ -43,14 +44,20 @@ class NoticiasSpider(scrapy.Spider):
Procesa una respuesta de la API de WordPress y devuelve los posts que
contienen contenido no vacío. Crea un item con los campos 'date', 'title',
'text', 'author', 'topic' y 'url' y lo devuelve como un objeto de tipo
capitalestadodemexicoItem.
CapitalestadodemexicoItem.
"""
try:
data = json.loads(response.text)
except json.JSONDecodeError as e:
self.logger.error(f"Error al decodificar JSON: {e}")
self.logger.error(f"Contenido recibido: {response.text[:500]}") # Muestra los primeros 500 caracteres
return
for post in data:
# Validar que el contenido no esté vacío
content = post.get('content', {}).get('rendered', '').strip()
if not content:
self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
self.logger.info(f"Post {post.get('id')} omitido debido a contenido vacío.")
continue
# Crear un item con los campos requeridos
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment