Commit 7897ed13 authored by umorales's avatar umorales

quadratin

parent 05bafb3c
import scrapy
import scrapy
import json
import re
from quadratinEdomex.items import QuadratinedomexItem
# Regular expression that matches HTML tags so they can be stripped.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML tag removed.

    Args:
        text: The value to clean. Only ``str`` values are processed.

    Returns:
        The input string with all ``<...>`` sequences deleted, or the
        original value unchanged when it is not a string.
    """
    if not isinstance(text, str):
        # Non-string values (None, numbers, ...) are passed through as-is.
        return text
    return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
# Scrapy spider named "noticias", restricted to the edomex.quadratin.com.mx domain.
# NOTE(review): this view is a diff excerpt; indentation is lost and the two
# "@@" hunk markers below stand in for code that is not visible here.
name = "noticias"
allowed_domains = ["edomex.quadratin.com.mx"]
# Default start URL; __init__ replaces it with a date-filtered API query.
start_urls = ["https://edomex.quadratin.com.mx/"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
# Accepts an optional year/month/day and forwards any remaining arguments
# to the scrapy.Spider constructor.
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
# NOTE(review): self.month and self.day are presumably assigned in the elided
# hunk below — confirm against the full file.
......@@ -31,13 +25,17 @@ class NoticiasSpider(scrapy.Spider):
# Query the site's wp-json/wp/v2/posts endpoint for posts between 00:00:00 and
# 23:59:59 of the requested date, asking for 100 results per page.
self.start_urls = [
f"https://edomex.quadratin.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
# Print the generated URL to stdout.
print(self.start_urls[0])
def parse(self, response):
# Decodes the response body as JSON, then iterates over the decoded posts
# and yields items.
try:
# Try to decode the JSON from the response
data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
# If decoding the JSON fails, log the error and stop processing
self.logger.error(f"Failed to parse JSON: {e}")
self.logger.error(f"Response content: {response.text[:500]}...") # Log the first 500 characters of the response
return
for post in data:
# NOTE(review): the code that builds `item` from `post` is elided by the
# hunk marker below — see the full file for the field mapping.
......@@ -59,6 +57,6 @@ class NoticiasSpider(scrapy.Spider):
print(item['title'])
yield item
except Exception as e:
# If processing a post fails, log the error and continue with the next one
self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment