Commit 7897ed13 authored by umorales's avatar umorales

quadratin

parent 05bafb3c
import scrapy import scrapy
import scrapy
import json import json
import re import re
from quadratinEdomex.items import QuadratinedomexItem from quadratinEdomex.items import QuadratinedomexItem
# Regular expression matching a single HTML/XML tag (anything in <...>).
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML tag stripped out.

    Non-string values (None, numbers, ...) are passed through untouched
    so callers never need a type check before calling this helper.
    """
    return TAG_RE.sub('', text) if isinstance(text, str) else text
# Scrapy spider that fetches one calendar day of posts from the
# edomex.quadratin.com.mx WordPress REST API (wp-json/wp/v2/posts).
# NOTE(review): this block is a garbled side-by-side diff scrape — most lines
# are duplicated and two hunks are elided (`...@@` markers). Text is kept
# byte-identical; only comments were added/translated to English.
class NoticiasSpider(scrapy.Spider): class NoticiasSpider(scrapy.Spider):
name = "noticias" name = "noticias"
allowed_domains = ["edomex.quadratin.com.mx"] allowed_domains = ["edomex.quadratin.com.mx"]
start_urls = ["https://edomex.quadratin.com.mx/"] start_urls = ["https://edomex.quadratin.com.mx/"]
# year/month/day come from the CLI (`scrapy crawl noticias -a year=... ...`);
# they select the single day queried via the API's after/before filters.
def __init__(self, year=None, month=None, day=None, *args, **kwargs): def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs) super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year self.year = year
...@@ -31,13 +25,17 @@ class NoticiasSpider(scrapy.Spider): ...@@ -31,13 +25,17 @@ class NoticiasSpider(scrapy.Spider):
# Rebuild start_urls to hit the REST API for the requested day only
# (after=T00:00:00, before=T23:59:59, up to 100 posts per page).
self.start_urls = [ self.start_urls = [
f"https://edomex.quadratin.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100" f"https://edomex.quadratin.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
] ]
print(self.start_urls[0])
# Parse the JSON list of posts returned by the WordPress API and yield
# one QuadratinedomexItem per post (item assembly is in an elided hunk).
def parse(self, response): def parse(self, response):
try: try:
# Try to decode the JSON payload of the response
data = json.loads(response.text) data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.") self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
# If the JSON cannot be decoded, log the error and stop processing
self.logger.error(f"Failed to parse JSON: {e}") self.logger.error(f"Failed to parse JSON: {e}")
self.logger.error(f"Response content: {response.text[:500]}...") # Log the first 500 characters of the response
return return
for post in data: for post in data:
...@@ -59,6 +57,6 @@ class NoticiasSpider(scrapy.Spider): ...@@ -59,6 +57,6 @@ class NoticiasSpider(scrapy.Spider):
print(item['title']) print(item['title'])
yield item yield item
except Exception as e: except Exception as e:
# If processing a post fails, log the error and continue with the next one
self.logger.error(f"Error processing post {post.get('id')}: {e}") self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue continue
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment