Commit b72e2c6f authored by Mario Chirinos Colunga

Merge branch 'capital_estado_de_mexico' into 'master'

Capital Estado de México, Diario Amanecer, and Cuestión de Polémica

See merge request !3
parents a44e6aa8 df362f1a
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class CapitalestadodemexicoItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CapitalestadodemexicoSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CapitalestadodemexicoDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CapitalestadodemexicoPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for capitalEstadoDeMexico project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "capitalEstadoDeMexico"
SPIDER_MODULES = ["capitalEstadoDeMexico.spiders"]
NEWSPIDER_MODULE = "capitalEstadoDeMexico.spiders"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "capitalEstadoDeMexico (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "capitalEstadoDeMexico.middlewares.CapitalestadodemexicoSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "capitalEstadoDeMexico.middlewares.CapitalestadodemexicoDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "capitalEstadoDeMexico.pipelines.CapitalestadodemexicoPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import json
import re
from capitalEstadoDeMexico.items import CapitalestadodemexicoItem
# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
"""
Remove HTML tags from a string using a regular expression.
Parameters
----------
text : str
The string containing the HTML tags to be removed.
Returns
-------
str
The string without HTML tags.
"""
if not isinstance(text, str):
return text  # Return the original value if it is not a string
return TAG_RE.sub('', text)
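# Illustrative example (added for clarity, not in the original code):
# remove_tags('<p>Hola <b>mundo</b></p>') returns 'Hola mundo'.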
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.capitaledomex.com.mx"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
self.month = month.zfill(2) if month else None
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"https://www.capitaledomex.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
def parse(self, response):
"""
Process a WordPress REST API response and yield the posts whose rendered
content is not empty. For each post, build an item with the fields 'date',
'title', 'text', 'author', 'topic' and 'url' and yield it as a
CapitalestadodemexicoItem.
"""
data = json.loads(response.text)
for post in data:
# Skip posts whose rendered content is empty
content = post.get('content', {}).get('rendered', '').strip()
if not content:
self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
continue
# Build an item with the required fields
item = CapitalestadodemexicoItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['author'] = self.extract_author(post)
item['topic'] = self.extract_topic(post)
item['url'] = post.get('link')
print(item['title'])
yield item
def extract_author(self, post):
"""
Extract the article author from 'yoast_head_json->schema->@graph[6]->name'.
"""
yoast_data = post.get('yoast_head_json', {})
schema_graph = yoast_data.get('schema', {}).get('@graph', [])
if len(schema_graph) > 6:  # make sure index 6 exists
return schema_graph[6].get('name', 'Desconocido')
return "Desconocido"
def extract_topic(self, post):
"""
Extract the article topic from 'yoast_head_json->schema->@graph[5]->articleSection[0]'.
"""
yoast_data = post.get('yoast_head_json', {})
schema_graph = yoast_data.get('schema', {}).get('@graph', [])
if len(schema_graph) > 5:  # make sure index 5 exists
article_section = schema_graph[5].get('articleSection', [])
if isinstance(article_section, list) and article_section:
return article_section[0]  # return the first element if it exists
return "Sin tema"
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = capitalEstadoDeMexico.settings
[deploy]
#url = http://localhost:6800/
project = capitalEstadoDeMexico
[
{"date": "2025-01-16T13:45:16", "title": "Gobierno de Naucalpan reafirma compromiso de dignificar a policías municipales y recuperar la paz del municipio", "text": "Redacción / Agencia Cuestión de POLÉMICA \n• El presidente municipal destacó que la Policía Municipal está en vías de coinvertirse en Guardia Municipal para transformar la manera en la que se conduce.\n• Isaac Montoya enfatiza compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\nNaucalpan de Juárez, Méx., 15 de enero de 2025.- Al reafirmar el compromiso de la dignificación de la Policía Municipal que está en vías de cambiar a Guardia Municipal, el presidente municipal, Isaac Montoya Márquez, encabezó el acto de Pase de Lista, donde destacó el compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\n\nEn el Parque Revolución y ante toda la representatividad de la Dirección General de Seguridad Ciudadana y Movilidad Segura, el alcalde anunció que, la Policía Municipal está en vías de convertirse en la Guardia Municipal, pero no se queda solo en cambio de nombre, sino se transformará la manera en que se conduce.\nMontoya Márquez señaló que, otorgará todo el respaldo y confianza a los elementos policiacos, se mejorarán sus condiciones laborales, se garantizarán las prestaciones, y la capacitación, entrenamiento y equipamiento, pero quien no cumpla y no esté al servicio de la gente, se le puede mover también.\nEnérgicamente enfatizó que, no habrá tolerancia para malas prácticas que afecten a la ciudadanía, estaremos muy vigilantes, por ello, las y los policías tiene que ser ejemplo de servicio y honestidad. Deben recuperar la confianza de la gente, en su policía y en su gobierno, subrayó.\n\nEl presidente municipal afirmó que, el Gobierno de la Transformación en Naucalpan va por la recuperación de los Tecallis que se encuentran abandonados, en beneficio de todas las comunidades, para que se conviertan en una base de reacción inmediata.\nInsistió que el compromiso con la policía municipal es dignificar como nunca antes la labor que realiza cada elemento, pero también tiene que haber compromiso mutuo para garantizar la seguridad de las y los naucalpenses.\nAl descentralizar el gobierno, puntualizó, la policía municipal abarcará mucho más territorio y podrá brindar mayor presencia en las comunidades para atender emergencias con mayor prontitud y reducir así los tiempos de respuesta.", "topic": "municipios", "url": "https://www.cuestiondepolemica.com/gobierno-de-naucalpan-reafirma-compromiso-de-dignificar-a-policias-municipales-y-recuperar-la-paz-del-municipio/", "author": "Agencia Cuestión de POLÉMICA"}
]
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class CuestiondepolemicaItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CuestiondepolemicaSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CuestiondepolemicaDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CuestiondepolemicaPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for cuestionDePolemica project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "cuestionDePolemica"
SPIDER_MODULES = ["cuestionDePolemica.spiders"]
NEWSPIDER_MODULE = "cuestionDePolemica.spiders"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "capitalEstadoDeMexico (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "cuestionDePolemica.pipelines.CuestiondepolemicaPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import re
import json
from cuestionDePolemica.items import CuestiondepolemicaItem
# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
"""Remove HTML tags from the text."""
if not isinstance(text, str):
return text  # Return the original value if it is not a string
return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.cuestiondepolemica.com"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
self.month = month.zfill(2) if month else None
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?"
f"after={self.year}-{self.month}-{self.day}T00:00:00&"
f"before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
else:
self.logger.error("Year, month, and day must be provided to generate start_urls.")
self.start_urls = []
def parse(self, response):
try:
data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON: {e}")
return
for post in data:
try:
content = post.get('content', {}).get('rendered', '').strip()
if content:
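# Hedged note (added comment; an assumption about this site's API): the WordPress
# REST field 'class_list' is often a JSON array of CSS classes. The lookup below
# assumes it is serialized as an object keyed by position, with key '7' holding the
# 'category-<topic>' class; if it arrives as a list, 'topic' simply stays None.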
class_list = post.get('class_list', {})
topic = None
if isinstance(class_list, dict):
topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
# Build the item
item = CuestiondepolemicaItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['topic'] = topic
item['url'] = post.get('link')
print(item['title'])
yield item
except Exception as e:
self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = cuestionDePolemica.settings
[deploy]
#url = http://localhost:6800/
project = cuestionDePolemica
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DiarioamanecerItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class DiarioamanecerSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class DiarioamanecerDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class DiarioamanecerPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for diarioAmanecer project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "diarioAmanecer"
SPIDER_MODULES = ["diarioAmanecer.spiders"]
NEWSPIDER_MODULE = "diarioAmanecer.spiders"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "diarioAmanecer (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "diarioAmanecer.middlewares.DiarioamanecerSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "diarioAmanecer.middlewares.DiarioamanecerDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "diarioAmanecer.pipelines.DiarioamanecerPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import json
import re
from diarioAmanecer.items import DiarioamanecerItem
# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
if not isinstance(text, str):
return text  # Return the original value if it is not a string
return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.diarioamanecer.com.mx", "diarioamanecer.com.mx"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
self.month = month.zfill(2) if month else None
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"https://www.diarioamanecer.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
def parse(self, response):
try:
data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON: {e}")
return
for post in data:
try:
content = post.get('content', {}).get('rendered', '').strip()
if content:
class_list = post.get('class_list', {})
topic = None
if isinstance(class_list, dict):
topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
# Build the item
item = DiarioamanecerItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['topic'] = topic
item['url'] = post.get('link')
print(item['title'])
yield item
except Exception as e:
self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioAmanecer.settings
[deploy]
#url = http://localhost:6800/
project = diarioAmanecer