Commit b72e2c6f authored by Mario Chirinos Colunga

Merge branch 'capital_estado_de_mexico' into 'master'

Capital Estado de México, Diario Amanecer, and Cuestión de Polémica

See merge request !3
parents a44e6aa8 df362f1a
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class CapitalestadodemexicoItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CapitalestadodemexicoSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CapitalestadodemexicoDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CapitalestadodemexicoPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for capitalEstadoDeMexico project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "capitalEstadoDeMexico"
SPIDER_MODULES = ["capitalEstadoDeMexico.spiders"]
NEWSPIDER_MODULE = "capitalEstadoDeMexico.spiders"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "capitalEstadoDeMexico (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "capitalEstadoDeMexico.middlewares.CapitalestadodemexicoSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "capitalEstadoDeMexico.middlewares.CapitalestadodemexicoDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "capitalEstadoDeMexico.pipelines.CapitalestadodemexicoPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import json
import re
from capitalEstadoDeMexico.items import CapitalestadodemexicoItem
# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
"""
Remove HTML tags from a string using a regular expression.
Parameters
----------
text : str
The string containing the HTML tags to be removed.
Returns
-------
str
The string without HTML tags.
"""
if not isinstance(text, str):
return text  # Return the original value if it is not a string
return TAG_RE.sub('', text)
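# Illustrative example (added for clarity, not in the original code):
# remove_tags('<p>Hola <b>mundo</b></p>') returns 'Hola mundo'.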
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.capitaledomex.com.mx"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
self.month = month.zfill(2) if month else None
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"https://www.capitaledomex.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
def parse(self, response):
"""
Process a WordPress REST API response and yield the posts whose rendered
content is not empty. For each post, build an item with the fields 'date',
'title', 'text', 'author', 'topic' and 'url' and yield it as a
CapitalestadodemexicoItem.
"""
data = json.loads(response.text)
for post in data:
# Skip posts whose rendered content is empty
content = post.get('content', {}).get('rendered', '').strip()
if not content:
self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
continue
# Build an item with the required fields
item = CapitalestadodemexicoItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['author'] = self.extract_author(post)
item['topic'] = self.extract_topic(post)
item['url'] = post.get('link')
print(item['title'])
yield item
def extract_author(self, post):
"""
Extract the article author from 'yoast_head_json->schema->@graph[6]->name'.
"""
yoast_data = post.get('yoast_head_json', {})
schema_graph = yoast_data.get('schema', {}).get('@graph', [])
if len(schema_graph) > 6:  # make sure index 6 exists
return schema_graph[6].get('name', 'Desconocido')
return "Desconocido"
def extract_topic(self, post):
"""
Extract the article topic from 'yoast_head_json->schema->@graph[5]->articleSection[0]'.
"""
yoast_data = post.get('yoast_head_json', {})
schema_graph = yoast_data.get('schema', {}).get('@graph', [])
if len(schema_graph) > 5:  # make sure index 5 exists
article_section = schema_graph[5].get('articleSection', [])
if isinstance(article_section, list) and article_section:
return article_section[0]  # return the first element if it exists
return "Sin tema"
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = capitalEstadoDeMexico.settings
[deploy]
#url = http://localhost:6800/
project = capitalEstadoDeMexico
[
{"date": "2025-01-16T13:45:16", "title": "Gobierno de Naucalpan reafirma compromiso de dignificar a policías municipales y recuperar la paz del municipio", "text": "Redacción / Agencia Cuestión de POLÉMICA \n• El presidente municipal destacó que la Policía Municipal está en vías de coinvertirse en Guardia Municipal para transformar la manera en la que se conduce.\n• Isaac Montoya enfatiza compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\nNaucalpan de Juárez, Méx., 15 de enero de 2025.- Al reafirmar el compromiso de la dignificación de la Policía Municipal que está en vías de cambiar a Guardia Municipal, el presidente municipal, Isaac Montoya Márquez, encabezó el acto de Pase de Lista, donde destacó el compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\n\nEn el Parque Revolución y ante toda la representatividad de la Dirección General de Seguridad Ciudadana y Movilidad Segura, el alcalde anunció que, la Policía Municipal está en vías de convertirse en la Guardia Municipal, pero no se queda solo en cambio de nombre, sino se transformará la manera en que se conduce.\nMontoya Márquez señaló que, otorgará todo el respaldo y confianza a los elementos policiacos, se mejorarán sus condiciones laborales, se garantizarán las prestaciones, y la capacitación, entrenamiento y equipamiento, pero quien no cumpla y no esté al servicio de la gente, se le puede mover también.\nEnérgicamente enfatizó que, no habrá tolerancia para malas prácticas que afecten a la ciudadanía, estaremos muy vigilantes, por ello, las y los policías tiene que ser ejemplo de servicio y honestidad. Deben recuperar la confianza de la gente, en su policía y en su gobierno, subrayó.\n\nEl presidente municipal afirmó que, el Gobierno de la Transformación en Naucalpan va por la recuperación de los Tecallis que se encuentran abandonados, en beneficio de todas las comunidades, para que se conviertan en una base de reacción inmediata.\nInsistió que el compromiso con la policía municipal es dignificar como nunca antes la labor que realiza cada elemento, pero también tiene que haber compromiso mutuo para garantizar la seguridad de las y los naucalpenses.\nAl descentralizar el gobierno, puntualizó, la policía municipal abarcará mucho más territorio y podrá brindar mayor presencia en las comunidades para atender emergencias con mayor prontitud y reducir así los tiempos de respuesta.", "topic": "municipios", "url": "https://www.cuestiondepolemica.com/gobierno-de-naucalpan-reafirma-compromiso-de-dignificar-a-policias-municipales-y-recuperar-la-paz-del-municipio/", "author": "Agencia Cuestión de POLÉMICA"}
]
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class CuestiondepolemicaItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CuestiondepolemicaSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class CuestiondepolemicaDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CuestiondepolemicaPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for cuestionDePolemica project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "cuestionDePolemica"
SPIDER_MODULES = ["cuestionDePolemica.spiders"]
NEWSPIDER_MODULE = "cuestionDePolemica.spiders"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "capitalEstadoDeMexico (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "cuestionDePolemica.pipelines.CuestiondepolemicaPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import re
import json
from cuestionDePolemica.items import CuestiondepolemicaItem
# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
"""Remove HTML tags from the text."""
if not isinstance(text, str):
return text  # Return the original value if it is not a string
return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.cuestiondepolemica.com"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
self.month = month.zfill(2) if month else None
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?"
f"after={self.year}-{self.month}-{self.day}T00:00:00&"
f"before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
else:
self.logger.error("Year, month, and day must be provided to generate start_urls.")
self.start_urls = []
def parse(self, response):
try:
data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON: {e}")
return
for post in data:
try:
content = post.get('content', {}).get('rendered', '').strip()
if content:
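# Hedged note (added comment; an assumption about this site's API): the WordPress
# REST field 'class_list' is often a JSON array of CSS classes. The lookup below
# assumes it is serialized as an object keyed by position, with key '7' holding the
# 'category-<topic>' class; if it arrives as a list, 'topic' simply stays None.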
class_list = post.get('class_list', {})
topic = None
if isinstance(class_list, dict):
topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
# Build the item
item = CuestiondepolemicaItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['topic'] = topic
item['url'] = post.get('link')
print(item['title'])
yield item
except Exception as e:
self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = cuestionDePolemica.settings
[deploy]
#url = http://localhost:6800/
project = cuestionDePolemica
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DiarioamanecerItem(scrapy.Item):
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
pass
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class DiarioamanecerSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class DiarioamanecerDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class DiarioamanecerPipeline:
def process_item(self, item, spider):
return item
# Scrapy settings for diarioAmanecer project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "diarioAmanecer"
SPIDER_MODULES = ["diarioAmanecer.spiders"]
NEWSPIDER_MODULE = "diarioAmanecer.spiders"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "diarioAmanecer (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "diarioAmanecer.middlewares.DiarioamanecerSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "diarioAmanecer.middlewares.DiarioamanecerDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "diarioAmanecer.pipelines.DiarioamanecerPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import json
import re
from diarioAmanecer.items import DiarioamanecerItem
# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
if not isinstance(text, str):
return text  # Return the original value if it is not a string
return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = "noticias"
allowed_domains = ["www.diarioamanecer.com.mx", "diarioamanecer.com.mx"]
def __init__(self, year=None, month=None, day=None, *args, **kwargs):
super(NoticiasSpider, self).__init__(*args, **kwargs)
self.year = year
self.month = month.zfill(2) if month else None
self.day = day.zfill(2) if day else None
if self.year and self.month and self.day:
self.start_urls = [
f"https://www.diarioamanecer.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
]
def parse(self, response):
try:
data = json.loads(response.text)
self.logger.info(f"Received {len(data)} posts from API.")
except json.JSONDecodeError as e:
self.logger.error(f"Failed to parse JSON: {e}")
return
for post in data:
try:
content = post.get('content', {}).get('rendered', '').strip()
if content:
class_list = post.get('class_list', {})
topic = None
if isinstance(class_list, dict):
topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
# Build the item
item = DiarioamanecerItem()
item['date'] = post.get('date')
item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
item['text'] = remove_tags(content)
item['topic'] = topic
item['url'] = post.get('link')
print(item['title'])
yield item
except Exception as e:
self.logger.error(f"Error processing post {post.get('id')}: {e}")
continue
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioAmanecer.settings
[deploy]
#url = http://localhost:6800/
project = diarioAmanecer