cuestiondepolemica
[
{"date": "2025-01-16T13:45:16", "title": "Gobierno de Naucalpan reafirma compromiso de dignificar a policías municipales y recuperar la paz del municipio", "text": "Redacción / Agencia Cuestión de POLÉMICA \n• El presidente municipal destacó que la Policía Municipal está en vías de coinvertirse en Guardia Municipal para transformar la manera en la que se conduce.\n• Isaac Montoya enfatiza compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\nNaucalpan de Juárez, Méx., 15 de enero de 2025.- Al reafirmar el compromiso de la dignificación de la Policía Municipal que está en vías de cambiar a Guardia Municipal, el presidente municipal, Isaac Montoya Márquez, encabezó el acto de Pase de Lista, donde destacó el compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\n\nEn el Parque Revolución y ante toda la representatividad de la Dirección General de Seguridad Ciudadana y Movilidad Segura, el alcalde anunció que, la Policía Municipal está en vías de convertirse en la Guardia Municipal, pero no se queda solo en cambio de nombre, sino se transformará la manera en que se conduce.\nMontoya Márquez señaló que, otorgará todo el respaldo y confianza a los elementos policiacos, se mejorarán sus condiciones laborales, se garantizarán las prestaciones, y la capacitación, entrenamiento y equipamiento, pero quien no cumpla y no esté al servicio de la gente, se le puede mover también.\nEnérgicamente enfatizó que, no habrá tolerancia para malas prácticas que afecten a la ciudadanía, estaremos muy vigilantes, por ello, las y los policías tiene que ser ejemplo de servicio y honestidad. Deben recuperar la confianza de la gente, en su policía y en su gobierno, subrayó.\n\nEl presidente municipal afirmó que, el Gobierno de la Transformación en Naucalpan va por la recuperación de los Tecallis que se encuentran abandonados, en beneficio de todas las comunidades, para que se conviertan en una base de reacción inmediata.\nInsistió que el compromiso con la policía municipal es dignificar como nunca antes la labor que realiza cada elemento, pero también tiene que haber compromiso mutuo para garantizar la seguridad de las y los naucalpenses.\nAl descentralizar el gobierno, puntualizó, la policía municipal abarcará mucho más territorio y podrá brindar mayor presencia en las comunidades para atender emergencias con mayor prontitud y reducir así los tiempos de respuesta.", "topic": "municipios", "url": "https://www.cuestiondepolemica.com/gobierno-de-naucalpan-reafirma-compromiso-de-dignificar-a-policias-municipales-y-recuperar-la-paz-del-municipio/", "author": "Agencia Cuestión de POLÉMICA"}
]
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class CuestiondepolemicaItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
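A minimal sketch, not part of the project, of how a populated item can be read generically through ItemAdapter, the same interface the middleware and pipeline templates below import; the field values here are made up:

from itemadapter import ItemAdapter

# Hypothetical sample values, for illustration only
example = CuestiondepolemicaItem(title="Example headline", topic="municipios")
adapter = ItemAdapter(example)
assert adapter.get("title") == "Example headline"
assert adapter.asdict() == {"title": "Example headline", "topic": "municipios"}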
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class CuestiondepolemicaSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class CuestiondepolemicaDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CuestiondepolemicaPipeline:
    def process_item(self, item, spider):
        return item
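The template pipeline above passes every item through unchanged. As a hypothetical extension (the class name is my own, not part of the project), a pipeline could enforce the same non-empty-text rule the spider applies, dropping anything that slips through; it would only take effect once registered under ITEM_PIPELINES in settings.py:

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DropEmptyTextPipeline:
    # Hypothetical: discard items whose 'text' field is missing or blank,
    # mirroring the empty-content check already performed in the spider
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if not (adapter.get("text") or "").strip():
            raise DropItem(f"Empty text in item from {adapter.get('url')}")
        return item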
# Scrapy settings for cuestionDePolemica project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "cuestionDePolemica"
SPIDER_MODULES = ["cuestionDePolemica.spiders"]
NEWSPIDER_MODULE = "cuestionDePolemica.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "capitalEstadoDeMexico (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "cuestionDePolemica.pipelines.CuestiondepolemicaPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import re
import json
from cuestionDePolemica.items import CuestiondepolemicaItem

# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    if not isinstance(text, str):
        return text  # Return the original value if it is not a string
    return TAG_RE.sub('', text)


class NoticiasSpider(scrapy.Spider):
    name = "noticias"
    allowed_domains = ["www.cuestiondepolemica.com"]

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.year = year
        self.month = month.zfill(2) if month else None
        self.day = day.zfill(2) if day else None
        if self.year and self.month and self.day:
            self.start_urls = [
                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
            ]

    def parse(self, response):
        data = json.loads(response.text)
        for post in data:
            # Skip posts whose rendered content is empty
            content = post.get('content', {}).get('rendered', '').strip()
            if not content:
                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
                continue
            # The entry at index 7 of class_list carries the "category-..."
            # CSS class. Depending on the WordPress version, class_list may be
            # serialized as a JSON array or as an object with string keys.
            class_list = post.get('class_list', [])
            if isinstance(class_list, dict):
                topic = class_list.get('7')
            else:
                topic = class_list[7] if len(class_list) > 7 else None
            # Link to the author endpoint of the WordPress REST API
            author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
            # Build an item with the required fields
            item = CuestiondepolemicaItem()
            item['date'] = post.get('date')
            item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
            item['text'] = remove_tags(content)
            item['topic'] = topic.split("category-")[1] if topic and "category-" in topic else None
            item['url'] = post.get('link')
            if author_link:
                # Make a follow-up request to resolve the author's name
                yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
            else:
                yield item  # No author URL, so yield the item without an author

    def parse_author(self, response):
        item = response.meta['item']  # Retrieve the item passed through meta
        author_data = json.loads(response.text)
        item['author'] = author_data.get('name', 'Unknown')  # Use the author's name, or 'Unknown' if unavailable
        print(item["title"])
        yield item  # Yield the completed item, author included
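With the project laid out as above, a single day's crawl could be launched from the project root roughly like this (the output filename is arbitrary); the -a options supply the year, month and day arguments consumed by __init__, and -O writes the scraped items as a UTF-8 JSON feed like the sample at the top of this commit:

scrapy crawl noticias -a year=2025 -a month=1 -a day=15 -O noticias.json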
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = cuestionDePolemica.settings
[deploy]
#url = http://localhost:6800/
project = cuestionDePolemica
@@ -62,7 +62,6 @@ class NoticiasSpider(scrapy.Spider):
         item = response.meta['item']  # Retrieve the item passed through meta
         author_data = json.loads(response.text)
         item['author'] = author_data.get('name', 'Unknown')  # Use the author's name, or 'Unknown' if unavailable
-        print(item["title"])
         yield item  # Yield the completed item, author included