cuestiondepolemica
[
{"date": "2025-01-16T13:45:16", "title": "Gobierno de Naucalpan reafirma compromiso de dignificar a policías municipales y recuperar la paz del municipio", "text": "Redacción / Agencia Cuestión de POLÉMICA \n• El presidente municipal destacó que la Policía Municipal está en vías de coinvertirse en Guardia Municipal para transformar la manera en la que se conduce.\n• Isaac Montoya enfatiza compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\nNaucalpan de Juárez, Méx., 15 de enero de 2025.- Al reafirmar el compromiso de la dignificación de la Policía Municipal que está en vías de cambiar a Guardia Municipal, el presidente municipal, Isaac Montoya Márquez, encabezó el acto de Pase de Lista, donde destacó el compromiso mutuo para recuperar la paz en el municipio, y la confianza de las y los naucalpenses.\n\nEn el Parque Revolución y ante toda la representatividad de la Dirección General de Seguridad Ciudadana y Movilidad Segura, el alcalde anunció que, la Policía Municipal está en vías de convertirse en la Guardia Municipal, pero no se queda solo en cambio de nombre, sino se transformará la manera en que se conduce.\nMontoya Márquez señaló que, otorgará todo el respaldo y confianza a los elementos policiacos, se mejorarán sus condiciones laborales, se garantizarán las prestaciones, y la capacitación, entrenamiento y equipamiento, pero quien no cumpla y no esté al servicio de la gente, se le puede mover también.\nEnérgicamente enfatizó que, no habrá tolerancia para malas prácticas que afecten a la ciudadanía, estaremos muy vigilantes, por ello, las y los policías tiene que ser ejemplo de servicio y honestidad. Deben recuperar la confianza de la gente, en su policía y en su gobierno, subrayó.\n\nEl presidente municipal afirmó que, el Gobierno de la Transformación en Naucalpan va por la recuperación de los Tecallis que se encuentran abandonados, en beneficio de todas las comunidades, para que se conviertan en una base de reacción inmediata.\nInsistió que el compromiso con la policía municipal es dignificar como nunca antes la labor que realiza cada elemento, pero también tiene que haber compromiso mutuo para garantizar la seguridad de las y los naucalpenses.\nAl descentralizar el gobierno, puntualizó, la policía municipal abarcará mucho más territorio y podrá brindar mayor presencia en las comunidades para atender emergencias con mayor prontitud y reducir así los tiempos de respuesta.", "topic": "municipios", "url": "https://www.cuestiondepolemica.com/gobierno-de-naucalpan-reafirma-compromiso-de-dignificar-a-policias-municipales-y-recuperar-la-paz-del-municipio/", "author": "Agencia Cuestión de POLÉMICA"}
]
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class CuestiondepolemicaItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
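A minimal sketch, not part of the project, of how a populated item can be read generically through ItemAdapter, the same interface the middleware and pipeline templates below import; the field values here are made up:

from itemadapter import ItemAdapter

# Hypothetical sample values, for illustration only
example = CuestiondepolemicaItem(title="Example headline", topic="municipios")
adapter = ItemAdapter(example)
assert adapter.get("title") == "Example headline"
assert adapter.asdict() == {"title": "Example headline", "topic": "municipios"}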
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class CuestiondepolemicaSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class CuestiondepolemicaDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CuestiondepolemicaPipeline:
    def process_item(self, item, spider):
        return item
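The template pipeline above passes every item through unchanged. As a hypothetical extension (the class name is my own, not part of the project), a pipeline could enforce the same non-empty-text rule the spider applies, dropping anything that slips through; it would only take effect once registered under ITEM_PIPELINES in settings.py:

from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DropEmptyTextPipeline:
    # Hypothetical: discard items whose 'text' field is missing or blank,
    # mirroring the empty-content check already performed in the spider
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if not (adapter.get("text") or "").strip():
            raise DropItem(f"Empty text in item from {adapter.get('url')}")
        return item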
# Scrapy settings for cuestionDePolemica project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "cuestionDePolemica"
SPIDER_MODULES = ["cuestionDePolemica.spiders"]
NEWSPIDER_MODULE = "cuestionDePolemica.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "capitalEstadoDeMexico (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "cuestionDePolemica.middlewares.CuestiondepolemicaDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "cuestionDePolemica.pipelines.CuestiondepolemicaPipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
import re
import json
from cuestionDePolemica.items import CuestiondepolemicaItem

# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    if not isinstance(text, str):
        return text  # Return the original value if it is not a string
    return TAG_RE.sub('', text)


class NoticiasSpider(scrapy.Spider):
    name = "noticias"
    allowed_domains = ["www.cuestiondepolemica.com"]

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.year = year
        self.month = month.zfill(2) if month else None
        self.day = day.zfill(2) if day else None
        if self.year and self.month and self.day:
            self.start_urls = [
                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
            ]

    def parse(self, response):
        data = json.loads(response.text)
        for post in data:
            # Skip posts whose rendered content is empty
            content = post.get('content', {}).get('rendered', '').strip()
            if not content:
                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
                continue
            # The entry at index 7 of class_list carries the "category-..."
            # CSS class. Depending on the WordPress version, class_list may be
            # serialized as a JSON array or as an object with string keys.
            class_list = post.get('class_list', [])
            if isinstance(class_list, dict):
                topic = class_list.get('7')
            else:
                topic = class_list[7] if len(class_list) > 7 else None
            # Link to the author endpoint of the WordPress REST API
            author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
            # Build an item with the required fields
            item = CuestiondepolemicaItem()
            item['date'] = post.get('date')
            item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
            item['text'] = remove_tags(content)
            item['topic'] = topic.split("category-")[1] if topic and "category-" in topic else None
            item['url'] = post.get('link')
            if author_link:
                # Make a follow-up request to resolve the author's name
                yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
            else:
                yield item  # No author URL, so yield the item without an author

    def parse_author(self, response):
        item = response.meta['item']  # Retrieve the item passed through meta
        author_data = json.loads(response.text)
        item['author'] = author_data.get('name', 'Unknown')  # Use the author's name, or 'Unknown' if unavailable
        print(item["title"])
        yield item  # Yield the completed item, author included
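With the project laid out as above, a single day's crawl could be launched from the project root roughly like this (the output filename is arbitrary); the -a options supply the year, month and day arguments consumed by __init__, and -O writes the scraped items as a UTF-8 JSON feed like the sample at the top of this commit:

scrapy crawl noticias -a year=2025 -a month=1 -a day=15 -O noticias.json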
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = cuestionDePolemica.settings
[deploy]
#url = http://localhost:6800/
project = cuestionDePolemica
@@ -62,7 +62,6 @@ class NoticiasSpider(scrapy.Spider):
         item = response.meta['item']  # Retrieve the item passed through meta
         author_data = json.loads(response.text)
         item['author'] = author_data.get('name', 'Unknown')  # Use the author's name, or 'Unknown' if unavailable
-        print(item["title"])
         yield item  # Yield the completed item, author included