Reviving unoMasUno

parent 753e5c57
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):
    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Read whatever value was passed through the "filename" command line setting (-s filename=...)
        settings = crawler.settings
        filename = settings.get('filename')
        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Collect only the fields that are actually present on the item,
        # keeping a fixed column order.
        row = []
        for field in ("date", "topic", "title", "author", "location", "text", "url"):
            try:
                row.append((field, item[field]))
            except KeyError:
                pass
        line = OrderedDict(row)
        self.counter += 1
        # Only the first object is written without a leading comma, so the
        # output file stays a valid JSON array.
        if self.counter == 1:
            self.file.write(json.dumps(line))
        else:
            self.file.write(",\n" + json.dumps(line))
        return item
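As the header comment notes, this pipeline only runs if it is registered in the project settings. A minimal sketch of that registration, assuming the module path unoMasUno.pipelines and an arbitrary priority of 300; the "filename" value read in from_crawler is then supplied on the command line, as in the spider usage shown further below:

# settings.py (sketch) -- the priority 300 is an arbitrary, commonly used value
ITEM_PIPELINES = {
    'unoMasUno.pipelines.JsonWriterPipeline': 300,
}

# Example run (matches the usage documented in the spider):
#   scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22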
# -*- coding: utf-8 -*-
import scrapy, re, json
from unoMasUno.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

"""
MEDIA OUTLET:
Uno Más Uno, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Helper class for the site's time zone."""
    def utcoffset(self, dt):
        # time zone for Hidalgo (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        # time zone name
        return 'UTC-6'


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        # Spanish month names used on the site, mapped to month numbers.
        self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                            'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                            'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
        if pagination is None:
            pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
            if len(pagination) > 0:
                pagination = pagination[-1].strip('/')
                pages = int(pagination[pagination.rfind('/')+1:])
                for page in range(1, pages):
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)
        else:
            pagination = pagination.strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        # Prefer the JSON-LD metadata for the publication date; fall back to the
        # visible post date, and finally to the date requested on the command line.
        try:
            jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
            jsonInfo = json.loads(remove_tags(jsonInfo))
            dat = jsonInfo['datePublished']
        except:
            try:
                d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
                d = d.replace(',', '').split(' ')
                dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
            except:
                dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        item['date'] = dat
        item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
        item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
        for p in response.xpath('//*[@class="entry"]/p').extract():
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url
        yield item
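For reference, a minimal standalone sketch (not part of the committed code) of how the UTC helper above formats the fallback date stored in item['date']; the sample date is arbitrary:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

# Prints '2017-09-22T00:00:00-06:00', the same ISO-8601 form parse_item stores.
print(datetime(2017, 9, 22, tzinfo=UTC()).isoformat("T"))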
import scrapy

class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        return item
SPIDER_MODULES = ['unoMasUno.spiders']
NEWSPIDER_MODULE = 'unoMasUno.spiders'
FEED_EXPORT_ENCODING = "utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
#USER_AGENT = "planaMayor (+http://www.yourdomain.com)"
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
import scrapy
import json
import re
from unoMasUno.items import NoticiasItem

# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    if not isinstance(text, str):
        return text  # return the original value if it is not a string
    return TAG_RE.sub('', text)


class NoticiasSpider(scrapy.Spider):
    name = "noticias"
    allowed_domains = ["unomasuno.com.mx"]
    start_urls = ["https://unomasuno.com.mx/"]

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        super(NoticiasSpider, self).__init__(*args, **kwargs)
        self.year = year
        self.month = month.zfill(2) if month else None
        self.day = day.zfill(2) if day else None
        if self.year and self.month and self.day:
            # Query the WordPress REST API for posts published on the requested day.
            self.start_urls = [
                f"https://unomasuno.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
            ]
            print(self.start_urls[0])

    def parse(self, response):
        try:
            # Try to decode the JSON body of the response
            data = json.loads(response.text)
            self.logger.info(f"Received {len(data)} posts from API.")
        except json.JSONDecodeError as e:
            # If the JSON cannot be decoded, log the error and stop processing
            self.logger.error(f"Failed to parse JSON: {e}")
            self.logger.error(f"Response content: {response.text[:500]}...")  # log the first 500 characters of the response
            return

        for post in data:
            try:
                content = post.get('content', {}).get('rendered', '').strip()
                if content:
                    class_list = post.get('class_list', {})
                    topic = None
                    if isinstance(class_list, dict):
                        # The entry at key '7' is expected to hold the "category-..." CSS class.
                        raw = class_list.get('7', '')
                        if 'category-' in raw:
                            topic = raw.split('category-')[1]

                    # Build the item
                    item = NoticiasItem()
                    item['date'] = post.get('date')
                    item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
                    item['text'] = remove_tags(content)
                    item['topic'] = topic
                    item['url'] = post.get('link')
                    print(item['title'])
                    yield item
            except Exception as e:
                # If a post fails to process, log the error and continue with the next one
                self.logger.error(f"Error processing post {post.get('id')}: {e}")
                continue
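The spider's start URL is an ordinary WordPress REST API query, so it can be probed outside Scrapy. A minimal sketch, assuming the requests package is installed and the endpoint behaves like a standard WordPress posts API; the sample date is arbitrary:

# Sketch only: fetching the same endpoint the spider builds in __init__.
import requests

url = ("https://unomasuno.com.mx/wp-json/wp/v2/posts"
       "?after=2023-01-15T00:00:00&before=2023-01-15T23:59:59&per_page=100")
posts = requests.get(url, timeout=30).json()
for post in posts:
    # Each post carries the same fields the spider maps into NoticiasItem.
    print(post.get('date'), post.get('link'))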