Reviving unoMasUno

parent 753e5c57
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):
    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Read whatever value was passed through the "filename" command line setting (-s filename=...)
        settings = crawler.settings
        filename = settings.get('filename')
        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Collect only the fields that are actually present on the item,
        # keeping a fixed column order.
        row = []
        for field in ("date", "topic", "title", "author", "location", "text", "url"):
            try:
                row.append((field, item[field]))
            except KeyError:
                pass
        line = OrderedDict(row)
        self.counter += 1
        # Only the first object is written without a leading comma, so the
        # output file stays a valid JSON array.
        if self.counter == 1:
            self.file.write(json.dumps(line))
        else:
            self.file.write(",\n" + json.dumps(line))
        return item
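As the header comment notes, this pipeline only runs if it is registered in the project settings. A minimal sketch of that registration, assuming the module path unoMasUno.pipelines and an arbitrary priority of 300; the "filename" value read in from_crawler is then supplied on the command line, as in the spider usage shown further below:

# settings.py (sketch) -- the priority 300 is an arbitrary, commonly used value
ITEM_PIPELINES = {
    'unoMasUno.pipelines.JsonWriterPipeline': 300,
}

# Example run (matches the usage documented in the spider):
#   scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22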
# -*- coding: utf-8 -*-
import scrapy, re, json
from unoMasUno.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

"""
MEDIA OUTLET:
Uno Más Uno, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Helper class for the site's time zone."""
    def utcoffset(self, dt):
        # time zone for Hidalgo (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        # time zone name
        return 'UTC-6'


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        # Spanish month names used on the site, mapped to month numbers.
        self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                            'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                            'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
        if pagination is None:
            pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
            if len(pagination) > 0:
                pagination = pagination[-1].strip('/')
                pages = int(pagination[pagination.rfind('/')+1:])
                for page in range(1, pages):
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)
        else:
            pagination = pagination.strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        # Prefer the JSON-LD metadata for the publication date; fall back to the
        # visible post date, and finally to the date requested on the command line.
        try:
            jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
            jsonInfo = json.loads(remove_tags(jsonInfo))
            dat = jsonInfo['datePublished']
        except:
            try:
                d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
                d = d.replace(',', '').split(' ')
                dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
            except:
                dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        item['date'] = dat
        item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
        item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
        for p in response.xpath('//*[@class="entry"]/p').extract():
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url
        yield item
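For reference, a minimal standalone sketch (not part of the committed code) of how the UTC helper above formats the fallback date stored in item['date']; the sample date is arbitrary:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

# Prints '2017-09-22T00:00:00-06:00', the same ISO-8601 form parse_item stores.
print(datetime(2017, 9, 22, tzinfo=UTC()).isoformat("T"))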
import scrapy

class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    def process_item(self, item, spider):
        return item
SPIDER_MODULES = ['unoMasUno.spiders']
NEWSPIDER_MODULE = 'unoMasUno.spiders'
FEED_EXPORT_ENCODING = "utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
#USER_AGENT = "planaMayor (+http://www.yourdomain.com)"
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
import scrapy
import json
import re
from unoMasUno.items import NoticiasItem

# Regular expression used to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    if not isinstance(text, str):
        return text  # return the original value if it is not a string
    return TAG_RE.sub('', text)


class NoticiasSpider(scrapy.Spider):
    name = "noticias"
    allowed_domains = ["unomasuno.com.mx"]
    start_urls = ["https://unomasuno.com.mx/"]

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        super(NoticiasSpider, self).__init__(*args, **kwargs)
        self.year = year
        self.month = month.zfill(2) if month else None
        self.day = day.zfill(2) if day else None
        if self.year and self.month and self.day:
            # Query the WordPress REST API for posts published on the requested day.
            self.start_urls = [
                f"https://unomasuno.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
            ]
            print(self.start_urls[0])

    def parse(self, response):
        try:
            # Try to decode the JSON body of the response
            data = json.loads(response.text)
            self.logger.info(f"Received {len(data)} posts from API.")
        except json.JSONDecodeError as e:
            # If the JSON cannot be decoded, log the error and stop processing
            self.logger.error(f"Failed to parse JSON: {e}")
            self.logger.error(f"Response content: {response.text[:500]}...")  # log the first 500 characters of the response
            return

        for post in data:
            try:
                content = post.get('content', {}).get('rendered', '').strip()
                if content:
                    class_list = post.get('class_list', {})
                    topic = None
                    if isinstance(class_list, dict):
                        # The entry at key '7' is expected to hold the "category-..." CSS class.
                        raw = class_list.get('7', '')
                        if 'category-' in raw:
                            topic = raw.split('category-')[1]

                    # Build the item
                    item = NoticiasItem()
                    item['date'] = post.get('date')
                    item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
                    item['text'] = remove_tags(content)
                    item['topic'] = topic
                    item['url'] = post.get('link')
                    print(item['title'])
                    yield item
            except Exception as e:
                # If a post fails to process, log the error and continue with the next one
                self.logger.error(f"Error processing post {post.get('id')}: {e}")
                continue
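The spider's start URL is an ordinary WordPress REST API query, so it can be probed outside Scrapy. A minimal sketch, assuming the requests package is installed and the endpoint behaves like a standard WordPress posts API; the sample date is arbitrary:

# Sketch only: fetching the same endpoint the spider builds in __init__.
import requests

url = ("https://unomasuno.com.mx/wp-json/wp/v2/posts"
       "?after=2023-01-15T00:00:00&before=2023-01-15T23:59:59&per_page=100")
posts = requests.get(url, timeout=30).json()
for post in posts:
    # Each post carries the same fields the spider maps into NoticiasItem.
    print(post.get('date'), post.get('link'))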