Commit 206922ad authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 177f04d7
@@ -41,9 +41,9 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL='http://www.unomasuno.com.mx/'+year+'/'+month+'/'+day
@@ -45,8 +45,8 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None)
# day = getattr(self, 'day', None)
parse_month = {'1': 'enero', '2': 'febrero', '3': 'marzo', '4': 'abril',
'5': 'mayo', '6': 'junio', '7': 'julio', '8': 'agosto',
'9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'}
self.date = parse_month[month]+' de '+year
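For context, a minimal sketch of how the crawl arguments referenced in these hunks flow into the spider; the invocation and values below are illustrative, not taken from the commit:
# Hypothetical run: scrapy crawl noticias -a year=2017 -a month=12 -a day=14
year, month, day = '2017', '12', '14'  # what getattr(self, 'year'/'month'/'day', None) would return
parse_month = {'1': 'enero', '2': 'febrero', '3': 'marzo', '4': 'abril',
               '5': 'mayo', '6': 'junio', '7': 'julio', '8': 'agosto',
               '9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'}
baseURL = 'http://www.unomasuno.com.mx/' + year + '/' + month + '/' + day  # .../2017/12/14
date = parse_month[month] + ' de ' + year  # 'diciembre de 2017'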
[Diff omitted: file too large to display.]
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ElfinancieroItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElfinancieroSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class ElfinancieroPipeline(object):
def process_item(self, item, spider):
return item
# -*- coding: utf-8 -*-
# Scrapy settings for elFinanciero project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elFinanciero'
SPIDER_MODULES = ['elFinanciero.spiders']
NEWSPIDER_MODULE = 'elFinanciero.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elFinanciero.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'elFinanciero.pipelines.ElfinancieroPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, re
from datetime import datetime, timedelta, tzinfo
'''
scrapy crawl noticias -t json --nolog -o noticias.json
'''
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
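# e.g. remove_tags('<p>Hola <b>mundo</b></p>') returns 'Hola mundo'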
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# time zone for Hidalgo (central Mexico): UTC-6
return timedelta(hours=-6)
def tzname(self, dt):
# name of the time zone
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL = 'http://www.elfinanciero.com.mx/rss'
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//link/text()').extract()[1:]:
yield scrapy.Request(url=link, callback=self.parse_item)
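# Note: the [1:] slice above skips the first <link> (the RSS channel's own URL);
# the remaining <link> entries are assumed to be individual article URLs.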
def parse_item(self, response):
item = NoticiasItem()
text = ''
d = response.xpath('//*[@class="details-box"]/input[@id="publicado"]/@value').extract_first()
d = d.replace('/',' ').replace(':',' ').split(' ')
item['date'] = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), int(d[3]), int(d[4]), tzinfo=self.tz).isoformat('T')
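# Worked example (assumed input format): a 'publicado' value such as '14/diciembre/2017 15:42'
# splits into ['14', 'diciembre', '2017', '15', '42'] and produces the ISO date
# '2017-12-14T15:42:00-06:00'.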
item['topic'] = response.xpath('//*[@class="article-title"]/span/text()').extract_first()
item['title'] = response.xpath('//*[@class="title"]/h1/text()').extract_first()
for p in response.xpath('//*[@class="article-paragraphs"]/p').extract():
text += remove_tags(p) + '\n'
item['text'] = text
item['author'] = response.xpath('//*[@class="details-box"]/input[@id="editor"]/@value').extract_first()
item['url'] = response.url
# print item['title']
yield item
[Diffs omitted for four large files (too large to display).]
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = elFinanciero.settings
[deploy]
#url = http://localhost:6800/
project = elFinanciero
[
{"title": "Intentan asaltar Escuela de Medicina del IPN", "url": "http://www.eluniversal.com.mx/metropoli/cdmx/intentan-asaltar-escuela-de-medicina-del-ipn", "text": "", "author": "Andrea Ahedo", "topic": "CDMX", "location": "\nCiudad de M\u00e9xico ", "date": "2017-12-14T15:42:14-06:00"},
{"title": "Ahued arranca precampa\u00f1a en mercado de la Cuauht\u00e9moc", "url": "http://www.eluniversal.com.mx/elecciones-2018/ahued-arranca-precampana-en-mercado-de-la-cuauhtemoc", "text": "Esta tarde, Armando Ahued Ortega arranc\u00f3 su precampa\u00f1a a la Gobierno de la capital en el mercado San Juan Pugibet, en la delegaci\u00f3n Cuauht\u00e9moc.\nVestido con un traje azul y camisa blanca, Ahued Ortega recorri\u00f3 los locales del mercado y platic\u00f3 con los comerciantes; comi\u00f3 fruta y\u00a0 bebi\u00f3 caf\u00e9.\nEl precandidato estuvo acompa\u00f1ado de su equipo de trabajo, quienes portaron un pin en forma de bigote con la palabra \"Ahued\".\n\"Decid\u00ed arrancar aqu\u00ed en este mercado, al cual vine 11 a\u00f1os como Secretario de Salud a ver todo el tema sanitario, a verificar los pescados y mariscos\".\n\"Durante la precampa\u00f1a vamos a estar cerca de la gente escuchar a la gente, estar viviendo lo que vive la gente. Una campa\u00f1a constructiva, no denostaci\u00f3n, no agresi\u00f3n, no descalificar a nadie. Cada qui\u00e9n que proponga lo que mejor pueda y su trayectoria y su vocaci\u00f3n de servicio\".\nEn su discurso, el precandidato afirm\u00f3 que el objetivo es que la ciudadan\u00eda conozca qui\u00e9n es Armando Ahued.\nDijo, que as\u00ed como los m\u00e9dicos, los gobernantes tambi\u00e9n deben ocupar el m\u00e9todo cient\u00edfico para con la ciudadan\u00eda, que es escuchar y explorar al paciente, llegar a un diagn\u00f3stico y ofrecer un buen tratamiento.\n\"Tengo conocimiento de todas las \u00e1reas de gobierno porque la salud incide en la movilidad, en la seguridad, trabajo, el agua, contaminaci\u00f3n\".\n\"Me juntar\u00e9 con todos mis amigos perredistas y tambi\u00e9n simpatizantes para platicar, para escucharlos\".\npmba\n", "author": "Sandra Hern\u00e1ndez", "topic": "Elecciones 2018", "location": "\nCiudad de M\u00e9xico ", "date": "2017-12-14T16:06:11-06:00"},
{"title": "Revocan sanci\u00f3n de p\u00e9rdida de militancia al PRI para Humberto Moreira", "url": "http://www.eluniversal.com.mx/nacion/politica/revocan-sancion-de-perdida-de-militancia-al-pri-para-humberto-moreira", "text": "El Tribunal Electoral del Poder Judicial de la Federaci\u00f3n (TEPJF) revoc\u00f3, por violaciones al debido proceso, la sanci\u00f3n de p\u00e9rdida de militancia al PRI impuesta al ex gobernador de Coahuila y ex l\u00edder pri\u00edsta Humberto Moreira Vald\u00e9s, declarada por la Comisi\u00f3n Nacional de Justicia Partidaria de ese partido.\nLos magistrados establecieron que ese procedimiento requer\u00eda emplazar a Moreira para que pidiera argumentar en su defensa\u00a0y eso no ocurri\u00f3.\nPor ello, orden\u00f3 al PRI emplazar debidamente a Moreira, quien a consecuencia de la sentencia a\u00fan es militante pri\u00edsta.\netp\n", "author": "Carina Garc\u00eda", "topic": "Pol\u00edtica", "location": "\nCiudad de Mexico ", "date": "2017-12-14T15:47:05-06:00"},
{"title": "Mexico pressured to stop using European cheese names", "url": "http://www.eluniversal.com.mx/english/mexico-pressured-stop-using-european-cheese-names", "text": "Cheese industry in Mexico may be affected by the update to the Free Trade Agreement Mexico-European Community due to the pressures of the European Union regarding the protected designation of origin.\nFood processing corporations have stumbled upon a few bumps in the road since European countries are seeking to stop Mexico from branding domestic cheese products under names such as\u00a0Mozzarella, Gorgonzola, Camembert, and Parmesan\u00a0\u2013\u00a0which have a designated name of origin. Pursuant to this law, only products produced in those European regions can bear the name. This means food products, to keep the European name, should be imported from the respective regions.\nDuring the remainder of this week and the upcoming one, both Mexico and the European Union will list during their meetings in Brussels, Belgium, the products which are protected according to a designation of origin.\nThe most notable issue is the domestic production of cheese\u00a0because Mexico developed years ago an industry with European technology which allows us to market cheese products under labels similar to \u201cFeta-type\u201d and \"Mozarella-type\".\nOur sources have confirmed it's possible the Mexican government will only be able to negotiate the use of five European cheese names, like Manchego.\nThe farming and stockbreeding industry in Mexico is uncertain about the reach this agreement modification will have in the country, given the European Union is demanding to Mexican negotiators more market openness and protection of their products.\nYet while Europe is pushing to secure 150 products with protected designated origins, Mexico has less than 20, including tequila, mezcal, and Talavera\nam\n", "author": "Ivette Salda\u00f1a", "topic": "English", "location": "\nWashington ", "date": "2017-12-14T15:58:21-06:00"},
{"title": "\"Star Wars: The Last Jedi\" monopoliza los estrenos en cine", "url": "http://www.eluniversal.com.mx/espectaculos/cine/star-wars-last-jedi-monopoliza-los-estrenos-en-cine", "text": "El esperado estreno de \"Star Wars: The Last Jedi\" monopoliza las novedades de la cartelera en Estados Unidos esta semana, en la que tambi\u00e9n llegar\u00e1 a los cines la cinta animada \"Ferdinand\".\nLa octava entrega de la famosa saga gal\u00e1ctica ideada por George Lucas se presenta con la direcci\u00f3n de Rian Johnson y con un reparto encabezado por Daisy Ridley, John Boyega, Mark Hamill y Carrie Fisher.\nLa historia de los Skywalker contin\u00faa con los nuevos personajes Rey y Finn uniendo fuerzas con Luke y Leia en una nueva aventura que desvelar\u00e1 antiguos misterios y desenterrar\u00e1 revelaciones del pasado.\nPocas cintas se atreven esta semana a hacer sombra a \"Star Wars: The Last Jedi\", pero la pel\u00edcula de animaci\u00f3n \"Ferdinand\" tratar\u00e1 de atraer al p\u00fablico infantil con un reparto que en su versi\u00f3n original incluye las voces de John Cena, Kate McKinnon, Bobby Cannavale, Gina Rodr\u00edguez y Juanes.\nEste largometraje del cineasta brasile\u00f1o Carlos Saldanha gira en torno a un toro manso y sensible, que disfruta oliendo flores y cuidando de los suyos, pero al que toman por un animal bravo y env\u00edan a las corridas de toros.\nEn los estrenos limitados aparecen la comedia independiente \"Permanent\" y el thriller de ciencia-ficci\u00f3n \"Beyond Skyline\".\nRainn Wilson y Patricia Arquette encabezan el elenco de \"Permanent\", una pel\u00edcula sobre los enredos familiares de una familia de los a\u00f1os 80 con una hija adolescente.\nPor su parte, Frank Grillo protagoniza \"Beyond Skyline\", una cinta en la que un padre debe hacer frente a una invasi\u00f3n alien\u00edgena si quiere recuperar a su hijo.\u00a0\nrad\u00a0\n", "author": "EFE", "topic": "Cine", "location": "\nLos \u00c1ngeles ", "date": "2017-12-14T16:03:09-06:00"},
{"title": "Liam Gallagher pone su voz contra el cambio clim\u00e1tico", "url": "http://www.eluniversal.com.mx/espectaculos/musica/liam-gallagher-pone-su-voz-contra-el-cambio-climatico", "text": "El exvocalista de Oasis, Liam Gallagher, puso su voz a un video de navide\u00f1o producido por un grupo activista que busca llamar la atenci\u00f3n sobre el cambio clim\u00e1tico.\nEl video \"The Very Hot Snowman\" muestra un mu\u00f1eco de nieve de dibujos animados que se derrite bajo el calor del sol, mientras que en la narraci\u00f3n de Gallagher se advierte que \"nuestro planeta est\u00e1 demasiado caliente\" e insta a las personas a involucrarse en una campa\u00f1a para crear conciencia sobre el cambio clim\u00e1tico.\nEl video fue producido por el director y fot\u00f3grafo Rankin, en nombre de The Climate Coalition, un grupo que agrupa a 130 organizaciones brit\u00e1nicas, incluidas National Trust y Oxfam, que abogan por medidas para proteger el ambiente.\n\nGallagher consigui\u00f3 varios \u00e9xitos con la banda Oasis a finales de la d\u00e9cada de 1990 y principios de la de 2000, incluidos \"Live Forever\" y \"Wonderwall\", antes de la separaci\u00f3n de la banda en 2009.\nSu primer \u00e1lbum como solista, \"As You Were\", se lanz\u00f3 en octubre de 2017 y encabez\u00f3 las listas de Reino Unido.\u00a0\nnrv\n", "author": "Reuters", "topic": "M\u00fasica", "location": "\nLondres ", "date": "2017-12-14T15:44:17-06:00"},
{"title": "Corte de EU da 18 a\u00f1os de prisi\u00f3n a sobrinos de la esposa de Nicol\u00e1s Maduro", "url": "http://www.eluniversal.com.mx/mundo/corte-de-eu-da-18-anos-de-prision-sobrinos-de-la-esposa-de-nicolas-maduro", "text": "Un juez estadounidense sentenci\u00f3 el jueves a dos sobrinos de la primera dama de Venezuela a 18 a\u00f1os de prisi\u00f3n cada uno tras una condena por narcotr\u00e1fico.\nEl juez de distrito Paul Crotty sentenci\u00f3 a Franqui Francisco Flores de Freitas y a Efra\u00edn Antonio Campo Flores, sobrinos de Cilia Flores, la esposa del mandatario venezolano Nicol\u00e1s Maduro.\nLos hombres fueron arrestados en Hait\u00ed en noviembre del 2015 y fueron llevados a Estados Unidos despu\u00e9s de una operaci\u00f3n encubierta de la Administraci\u00f3n para el Control de Drogas (DEA, por su sigla en ingl\u00e9s).\nLos fiscales dijeron que ambos conspiraron para usar un hangar presidencial de un aeropuerto venezolano para enviar 800 kilogramos de coca\u00edna a Honduras, los que posteriormente viajar\u00edan a Estados Unidos.\nae\n", "author": "Reuters", "topic": "Mundo", "location": null, "date": "2017-12-14T15:59:47-06:00"},
{"title": "Marco Fabi\u00e1n regresa a los entrenamientos", "url": "http://www.eluniversal.com.mx/universal-deportes/futbol/marco-fabian-regresa-los-entrenamientos", "text": "Por primera ocasi\u00f3n desde que se lesion\u00f3 la espalda, el mediocampista mexicano Marco Fabi\u00e1n de la Mora cumpli\u00f3 con parte de la sesi\u00f3n al parejo del equipo en el Eintracht Frankfurt, lo cual fue una buena noticia para el club.Luego de padecer ese inconveniente en un disco de la espalda a mediados de a\u00f1o, el jalisciense pr\u00e1cticamente qued\u00f3 descartado para el comienzo de la temporada 2017-2018 con las \u201c\u00c1guilas\u201d y se prev\u00e9 que a inicios de 2018 pueda tener sus primeros minutos de juego.\nDespu\u00e9s de meses de reposo, a inicios de este mes, el ex de Chivas de Guadalajara sostuvo entrenamientos por separado del plantel enfocados en una mejor rehabilitaci\u00f3n y trabajo f\u00edsico.\nPero este jueves, Fabi\u00e1n de la Mora hizo parte de la sesi\u00f3n con el resto de sus compa\u00f1eros y bajo la supervisi\u00f3n del t\u00e9cnico croata Niko Kovac, as\u00ed lo inform\u00f3 el club Eintracht Frankfurt.\nEl mexicano lleg\u00f3 al conjunto alem\u00e1n a inicios de 2016 y fue apenas que la campa\u00f1a 2016-2017 la pudo jugar de manera m\u00e1s completa y en la actual ya se perdi\u00f3 la primera mitad.\n", "author": "Notimex", "topic": "Futbol", "location": null, "date": "2017-12-14T16:10:09-06:00"},
{"title": "Emiten alerta amarilla en 5 delegaciones por bajas temperaturas", "url": "http://www.eluniversal.com.mx/metropoli/cdmx/emiten-alerta-amarilla-en-5-delegaciones-por-bajas-temperaturas", "text": "La Secretar\u00eda de Protecci\u00f3n Civil capitalina emiti\u00f3 la alerta amarilla por bajas temperaturas en cinco delegaciones.\n\u00a0\nA trav\u00e9s de la cuenta de Twitter @SPCCDMX se inform\u00f3 que en \u00c1lvaro Obreg\u00f3n, Cuajimalpa, Magdalena Contreras, Milpa Alta y Tlalpan se esperan entre 4 y 8 grados cent\u00edgrados entre las 5 y 8 horas de ma\u00f1ana.\n\u00a0\nLa dependencia recomend\u00f3 a la poblaci\u00f3n utilizar al menos tres capas de ropa de algod\u00f3n o lana, cubrir nariz y boca, as\u00ed como evitar los cambios bruscos de temperatura.\n\u00a0\nEstas condiciones son provocadas por el frente fr\u00edo 15 que se localiza en sobre la pen\u00ednsula de Yucat\u00e1n junto con una masa de aire fr\u00edo que lo impulsa en direcci\u00f3n al oriente.\npmba\n", "author": "Redacci\u00f3n", "topic": "CDMX", "location": "\nCiudad de M\u00e9xico ", "date": "2017-12-14T15:58:53-06:00"}
]
\ No newline at end of file
[Diff omitted: file too large to display.]
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class EluniversalItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class EluniversalSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class EluniversalPipeline(object):
def process_item(self, item, spider):
return item
# -*- coding: utf-8 -*-
# Scrapy settings for elUniversal project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elUniversal'
SPIDER_MODULES = ['elUniversal.spiders']
NEWSPIDER_MODULE = 'elUniversal.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elUniversal (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elUniversal.middlewares.EluniversalSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elUniversal.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'elUniversal.pipelines.EluniversalPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, re
'''
scrapy crawl noticias -t json --nolog -o noticias.json
'''
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.baseURL = 'http://www.eluniversal.com.mx/rss.xml'
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//link/text()').extract()[1:]:
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = response.xpath('//*[@class="breadcrumb"]/span/a/text()').extract()[-2]
item['title'] = response.xpath('//*[@class="pane-content"]/h1/text()').extract_first()
for p in response.xpath('//*[@class="pane-content"]/div/p').extract():
text += remove_tags(p) + '\n'
item['text'] = text
item['location'] = response.xpath('//*[@class="field field-name-field-lugar field-type-text field-label-hidden"]/text()').extract_first()
item['author'] = response.xpath('//*[@class="field-items"]/div/text()').extract_first()
item['url'] = response.url
# print item['title']
yield item
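# Running the crawl command from the docstring above (scrapy crawl noticias -t json -o noticias.json)
# serializes each yielded item into JSON records like the noticias.json sample included earlier in this commit.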
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = elUniversal.settings
[deploy]
#url = http://localhost:6800/
project = elUniversal
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ExcelsiorItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ExcelsiorSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class ExcelsiorPipeline(object):
def process_item(self, item, spider):
return item
# -*- coding: utf-8 -*-
# Scrapy settings for excelsior project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'excelsior'
SPIDER_MODULES = ['excelsior.spiders']
NEWSPIDER_MODULE = 'excelsior.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'excelsior (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'excelsior.middlewares.ExcelsiorSpiderMiddleware': 543,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'excelsior.middlewares.MyCustomDownloaderMiddleware': 543,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'excelsior.pipelines.ExcelsiorPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, re
from datetime import datetime, date, timedelta, tzinfo
from scrapy_splash import SplashRequest
"""
This site is scraped with 'scrapy-splash' because its content is loaded via JavaScript.
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# time zone for central Mexico: UTC-6
return timedelta(hours=-6)
def tzname(self, dt):
# name of the time zone
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
parse_month = {'1': 'enero', '2': 'febrero', '3': 'marzo', '4': 'abril',
'5': 'mayo', '6': 'junio', '7': 'julio', '8': 'agosto',
'9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'}
self.baseURL = 'http://www.excelsior.com.mx/rss.xml'
# yield scrapy.Request(url=self.baseURL, callback=self.parse)
yield SplashRequest(url=self.baseURL, callback=self.parse, endpoint='render.html', args={'wait': 0.5})
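# The 'render.html' endpoint returns the page HTML after Splash has executed its
# JavaScript; args={'wait': 0.5} gives scripts about half a second to load content
# before the rendered markup is handed to parse().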
# def parse(self, response):
# for post in response.css('div.catpor-box'):
# post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first()
# post_date = post_date[post_date.find('d') + 3:]
#
# if post_date == self.date:
# link = post.xpath('./div/div/a/@href').extract_first()
# yield scrapy.Request(url=link, callback=self.parse_2)
def parse(self, response):
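# Debug stub (the real item extraction below remains commented out): dump the
# Splash-rendered response so its structure can be inspected.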
print 'hey'
print response.url
print response.body
print response.xpath('//link').extract()[1:]
# for link in response.xpath('//*[@class="post-container clearfix"]/h2/a/@href').extract():
# # yield scrapy.Request(url=link, callback=self.parse_item)
# yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={'wait': 0.5})
# def parse_item(self, response):
# item = NoticiasItem()
# text = ''
#
# d = response.xpath('//div[@id="primary"]/div/div/div/div/span[@class="published"]/text()').extract_first()
# d, t = d.split(' ')
# d = map(int, d.split('-'))
# t = map(int, t.split(':'))
# d = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
# item['date'] = d
#
# item['title'] = response.xpath('//div[@id="primary"]/div/h1/text()').extract_first()
# item['topic'] = response.xpath('//span[@class="entry-categories"]/text()').extract_first()
#
# for paragraph in response.xpath('//div[@id="primary"]/div/div/div/div[@class="entry-content"]/div/p').extract():
# text += remove_tags(paragraph) + '\n'
# item['text'] = text
# item['url'] = response.url
#
# # print item['title']
# yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = excelsior.settings
[deploy]
#url = http://localhost:6800/
project = excelsior