Commit f1dfa7e9 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 720f6059
...@@ -211,6 +211,16 @@ Se incluyen los siguientes medios: ...@@ -211,6 +211,16 @@ Se incluyen los siguientes medios:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
``` ```
No se encontró notas antes del 2011.11.28. No se encontró notas antes del 2011.11.28.
* [Proceso](http://www.proceso.com.mx/)
Uso:
```bash
cd proceso
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3
```
No se encontraron notas antes de Nov. 1976.
* [Punto Medio](https://www.puntomedio.mx/) * [Punto Medio](https://www.puntomedio.mx/)
......
[ [
{"nombre": "Al Chile", "crawler": "sitios_yucatan/alChile", "desde": "01-06-2014", "url": "http://alchile.com.mx/"}, {"nombre": "Al Chile", "crawler": "descarga_por_fecha/alChile", "desde": "01-06-2014", "url": "http://alchile.com.mx/"},
{"nombre": "Desde el Balcón", "crawler": "sitios_yucatan/desdeElBalcon", "desde": "11-01-2014", "url": "http://www.desdeelbalcon.com/"}, {"nombre": "Desde el Balcón", "crawler": "descarga_por_fecha/desdeElBalcon", "desde": "11-01-2014", "url": "http://www.desdeelbalcon.com/"},
{"nombre": "Diario de Yucatán", "crawler": "sitios_yucatan/diarioYucatan", "desde": "02-04-2012", "url": "http://yucatan.com.mx/"}, {"nombre": "Diario del Yaqui", "crawler": "descarga_por_fecha/diarioYaqui", "desde": "30-12-2016", "url": "http://diariodelyaqui.mx"},
{"nombre": "El Grillo", "crawler": "sitios_yucatan/grilloPorteno", "desde": "04-11-2013", "url": "http://grilloporteno.com/"}, {"nombre": "Diario de Yucatán", "crawler": "descarga_por_fecha/diarioYucatan", "desde": "02-04-2012", "url": "http://yucatan.com.mx/"},
{"nombre": "La Jornada Maya", "crawler": "sitios_yucatan/laJornadaMaya", "desde": "12-03-2015", "url": "https://www.lajornadamaya.mx/"}, {"nombre": "El Grillo", "crawler": "descarga_por_fecha/grilloPorteno", "desde": "04-11-2013", "url": "http://grilloporteno.com/"},
{"nombre": "La Verdad Yucatán", "crawler": "sitios_yucatan/laVerdadYuc", "desde": "01-12-2015", "url": "http://laverdadnoticias.com/"}, {"nombre": "La Jornada", "crawler": "descarga_por_fecha/laJornada", "desde": "01-02-2005", "url": "http://www.jornada.unam.mx"},
{"nombre": "Lector MX", "crawler": "sitios_yucatan/lectorMX", "desde": "23-10-2015", "url": "http://lectormx.com/"}, {"nombre": "La Jornada Aguascalientes", "crawler": "descarga_por_fecha/laJornadaAgs", "desde": "01-12-2008", "url": "http://www.lja.mx/"},
{"nombre": "Mi Punto de Vista", "crawler": "sitios_yucatan/miPuntoDeVista", "desde": "04-10-2008", "url": "http://www.mipuntodevista.com.mx/"}, {"nombre": "La Jornada Baja California", "crawler": "descarga_por_fecha/laJornadaBC", "desde": "09-02-2015", "url": "http://jornadabc.mx"},
{"nombre": "Notirivas", "crawler": "sitios_yucatan/notirivas", "desde": "22-11-2016", "url": "http://gruporivas.com.mx/notirivas/"}, {"nombre": "La Jornada Guerrero", "crawler": "descarga_por_fecha/laJornadaGro", "desde": "22-01-2007", "url": "http://www.lajornadaguerrero.com.mx"},
{"nombre": "Notisureste", "crawler": "sitios_yucatan/notisureste", "desde": "28-11-2011", "url": "http://www.notisureste.com/"}, {"nombre": "La Jornada Maya", "crawler": "descarga_por_fecha/laJornadaMaya", "desde": "12-03-2015", "url": "https://www.lajornadamaya.mx/"},
{"nombre": "Punto Medio", "crawler": "sitios_yucatan/puntoMedio", "desde": "02-08-2015", "url": "https://www.puntomedio.mx/"}, {"nombre": "La Jornada de Oriente", "crawler": "descarga_por_fecha/laJornadaOte", "desde": "01-06-2013", "url": "http://www.lajornadadeoriente.com.mx/"},
{"nombre": "Sona 89.3", "crawler": "sitios_yucatan/sona893", "desde": "09-04-2012", "url": "http://sona893.fm/"}, {"nombre": "La Jornada San Luis", "crawler": "descarga_por_fecha/laJornadaSanLuis", "desde": "08-10-2014", "url": "http://lajornadasanluis.com.mx"},
{"nombre": "Yucatán a la Mano", "crawler": "sitios_yucatan/yucatanALaMano", "desde": "11-05-2015", "url": "http://www.yucatanalamano.com/"}, {"nombre": "La Jornada Veracruz", "crawler": "descarga_por_fecha/laJornadaVer", "desde": "11-05-2009", "url": "http://www.jornadaveracruz.com.mx"},
{"nombre": "Yucatán al Minuto", "crawler": "sitios_yucatan/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com/"}, {"nombre": "La Jornada Zacatecas", "crawler": "descarga_por_fecha/laJornadaZac", "desde": "10-06-2013", "url": "http://ljz.mx"},
{"nombre": "Yucatán en Corto", "crawler": "sitios_yucatan/yucatanEnCorto", "desde": "02-04-2011", "url": "http://florcastillo.mx/noticias/"}, {"nombre": "La Verdad Yucatán", "crawler": "descarga_por_fecha/laVerdadYuc", "desde": "01-12-2015", "url": "http://laverdadnoticias.com/"},
{"nombre": "Diario del Yaqui", "crawler": "otros_sitios/diarioYaqui", "desde": "30-12-2016", "url": "http://diariodelyaqui.mx"}, {"nombre": "Lector MX", "crawler": "descarga_por_fecha/lectorMX", "desde": "23-10-2015", "url": "http://lectormx.com/"},
{"nombre": "La Jornada", "crawler": "otros_sitios/laJornada", "desde": "01-02-2005", "url": "http://www.jornada.unam.mx"}, {"nombre": "Mi Punto de Vista", "crawler": "descarga_por_fecha/miPuntoDeVista", "desde": "04-10-2008", "url": "http://www.mipuntodevista.com.mx/"},
{"nombre": "La Jornada Aguascalientes", "crawler": "otros_sitios/laJornadaAgs", "desde": "01-12-2008", "url": "http://www.lja.mx/"}, {"nombre": "Notirivas", "crawler": "descarga_por_fecha/notirivas", "desde": "22-11-2016", "url": "http://gruporivas.com.mx/notirivas/"},
{"nombre": "La Jornada Baja California", "crawler": "otros_sitios/laJornadaBC", "desde": "09-02-2015", "url": "http://jornadabc.mx"}, {"nombre": "Notisureste", "crawler": "descarga_por_fecha/notisureste", "desde": "28-11-2011", "url": "http://www.notisureste.com/"},
{"nombre": "La Jornada Guerrero", "crawler": "otros_sitios/laJornadaGro", "desde": "22-01-2007", "url": "http://www.lajornadaguerrero.com.mx"}, {"nombre": "Proceso", "crawler": "descarga_por_mes/proceso", "desde": "11-1976", "url": "http://www.proceso.com.mx/"},
{"nombre": "La Jornada de Oriente", "crawler": "otros_sitios/laJornadaOte", "desde": "01-06-2013", "url": "http://www.lajornadadeoriente.com.mx/"}, {"nombre": "Punto Medio", "crawler": "descarga_por_fecha/puntoMedio", "desde": "02-08-2015", "url": "https://www.puntomedio.mx/"},
{"nombre": "La Jornada San Luis", "crawler": "otros_sitios/laJornadaSanLuis", "desde": "08-10-2014", "url": "http://lajornadasanluis.com.mx"}, {"nombre": "Sona 89.3", "crawler": "descarga_por_fecha/sona893", "desde": "09-04-2012", "url": "http://sona893.fm/"},
{"nombre": "La Jornada Veracruz", "crawler": "otros_sitios/laJornadaVer", "desde": "11-05-2009", "url": "http://www.jornadaveracruz.com.mx"}, {"nombre": "Yucatán a la Mano", "crawler": "descarga_por_fecha/yucatanALaMano", "desde": "11-05-2015", "url": "http://www.yucatanalamano.com/"},
{"nombre": "La Jornada Zacatecas", "crawler": "otros_sitios/laJornadaZac", "desde": "10-06-2013", "url": "http://ljz.mx"} {"nombre": "Yucatán al Minuto", "crawler": "descarga_por_fecha/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com/"},
{"nombre": "Yucatán en Corto", "crawler": "descarga_por_fecha/yucatanEnCorto", "desde": "02-04-2011", "url": "http://florcastillo.mx/noticias/"}
] ]
\ No newline at end of file
...@@ -69,8 +69,4 @@ with open(sys.argv[1]) as data_file: ...@@ -69,8 +69,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..") os.chdir("..")
print today.year print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio # scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
\ No newline at end of file
...@@ -8,17 +8,17 @@ import re ...@@ -8,17 +8,17 @@ import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
...@@ -49,10 +49,10 @@ class QuotesSpider(scrapy.Spider): ...@@ -49,10 +49,10 @@ class QuotesSpider(scrapy.Spider):
section = response.url[response.url.rfind('/')+1:] section = response.url[response.url.rfind('/')+1:]
if ( section == 'opinion' ): # la seccion 'opinion' tiene una estructura diferente a las otras if ( section == 'opinion' ): # la seccion 'opinion' tiene una estructura diferente a las otras
path_list = ['//*[@id="columnas"]/p/a/@href', path_list = ['//*[@id="columnas"]/p/a/@href',
'//*[@id="opinion"]/p/a/@href'] '//*[@id="opinion"]/p/a/@href']
else: else:
path_list = ['//*[@id="article_list"]/h2/a/@href', path_list = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href'] '//*[@id="article_list"]/h3/a/@href']
for path in path_list: for path in path_list:
for link in response.xpath(path).extract(): for link in response.xpath(path).extract():
......
This source diff could not be displayed because it is too large. You can view the blob instead.
import scrapy import scrapy
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re import re
from scrapy_splash import SplashRequest from scrapy_splash import SplashRequest
""" """
Para este sitio se hace uso de scrapy-splash porque el contenido es cargago a traves de javascript Para este sitio se hace uso de 'scrapy-splash' porque el contenido es cargado a traves de javascript
USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
...@@ -29,19 +29,26 @@ class QuotesSpider(scrapy.Spider): ...@@ -29,19 +29,26 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
# month = getattr(self, 'month', None) month = getattr(self, 'month', None)
# day = getattr(self, 'day', None) # day = getattr(self, 'day', None)
self.baseURL='http://hemeroteca.proceso.com.mx/?page_id=111058&edicion=mexico&page=' parse_month = {'1': 'Enero', '2': 'Febrero', '3': 'Marzo', '4': 'Abril',
this_year = 2017 '5': 'Mayo', '6': 'Junio', '7': 'Julio', '8': 'Agosto',
'9': 'Septiembre', '10': 'Octubre', '11': 'Noviembre', '12': 'Diciembre'}
while this_year >= int(year): self.date = parse_month[month]+' de '+year
yield scrapy.Request(url=self.baseURL+str(year), callback=self.parse)
this_year -= 1 self.baseURL='http://hemeroteca.proceso.com.mx/?page_id=111058&edicion=mexico&page='
yield scrapy.Request(url=self.baseURL+self.year, callback=self.parse)
def parse(self, response): def parse(self, response):
for link in response.xpath('//*[@class="catpor-post-thumb"]/a/@href').extract(): for post in response.css('div.catpor-box'):
yield scrapy.Request(url=link, callback=self.parse_2) post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first()
post_date = post_date[post_date.find('d')+3:]
if post_date == self.date:
link = post.xpath('./div/div/a/@href').extract_first()
yield scrapy.Request(url=link, callback=self.parse_2)
def parse_2(self, response): def parse_2(self, response):
......
...@@ -36,8 +36,7 @@ Instalacion SPLASH ...@@ -36,8 +36,7 @@ Instalacion SPLASH
> Inicializar el contenedor: > Inicializar el contenedor:
$ sudo docker run -p 8050:8050 scrapinghub/splash ## con esto splash esta disponible en puerto 8050 (http) $ sudo docker run -p 8050:8050 scrapinghub/splash ## con esto splash esta disponible en puerto 8050 (http) en navegador (localhost:8050)
##verificar en navegador (localhost:8050)
Consulta: http://splash.readthedocs.io/en/latest/install.html Consulta: http://splash.readthedocs.io/en/latest/install.html
......
#!/bin/bash
# Batch-run the "noticias" spider for every day from 2010 through 2016,
# writing one JSON feed file per day inside a per-year directory.
for y in `seq 2010 2016`;
do
	# Create the year directory on first use.
	if [ ! -d $y ]; then
		mkdir -p $y;
	fi
	cd $y
	# Months and days are zero-padded to two digits. Day runs 1..31 for every
	# month; invalid dates presumably just yield an empty crawl — TODO confirm.
	for m in $(seq -f "%02g" 1 12)
	do
		for d in $(seq -f "%02g" 1 31)
		do
			# "-o -" sends the JSON feed to stdout; the shell redirect (which the
			# shell strips from the argument list wherever it appears) captures it
			# into laJornada_$y-$m-$d.json.
			scrapy crawl noticias -t json -o - > laJornada_$y-$m-$d.json -a year=$y -a month=$m -a day=$d
			# jsonlint-py -f laJornada_$y-$m-$d.json > laJornada_$y-$m-$d.json
		done
	done
	cd ..
done
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LajornadaItem(scrapy.Item):
    """Empty item scaffold generated by `scrapy startproject`.

    Declares no fields; the spider appears to use its own NoticiasItem
    instead, so this class is presumably unused — TODO confirm.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class LajornadaPipeline(object):
    """Default pass-through pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # No processing — return the item so the next pipeline stage sees it.
        return item
# -*- coding: utf-8 -*-

# Scrapy settings for lajornada project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lajornada'

SPIDER_MODULES = ['lajornada.spiders']
NEWSPIDER_MODULE = 'lajornada.spiders'

# NOTE: every setting below this point is commented out, i.e. left at
# Scrapy's built-in default; only the three identifiers above are active.

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lajornada (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
#COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lajornada.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lajornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lajornada.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
"""@package laJornada_scrapy
Crawlwer para la jornada.unam.mx
uso:
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 month=12 day=24
"""
import scrapy
import re
# A single markup tag, e.g. "<p>" or "</div>".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip every '<...>' markup tag from *text*, returning the plain text."""
    return re.sub(TAG_RE, '', text)
class NoticiasItem(scrapy.Item):
    """Container for one news article scraped from jornada.unam.mx."""
    title = scrapy.Field()     # headline text
    text = scrapy.Field()      # article body, HTML tags stripped
    date = scrapy.Field()      # declared but not filled by this spider
    location = scrapy.Field()  # dateline / city line
    author = scrapy.Field()    # byline
    topic = scrapy.Field()     # section name, taken from the title image's title attr
    url = scrapy.Field()       # URL the article was fetched from
class NoticiasSpider(scrapy.Spider):
    """Spider for jornada.unam.mx.

    Downloads every article of one day's edition; the date is supplied on
    the command line with ``-a year=YYYY -a month=MM -a day=DD``.
    """
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        # Base URL of that day's edition; every section hangs off it.
        self.baseURL = 'http://www.jornada.unam.mx/' + year + '/' + month + '/' + day + '/'
        sections = [
            "opinion",
            "politica",
            "economia",
            "mundo",
            "estados",
            "capital",
            "sociedad",
            "ciencias",
            "cultura",
            "espectaculos",
            "deporte",
        ]
        for section in sections:
            yield scrapy.Request(url=self.baseURL + section, callback=self.parse)

    def parse(self, response):
        """Main parser: follow every headline link on a section front page."""
        # (Fixed: removed an unused `item = NoticiasItem()` local here.)
        for noticia in response.css('a.cabeza'):
            url = self.baseURL + noticia.css('::attr(href)').extract_first()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Parser for each article page: extract the item fields."""
        item = NoticiasItem()
        item['title'] = response.css('div.cabeza::text').extract_first()
        item['url'] = response.url
        item['author'] = response.css('div.credito-autor::text').extract_first()
        item['location'] = response.css('p.s-s::text').extract_first()
        body = response.css('div.text').extract_first()
        # extract_first() returns None when nothing matches; remove_tags(None)
        # would raise TypeError, so guard and store None like the other fields.
        item['text'] = remove_tags(body) if body is not None else None
        item['topic'] = response.css('img.title::attr(title)').extract_first()
        yield item
\ No newline at end of file
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = lajornada.settings
[deploy]
#url = http://localhost:6800/
project = lajornada
This source diff could not be displayed because it is too large. You can view the blob instead.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Diarioyucatan2Item(scrapy.Item):
    """Empty item scaffold generated by `scrapy startproject`.

    Declares no fields; the spider appears to use its own NoticiasItem
    instead, so this class is presumably unused — TODO confirm.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class Diarioyucatan2SpiderMiddleware(object):
    """Spider-middleware scaffold generated by `scrapy startproject`.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the spider middleware does not modify the
    passed objects.

    BUG FIX: the four process_* hooks were missing the ``self`` parameter,
    so Scrapy's bound-method calls would have raised TypeError (or silently
    bound the response into the ``self`` slot).
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class Diarioyucatan2Pipeline(object):
    """Default pass-through pipeline: forwards every item unchanged."""

    def process_item(self, item, spider):
        # No processing — return the item so the next pipeline stage sees it.
        return item
# -*- coding: utf-8 -*-

# Scrapy settings for diarioYucatan2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'diarioYucatan2'

SPIDER_MODULES = ['diarioYucatan2.spiders']
NEWSPIDER_MODULE = 'diarioYucatan2.spiders'

# NOTE: besides the identifiers above, the only non-default settings in this
# file are ROBOTSTXT_OBEY, DOWNLOAD_DELAY and COOKIES_ENABLED below; every
# other setting is commented out, i.e. left at Scrapy's built-in default.

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan2 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 2-second pause between requests to the same site (polite crawling).
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'diarioYucatan2.middlewares.Diarioyucatan2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'diarioYucatan2.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'diarioYucatan2.pipelines.Diarioyucatan2Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import scrapy
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
import re
from datetime import datetime, date, timedelta
from scrapy.spidermiddlewares.httperror import HttpError
# A single markup tag, e.g. "<p>" or "</div>".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every '<...>' markup tag removed."""
    return re.sub(TAG_RE, '', text)
class NoticiasItem(scrapy.Item):
    """Container for one news article scraped from yucatan.com.mx."""
    title = scrapy.Field()     # headline text
    text = scrapy.Field()      # article body paragraphs, HTML tags stripped
    date = scrapy.Field()      # ISO datetime from the page's entry-date attribute
    location = scrapy.Field()  # declared but not filled by this spider
    author = scrapy.Field()    # declared but not filled by this spider
    topic = scrapy.Field()     # single-element list taken from the breadcrumbs
    url = scrapy.Field()       # URL the article was fetched from
class QuotesSpider(scrapy.Spider):
    """Spider for yucatan.com.mx.

    Walks the pagination of each section and yields every article published
    on or after the date given with ``-a year=YYYY -a month=MM -a day=DD``.
    """
    name = "noticias"

    def start_requests(self):
        section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes',
                        'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud']
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://yucatan.com.mx/seccion/'
        # Oldest publication date to accept; older listing entries set self.stop.
        self.date = date(int(year), int(month), int(day))
        # Spanish month name -> month number, used when parsing listing dates.
        self.parsing_month = { 'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
                               'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12 }
        # Flag checked by parse_pagination to abandon remaining pages.
        self.stop = False
        # (Fixed: removed an unused `urls` list that was built but never read.)
        for s in section_list:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

    def parse(self, response):
        """Follow each 'show more' link when present, otherwise paginate the
        section page itself."""
        more_links = response.xpath('//a[@class="show-more-link"]/@href').extract()
        if more_links:
            for link in more_links:
                yield scrapy.Request(url=link, callback=self.parse_pagination)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)

    def errback_http(self, failure):
        """Log an HTTP error and stop further pagination.

        NOTE(review): this errback is never registered on any Request (no
        ``errback=`` argument is passed anywhere), so it currently never
        fires — confirm whether requests should pass errback=self.errback_http.
        """
        if failure.check(HttpError):
            response = failure.value.response
            self.logger.error('HttpError on %s', response.url)
            self.stop = True

    def parse_pagination(self, response):
        """Request every page of a paginated section listing."""
        pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
        if len(pagination) > 0:
            # The last pagination link ends in the total number of pages.
            pagination = pagination[-1]
            pages = int(pagination[pagination.rfind('/')+1:])
            p = 1
            while p <= pages:
                if self.stop:
                    # Abort: jump past the loop bound.
                    p = pages + 1
                else:
                    if p == 1:
                        # Page 1 is the listing URL itself.
                        yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
                    elif p > 1:
                        # BUG FIX: was str(p+1), which skipped page 2 and
                        # requested a nonexistent page pages+1.
                        yield scrapy.Request(url=response.url+'/page/'+str(p), callback=self.parse_link)
                    p += 1
        else:
            # Single-page listing.
            yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)

    def parse_link(self, response):
        """Parse each listing entry's date; follow it if new enough, or set
        the stop flag once entries get older than the requested date."""
        for entry in response.xpath('//*[@class="bp-entry"]'):
            entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
            # Slice out "<day> <month-name> <year>" from a string that appears
            # to look like "<weekday>, <day> <month> <year> - <time>" —
            # TODO confirm the exact format against the live site.
            entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]
            news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:entry_date.find(' ')]))
            link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
            if news_date >= self.date and link is not None:
                yield scrapy.Request(url=link, callback=self.parse_item)
            elif news_date < self.date:
                self.stop = True

    def parse_item(self, response):
        """Build a NoticiasItem from one article page."""
        text = ''
        item = NoticiasItem()
        item['title'] = response.css('h1.entry-title::text').extract_first()
        item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
        for paragraph in response.css('div.entry-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
        item['url'] = response.url
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = diarioYucatan2.settings
[deploy]
#url = http://localhost:6800/
project = diarioYucatan2
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment