Commit 4bedded4 authored by Mario Chirinos Colunga

upload files

Crawlers for national online news media, based on [Scrapy](http://scrapy.org/).
The following outlets are included:
* [La Jornada](http://www.lajornada.unam.mx)
Usage:
```bash
cd laJornada
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
```
* [Diario de Yucatán](http://www.yucatan.com)
Usage:
```bash
cd diarioYucatan
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
```
* [Yucatán al Minuto](http://www.yucatanalminuto.com/)
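Each spider writes its results as a single JSON array. A minimal sketch of inspecting that output in Python, assuming the crawl above was run with `-o noticias.json` in the current directory:

```python
import json

# Load the array produced by `scrapy crawl noticias ... -o noticias.json`
# (hypothetical path; adjust to wherever the crawl wrote its output).
with open("noticias.json") as f:
    noticias = json.load(f)

for n in noticias:
    print(n.get("date"), "-", n.get("title"))
```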
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class DiarioyucatanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class DiarioyucatanPipeline(object):
    def process_item(self, item, spider):
        return item
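The generated pipeline above passes every item through unchanged. As an illustration only (not part of this commit), a hedged sketch of a pipeline that drops articles scraped without a title, using Scrapy's `DropItem` exception; it would still need to be registered in `ITEM_PIPELINES` in `settings.py` below:

```python
from scrapy.exceptions import DropItem


class RequireTitlePipeline(object):
    """Hypothetical example pipeline: discard items that have no title."""

    def process_item(self, item, spider):
        if not item.get('title'):
            raise DropItem("missing title in %s" % item.get('url'))
        return item
```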
# -*- coding: utf-8 -*-
# Scrapy settings for diarioYucatan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioYucatan'
SPIDER_MODULES = ['diarioYucatan.spiders']
NEWSPIDER_MODULE = 'diarioYucatan.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan (+http://www.yourdomain.com)'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
#COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioYucatan.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioYucatan.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'diarioYucatan.pipelines.SomePipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
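The settings file above is the stock Scrapy template with every option commented out, so the project runs with Scrapy's defaults. A hedged sketch of how a few of those options might be enabled for polite crawling of a news site; the values are illustrative, not what this commit ships:

```python
# Illustrative overrides only; this commit leaves all of these commented out.
DOWNLOAD_DELAY = 1               # wait about a second between requests
AUTOTHROTTLE_ENABLED = True      # adapt the delay to the server's latency
AUTOTHROTTLE_START_DELAY = 5
ITEM_PIPELINES = {
    'diarioYucatan.pipelines.DiarioyucatanPipeline': 300,
}
```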
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Strip HTML tags from a fragment of markup."""
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Spider for Diario de Yucatán: one paginated archive page per date."""
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://yucatan.com.mx/' + year + '/' + month + '/' + day
        urls = [
            self.baseURL,
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Read the last pagination link to find how many archive pages exist.
        pages = response.css("div.pagination").css("a::attr(href)")[-1].extract()
        pages = int(pages[pages.rfind('/')+1:])
        for p in range(0, pages):
            yield scrapy.Request(url=response.url + "/page/" + str(p+1), callback=self.parse_page)

    def parse_page(self, response):
        # Follow every headline link on the archive page.
        for link in response.css("div.bp-head").css("h2").css("a::attr(href)").extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        item['title'] = response.css("h1.entry-title::text").extract_first()
        item['date'] = response.css("div.base-box").css("span.entry-date::attr(datetime)").extract_first()
        item['text'] = response.css("div.entry-content").css("p::text").extract()
        item['url'] = response.url
        yield item
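The spider is meant to be launched with the `scrapy crawl` command from the README, but it can also be driven from a small script. A minimal sketch using Scrapy's `CrawlerProcess`, assuming it is run from inside the `diarioYucatan` project directory (a hypothetical helper, not part of this commit):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Date arguments become spider attributes, just like -a on the command line.
process.crawl('noticias', year='2016', month='12', day='24')
process.start()  # blocks until the crawl finishes
```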
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = diarioYucatan.settings
[deploy]
#url = http://localhost:6800/
project = diarioYucatan
#!/bin/bash
for y in `seq 2010 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        for d in $(seq -f "%02g" 1 31)
        do
            scrapy crawl noticias -t json -o - > laJornada_$y-$m-$d.json -a year=$y -a month=$m -a day=$d
            # jsonlint-py -f laJornada_$y-$m-$d.json > laJornada_$y-$m-$d.json
        done
    done
    cd ..
done
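The loop above requests days 01 to 31 in every month, so it also asks for dates that do not exist (February 30, for example); those crawls simply return nothing. A hedged sketch of generating only valid dates with Python's `calendar` module, if one wanted to drive the crawls from Python instead (the helper name is hypothetical):

```python
import calendar


def valid_dates(first_year, last_year):
    """Yield (year, month, day) strings for every real calendar date."""
    for y in range(first_year, last_year + 1):
        for m in range(1, 13):
            _, last_day = calendar.monthrange(y, m)
            for d in range(1, last_day + 1):
                yield str(y), "%02d" % m, "%02d" % d


for y, m, d in valid_dates(2010, 2016):
    # Feed these to: scrapy crawl noticias -a year=Y -a month=M -a day=D
    print(y, m, d)
```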
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LajornadaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class LajornadaPipeline(object):
    def process_item(self, item, spider):
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for lajornada project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'lajornada'
SPIDER_MODULES = ['lajornada.spiders']
NEWSPIDER_MODULE = 'lajornada.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lajornada (+http://www.yourdomain.com)'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
#COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'lajornada.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'lajornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'lajornada.pipelines.SomePipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
"""@package laJornada_scrapy
Crawlwer para la jornada.unam.mx
uso:
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 month=12 day=24
"""
import scrapy
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Strip HTML tags from a fragment of markup."""
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class NoticiasSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        """Build the list of section URLs to crawl for the requested date."""
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://www.jornada.unam.mx/' + year + '/' + month + '/' + day + '/'
        urls = [
            self.baseURL + "opinion",
            self.baseURL + "politica",
            self.baseURL + "economia",
            self.baseURL + "mundo",
            self.baseURL + "estados",
            self.baseURL + "capital",
            self.baseURL + "sociedad",
            self.baseURL + "ciencias",
            self.baseURL + "cultura",
            self.baseURL + "espectaculos",
            self.baseURL + "deporte",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Main parser: follow every headline link on a section page."""
        for noticia in response.css('a.cabeza'):
            url = self.baseURL + noticia.css('::attr(href)').extract_first()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Parser for each individual article page."""
        item = NoticiasItem()
        item['title'] = response.css('div.cabeza::text').extract_first()
        item['url'] = response.url
        item['author'] = response.css('div.credito-autor::text').extract_first()
        item['location'] = response.css('p.s-s::text').extract_first()
        item['text'] = remove_tags(response.css('div.text').extract_first())
        item['topic'] = response.css('img.title::attr(title)').extract_first()
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = lajornada.settings
[deploy]
#url = http://localhost:6800/
project = lajornada
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = yucatanAlMinuto.settings
[deploy]
#url = http://localhost:6800/
project = yucatanAlMinuto
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class YucatanalminutoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class YucatanalminutoPipeline(object):
    def process_item(self, item, spider):
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for yucatanAlMinuto project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'yucatanAlMinuto'
SPIDER_MODULES = ['yucatanAlMinuto.spiders']
NEWSPIDER_MODULE = 'yucatanAlMinuto.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'yucatanAlMinuto (+http://www.yourdomain.com)'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
#COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yucatanAlMinuto.middlewares.MyCustomSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yucatanAlMinuto.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'yucatanAlMinuto.pipelines.SomePipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
#AUTOTHROTTLE_ENABLED=True
# The initial download delay
#AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True
#HTTPCACHE_EXPIRATION_SECS=0
#HTTPCACHE_DIR='httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[]
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.