Commit 093e0e82 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

merge with dev

parents fc91c136 15cbb498
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article as exchanged between the spider and the pipelines."""

    # Set by the spider from the article page.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()

    # Declared for the JSON writer, which emits them only when present; the
    # spider in this project does not appear to set them -- TODO confirm.
    location = scrapy.Field()
    author = scrapy.Field()

    # Section the article was listed under, and the page it was scraped from.
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariodechiapasSpiderMiddleware(object):
    """Pass-through spider middleware from the Scrapy project template.

    Every hook forwards what it receives unchanged; the only side effect is
    an INFO log line when a spider opens. Not all methods need to be defined:
    if one is missing, Scrapy acts as if the middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal.

        Scrapy calls this factory to instantiate the middleware.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Called for each response going into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, everything the spider returned for *response*.

        Must produce an iterable of Request, dict or Item objects.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or process_spider_input() raises.

        Should return None or an iterable of Response, dict or Item objects;
        this one returns None and so does not handle the exception.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works like process_spider_output() but has no response associated,
        and must produce only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted if a handler actually emits the record.
        spider.logger.info('Spider opened: %s', spider.name)
class DiariodechiapasDownloaderMiddleware(object):
    """Pass-through downloader middleware from the Scrapy project template.

    Requests and responses are forwarded unchanged; the only side effect is
    an INFO log line when a spider opens. Not all methods need to be defined:
    if one is missing, Scrapy acts as if the middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal.

        Scrapy calls this factory to instantiate the middleware.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Called for each request going through the downloader middleware.

        Must either return None (continue processing this request), return a
        Response or a Request object, or raise IgnoreRequest (in which case
        the process_exception() methods of installed downloader middleware
        are called). This one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded *response* unchanged.

        Must return a Response object, return a Request object, or raise
        IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or process_request() raises.

        Returning None continues processing the exception; returning a
        Response or Request object would stop the process_exception() chain.
        This one returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted if a handler actually emits the record.
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into a single JSON array file.

    The file is opened when the spider opens, receives one JSON object per
    item (separated by ",\\n"), and is terminated with "]" when the spider
    closes. The output path comes from the ``filename`` setting, e.g.
    ``scrapy crawl noticias -s filename=noticias.json``.
    """

    # Key order of every written record. Fields an item does not carry are
    # left out of its record.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the output path; the file itself is opened in open_spider()."""
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open (truncating) the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file.

        Does nothing if the file was never opened, and closes the handle even
        if writing the closing bracket fails.
        """
        if self.file is None:
            return
        try:
            self.file.write("]")
        finally:
            self.file.close()
            self.file = None

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Only the fields named in ``_FIELD_ORDER`` are written, in that order.
        """
        row = []
        for key in self._FIELD_ORDER:
            # A missing field raises KeyError; anything else is a real error
            # and is allowed to propagate instead of being silently dropped.
            try:
                row.append((key, item[key]))
            except KeyError:
                continue

        record = json.dumps(OrderedDict(row))
        if self.counter:
            # Every record after the first is preceded by a separator.
            record = ",\n" + record
        self.file.write(record)
        self.counter += 1
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioDeChiapas project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds this project's
# spiders (also used as the target package for newly generated spiders).
BOT_NAME = 'diarioDeChiapas'
SPIDER_MODULES = ['diarioDeChiapas.spiders']
NEWSPIDER_MODULE = 'diarioDeChiapas.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioDeChiapas (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the setting below is commented out, so this file does not
# switch robots.txt compliance on -- confirm that is intentional.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The value is expressed in seconds (see the download-delay link above).
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Every scraped item goes through JsonWriterPipeline, which writes to the path
# given by the `filename` setting (e.g. `-s filename=noticias.json`).
# 300 is the pipeline's order value.
ITEM_PIPELINES = {
    'diarioDeChiapas.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario de Chiapas, Chiapas
USAGE
$ cd diarioDeChiapas
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. The parse_date_files.py script must then be
used to split the news contained in noticias.json into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re, json
from datetime import datetime, date
from diarioDeChiapas.items import NoticiasItem
# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every substring matched by TAG_RE deleted."""
    stripped = TAG_RE.sub('', text)
    return stripped
class ImportantData(scrapy.Item):
    """
    Bookkeeping carried in ``request.meta['item']`` to steer the crawl flow
    """

    # True when a listing page is being revisited only to find its "next" link.
    to_next_page = scrapy.Field()

    # True for the request built from the last article link of a listing page.
    is_last_link = scrapy.Field()

    # Section heading of the listing page; copied into the item's topic.
    news_section = scrapy.Field()

    # URL of the listing page an article link was found on.
    return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects the articles listed on the Diario de Chiapas
    section landing pages.

    Without spider arguments, every section is walked from its first listing
    page through all "next" links. When the ``year``, ``month`` and ``day``
    arguments are all supplied, only articles published on or after that date
    are emitted, and a section's pagination is followed only while the last
    article of a listing page is still on or after that date.
    """
    name = "noticias"

    # Meta tag from which an article's publication timestamp is read.
    _PUBLISHED_XPATH = '//meta[@property="article:published_time"]/@content'

    def start_requests(self):
        """Yield the first request of every section.

        Sets ``self.stopDate`` to a ``date`` when year/month/day were all
        passed with ``-a``, otherwise to None.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        if year is not None and month is not None and day is not None:
            self.stopDate = date(int(year), int(month), int(day))
        else:
            self.stopDate = None

        baseURL = "http://www.diariodechiapas.com/landing/"
        section_list = ["editorial", "portada", "metropoli", "region", "la-roja",
                        "deportes", "boga", "ae", "trascendio"]

        for s in section_list:
            if self.stopDate is None:
                yield scrapy.Request(url=baseURL + s, callback=self.parse)
            else:
                flow_info = ImportantData()
                flow_info['to_next_page'] = False
                yield scrapy.Request(url=baseURL + s,
                                     callback=self.parse_with_stop_date,
                                     meta={'item': flow_info})

    def parse(self, response):
        """Request every article of a listing page, then the next listing page."""
        section = self._section_name(response)

        for link in self._post_links(response):
            flow_info = ImportantData()
            flow_info['news_section'] = section
            yield scrapy.Request(url=link, callback=self.parse_item,
                                 meta={'item': flow_info})

        next_page = self._next_page(response)
        if next_page is not None:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_with_stop_date(self, response):
        """Handle a listing page when a stop date is in effect.

        On the first visit (``to_next_page`` false) the article links are
        requested. The page is visited a second time, with ``to_next_page``
        true, once its last article turned out to be recent enough; that
        visit only follows the "next" link.
        """
        flow_info = response.meta['item']

        if not flow_info['to_next_page']:
            link_list = self._post_links(response)
            section = self._section_name(response)

            for link in link_list:
                link_info = ImportantData()
                link_info['news_section'] = section
                link_info['return_url'] = response.url
                # Compared by value on purpose: if the last URL also appears
                # earlier in the list, the earlier request carries the flag
                # too, so pagination survives duplicate-request filtering.
                link_info['is_last_link'] = (link == link_list[-1])
                yield scrapy.Request(url=link,
                                     callback=self.parse_item_with_stop_date,
                                     meta={'item': link_info})
        else:
            next_page = self._next_page(response)
            if next_page is not None:
                flow_info['to_next_page'] = False
                yield scrapy.Request(url=next_page,
                                     callback=self.parse_with_stop_date,
                                     meta={'item': flow_info})

    def parse_item(self, response):
        """Yield the article found on *response*."""
        yield self._build_item(response, response.meta['item'])

    def parse_item_with_stop_date(self, response):
        """Yield the article if it is not older than ``self.stopDate``.

        If the article was also the last link of its listing page, the
        listing page is requested again so that its "next" link is followed.
        Articles whose publication date is missing or malformed are skipped
        with a warning, because they cannot be compared with the stop date.
        """
        raw_date = response.xpath(self._PUBLISHED_XPATH).extract_first()
        news_date = self._parse_published_date(raw_date)
        if news_date is None:
            self.logger.warning("Skipping %s: unusable published_time %r",
                                response.url, raw_date)
            return

        if news_date >= self.stopDate:
            flow_info = response.meta['item']
            yield self._build_item(response, flow_info)

            if flow_info['is_last_link']:
                flow_info['to_next_page'] = True
                # dont_filter: the listing page was already downloaded once.
                yield scrapy.Request(url=flow_info['return_url'],
                                     callback=self.parse_with_stop_date,
                                     dont_filter=True,
                                     meta={'item': flow_info})

    @staticmethod
    def _parse_published_date(raw_date):
        """Return the calendar date of a timestamp string, or None if unusable."""
        if not raw_date:
            return None
        # Keep only what precedes "T". split() also copes with a value that
        # has no "T", where slicing up to find('T') == -1 would silently drop
        # the last digit of the day.
        try:
            return datetime.strptime(raw_date.split('T')[0], '%Y-%m-%d').date()
        except ValueError:
            return None

    def _post_links(self, response):
        """Return the article URLs listed on a section page."""
        return response.xpath('//section[@class="page__content"]').css(
            'section.post').xpath('./a[@class="post__link"]/@href').extract()

    def _section_name(self, response):
        """Return the section heading of a listing page without markup, or None."""
        section = response.xpath('//section[@class="wrapper"]/h1').extract_first()
        if section is not None:
            section = remove_tags(section)
        return section

    def _next_page(self, response):
        """Return the URL of the following listing page, or None on the last one."""
        return response.css('div.wp-pagenavi').css('a.nextpostslink').css(
            '::attr(href)').extract_first()

    def _build_item(self, response, flow_info):
        """Build a NoticiasItem from an article page and its flow bookkeeping."""
        title = response.xpath('//section[@class="single__content"]/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        paragraphs = response.xpath('//section[@class="single__content"]').css('p').extract()
        text = "\n".join(remove_tags(p) for p in paragraphs)

        ## News item info ##
        item = NoticiasItem()
        item['date'] = response.xpath(self._PUBLISHED_XPATH).extract_first()
        item['topic'] = flow_info['news_section']
        item['title'] = title
        item['text'] = text.strip()
        item['url'] = response.url
        return item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioDeChiapas.settings
[deploy]
#url = http://localhost:6800/
project = diarioDeChiapas
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article as exchanged between the spider and the pipelines."""

    # Set by the spider from the article page.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()

    # Declared for the JSON writer, which emits them only when present; the
    # spider in this project does not appear to set them -- TODO confirm.
    location = scrapy.Field()
    author = scrapy.Field()

    # Topic badge of the article, and the page it was scraped from.
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiarioindependienteSpiderMiddleware(object):
    """Pass-through spider middleware from the Scrapy project template.

    Every hook forwards what it receives unchanged; the only side effect is
    an INFO log line when a spider opens. Not all methods need to be defined:
    if one is missing, Scrapy acts as if the middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal.

        Scrapy calls this factory to instantiate the middleware.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Called for each response going into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, everything the spider returned for *response*.

        Must produce an iterable of Request, dict or Item objects.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or process_spider_input() raises.

        Should return None or an iterable of Response, dict or Item objects;
        this one returns None and so does not handle the exception.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works like process_spider_output() but has no response associated,
        and must produce only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted if a handler actually emits the record.
        spider.logger.info('Spider opened: %s', spider.name)
class DiarioindependienteDownloaderMiddleware(object):
    """Pass-through downloader middleware from the Scrapy project template.

    Requests and responses are forwarded unchanged; the only side effect is
    an INFO log line when a spider opens. Not all methods need to be defined:
    if one is missing, Scrapy acts as if the middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal.

        Scrapy calls this factory to instantiate the middleware.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Called for each request going through the downloader middleware.

        Must either return None (continue processing this request), return a
        Response or a Request object, or raise IgnoreRequest (in which case
        the process_exception() methods of installed downloader middleware
        are called). This one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded *response* unchanged.

        Must return a Response object, return a Request object, or raise
        IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or process_request() raises.

        Returning None continues processing the exception; returning a
        Response or Request object would stop the process_exception() chain.
        This one returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted if a handler actually emits the record.
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into a single JSON array file.

    The file is opened when the spider opens, receives one JSON object per
    item (separated by ",\\n"), and is terminated with "]" when the spider
    closes. The output path comes from the ``filename`` setting, e.g.
    ``scrapy crawl noticias -s filename=noticias.json``.
    """

    # Key order of every written record. Fields an item does not carry are
    # left out of its record.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the output path; the file itself is opened in open_spider()."""
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open (truncating) the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file.

        Does nothing if the file was never opened, and closes the handle even
        if writing the closing bracket fails.
        """
        if self.file is None:
            return
        try:
            self.file.write("]")
        finally:
            self.file.close()
            self.file = None

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Only the fields named in ``_FIELD_ORDER`` are written, in that order.
        """
        row = []
        for key in self._FIELD_ORDER:
            # A missing field raises KeyError; anything else is a real error
            # and is allowed to propagate instead of being silently dropped.
            try:
                row.append((key, item[key]))
            except KeyError:
                continue

        record = json.dumps(OrderedDict(row))
        if self.counter:
            # Every record after the first is preceded by a separator.
            record = ",\n" + record
        self.file.write(record)
        self.counter += 1
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioIndependiente project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds this project's
# spiders (also used as the target package for newly generated spiders).
BOT_NAME = 'diarioIndependiente'
SPIDER_MODULES = ['diarioIndependiente.spiders']
NEWSPIDER_MODULE = 'diarioIndependiente.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioIndependiente (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the setting below is commented out, so this file does not
# switch robots.txt compliance on -- confirm that is intentional.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The value is expressed in seconds (see the download-delay link above).
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioIndependiente.middlewares.DiarioindependienteSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioIndependiente.middlewares.DiarioindependienteDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Every scraped item goes through JsonWriterPipeline, which writes to the path
# given by the `filename` setting (e.g. `-s filename=noticias.json`).
# 300 is the pipeline's order value.
ITEM_PIPELINES = {
    'diarioIndependiente.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario El Independiente, Baja California Sur
USAGE:
$ cd elIndependiente/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. The parse_date_files.py script must then be
used to split the news contained in noticias.json into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
import scrapy, re, json
from datetime import datetime, date
from diarioIndependiente.items import NoticiasItem
# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every substring matched by TAG_RE deleted."""
    stripped = TAG_RE.sub('', text)
    return stripped
class ImportantFlowData(scrapy.Item):
    """
    Bookkeeping carried in ``request.meta['item']`` to steer the crawl flow
    """

    # True when a listing page is being revisited only to find its "next" link.
    to_next_page = scrapy.Field()

    # True for the request built from the last article link of a listing page.
    is_last_link = scrapy.Field()

    # URL of the listing page an article link was found on.
    return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects articles from Diario El Independiente.

    Without spider arguments, each section and all of its numbered pages are
    crawled. When the ``year``, ``month`` and ``day`` arguments are all
    supplied, the crawl starts from the ``/<year>/<month>/`` listing, emits
    only articles dated on or after that day, and follows the "next" link
    only while the last article of a listing page is still on or after it.
    """
    name = "noticias"

    def start_requests(self):
        """Yield the initial request(s) and set ``self.stop_date``."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        if year is not None and month is not None and day is not None:
            self.stop_date = date(int(year), int(month), int(day))
            # str() keeps the concatenation valid if the arguments were set
            # as integers rather than via "-a" strings.
            base_url = ("https://www.diarioelindependiente.mx/"
                        + str(year) + "/" + str(month) + "/")

            flow_info = ImportantFlowData()
            flow_info['to_next_page'] = False
            yield scrapy.Request(url=base_url,
                                 callback=self.parse_with_stop_date,
                                 meta={'item': flow_info})
        else:
            self.stop_date = None
            section_list = ["la-paz", "los-cabos", "policiaca", "deportes", "cultura", "nacional",
                            "internacional", "opinion", "espectaculos", "tecnologia"]
            base_url = "https://www.diarioelindependiente.mx/"

            for s in section_list:
                yield scrapy.Request(url=base_url + s, callback=self.parse)

    def parse(self, response):
        """Schedule every numbered page of a section for parse_page()."""
        # The first page is requested again (hence dont_filter) so that it is
        # handled by parse_page like all the other pages.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.paginacion').xpath('./ul/li/a/@href').extract()
        # The page count is read from the second-to-last pagination link
        # (presumably the last numbered page, followed by a "next" arrow --
        # TODO confirm). With fewer than two links there is nothing to read,
        # and indexing [-2] would raise IndexError.
        if len(pagination) >= 2:
            last_page_href = pagination[-2]
            pages = int(last_page_href[last_page_href.rfind('=') + 1:])

            # range() instead of the Python-2-only xrange(); pages 2..pages.
            for page in range(2, pages + 1):
                yield scrapy.Request(url=response.url + "?page=" + str(page),
                                     callback=self.parse_page)

    def parse_page(self, response):
        """Request every article listed on a section page."""
        for link in self._article_links(response):
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_with_stop_date(self, response):
        """Handle a listing page when a stop date is in effect.

        On the first visit (``to_next_page`` false) the article links are
        requested. The page is visited a second time, with ``to_next_page``
        true, once its last article turned out to be recent enough; that
        visit only follows the "next" link.
        """
        flow_info = response.meta['item']

        if not flow_info['to_next_page']:
            link_list = self._article_links(response)

            for link in link_list:
                link_info = ImportantFlowData()
                link_info['return_url'] = response.url
                # Compared by value on purpose: if the last URL also appears
                # earlier in the list, the earlier request carries the flag
                # too, so pagination survives duplicate-request filtering.
                link_info['is_last_link'] = (link == link_list[-1])
                yield scrapy.Request(url=link,
                                     callback=self.parse_item_with_stop_date,
                                     meta={'item': link_info})
        else:
            next_page = response.css('div.paginacion').xpath(
                './ul/li/a[@rel="next"]/@href').extract_first()
            if next_page is not None:
                flow_info['to_next_page'] = False
                yield scrapy.Request(url=next_page,
                                     callback=self.parse_with_stop_date,
                                     meta={'item': flow_info})

    def parse_item(self, response):
        """Yield the article found on *response*.

        Articles whose date is missing or malformed are skipped with a
        warning. A missing title or topic is emitted as None, matching
        parse_item_with_stop_date().
        """
        published = self._parse_news_date(response)
        if published is None:
            return
        yield self._build_item(response, published)

    def parse_item_with_stop_date(self, response):
        """Yield the article if it is not older than ``self.stop_date``.

        If the article was also the last link of its listing page, the
        listing page is requested again so that its "next" link is followed.
        Articles whose date is missing or malformed are skipped with a
        warning, because they cannot be compared with the stop date.
        """
        published = self._parse_news_date(response)
        if published is None:
            return

        if published.date() >= self.stop_date:
            flow_info = response.meta['item']
            yield self._build_item(response, published)

            if flow_info['is_last_link']:
                flow_info['to_next_page'] = True
                # dont_filter: the listing page was already downloaded once.
                yield scrapy.Request(url=flow_info['return_url'],
                                     callback=self.parse_with_stop_date,
                                     dont_filter=True,
                                     meta={'item': flow_info})

    def _article_links(self, response):
        """Return the article URLs listed on a page."""
        return response.xpath('//div[@id="colNoticias"]').css(
            'article.card__article').xpath('./h2/a/@href').extract()

    def _parse_news_date(self, response):
        """Return the page's ``date`` meta tag as a datetime, or None if unusable."""
        raw_date = response.xpath('//meta[@name="date"]/@content').extract_first()
        if raw_date is not None:
            try:
                return datetime.strptime(raw_date, '%Y-%m-%d')
            except ValueError:
                pass
        self.logger.warning("Skipping %s: unusable date meta value %r",
                            response.url, raw_date)
        return None

    def _build_item(self, response, published):
        """Build a NoticiasItem from an article page and its parsed date."""
        title = response.css('h1.colorRojo').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.xpath('//span[@class="badge"]').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        paragraphs = response.css('div.cuerpo_noticia').css('p').extract()
        text = "\n".join(remove_tags(p) for p in paragraphs)

        ## News item info ##
        item = NoticiasItem()
        item['date'] = published.isoformat("T")
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url
        return item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioIndependiente.settings
[deploy]
#url = http://localhost:6800/
project = diarioIndependiente
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article as exchanged between the spider and the pipelines."""

    # All seven fields are the keys the project's JSON writer pipeline looks
    # for; it emits each one only when the item actually carries it.
    # Which of them the spider fills in is not visible here -- TODO confirm.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class HeraldochihuahuaSpiderMiddleware(object):
    """Pass-through spider middleware from the Scrapy project template.

    Every hook forwards what it receives unchanged; the only side effect is
    an INFO log line when a spider opens. Not all methods need to be defined:
    if one is missing, Scrapy acts as if the middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal.

        Scrapy calls this factory to instantiate the middleware.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Called for each response going into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, everything the spider returned for *response*.

        Must produce an iterable of Request, dict or Item objects.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or process_spider_input() raises.

        Should return None or an iterable of Response, dict or Item objects;
        this one returns None and so does not handle the exception.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works like process_spider_output() but has no response associated,
        and must produce only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted if a handler actually emits the record.
        spider.logger.info('Spider opened: %s', spider.name)
class HeraldochihuahuaDownloaderMiddleware(object):
    """Pass-through downloader middleware from the Scrapy project template.

    Requests and responses are forwarded unchanged; the only side effect is
    an INFO log line when a spider opens. Not all methods need to be defined:
    if one is missing, Scrapy acts as if the middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal.

        Scrapy calls this factory to instantiate the middleware.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Called for each request going through the downloader middleware.

        Must either return None (continue processing this request), return a
        Response or a Request object, or raise IgnoreRequest (in which case
        the process_exception() methods of installed downloader middleware
        are called). This one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded *response* unchanged.

        Must return a Response object, return a Request object, or raise
        IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or process_request() raises.

        Returning None continues processing the exception; returning a
        Response or Request object would stop the process_exception() chain.
        This one returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        # Pass the name as a logging argument so the message is only
        # formatted if a handler actually emits the record.
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into a single JSON array file.

    The file is opened when the spider opens, receives one JSON object per
    item (separated by ",\\n"), and is terminated with "]" when the spider
    closes. The output path comes from the ``filename`` setting.
    """

    # Key order of every written record. Fields an item does not carry are
    # left out of its record.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the output path; the file itself is opened in open_spider()."""
        self.filename = filename
        self.counter = 0
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open (truncating) the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file.

        Does nothing if the file was never opened, and closes the handle even
        if writing the closing bracket fails.
        """
        if self.file is None:
            return
        try:
            self.file.write("]")
        finally:
            self.file.close()
            self.file = None

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Only the fields named in ``_FIELD_ORDER`` are written, in that order.
        """
        row = []
        for key in self._FIELD_ORDER:
            # A missing field raises KeyError; anything else is a real error
            # and is allowed to propagate instead of being silently dropped.
            try:
                row.append((key, item[key]))
            except KeyError:
                continue

        record = json.dumps(OrderedDict(row))
        if self.counter:
            # Every record after the first is preceded by a separator.
            record = ",\n" + record
        self.file.write(record)
        self.counter += 1
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for heraldoChihuahua project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds the spiders.
BOT_NAME = 'heraldoChihuahua'
SPIDER_MODULES = ['heraldoChihuahua.spiders']
NEWSPIDER_MODULE = 'heraldoChihuahua.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'heraldoChihuahua (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the line below is commented out, so this project does not set
# ROBOTSTXT_OBEY itself and Scrapy's own default applies -- confirm that this
# is intentional.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# This project overrides the default of 0 with a delay of 0.5.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Cookies are switched off for this project.
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'heraldoChihuahua.middlewares.HeraldochihuahuaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'heraldoChihuahua.middlewares.HeraldochihuahuaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline is the only pipeline enabled; 300 is its order value.
ITEM_PIPELINES = {
'heraldoChihuahua.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
El Heraldo de Chihuahua, Chihuahua
USAGE
$ cd heraldoChihuahua
------------------------------------------------------------------------------------------------------------
## Get all the news, from the most recent to the oldest. Afterwards, run the parse_date_files.py script
to split the news stored in noticias.json into one file per date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re, json, urllib
from datetime import datetime, date, tzinfo, timedelta
from collections import OrderedDict
from heraldoChihuahua.items import NoticiasItem
# Matches one markup tag: "<", then one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed."""
    cleaned = re.sub(TAG_RE, '', text)
    return cleaned


# Matches a run of one or two decimal digits (a day-of-month number).
DAY_NUM = re.compile(r'[0-9]{1,2}')
class UTC(tzinfo):
    """
    Fixed-offset time zone for Chihuahua (UTC-7).

    Despite its name, this class does not represent UTC itself; it always
    reports an offset of seven hours behind UTC.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC: minus seven hours."""
        return timedelta(hours=-7)

    def tzname(self, dt):
        """Return the time zone name."""
        return 'UTC-7'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The offset is fixed, so no adjustment ever applies. Without this
        method, datetime operations that consult ``dst()`` (for example
        ``timetuple()``) raise NotImplementedError.
        """
        return timedelta(0)
class ImportantData(scrapy.Item):
    """
    Bookkeeping record attached to requests (under meta['item']) so that
    the spider callbacks can coordinate pagination.
    """
    # True once the article links of a listing were handled and the
    # "load more" page should be requested next.
    to_next_page = scrapy.Field()
    # True when the article link equals the last link of its listing.
    is_last_link = scrapy.Field()
    # Section slug appended to the site's base URL.
    news_section = scrapy.Field()
    # Value sent as the "offset" parameter of the "load more" widget request.
    news_page = scrapy.Field()
    # URL of the listing the article link was taken from.
    return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """
    Spider for El Heraldo de Chihuahua.

    Walks every news section and yields one NoticiasItem per article. When
    the ``year``, ``month`` and ``day`` spider arguments are all given, the
    crawl only keeps articles published on or after that date and stops
    paginating a section once the last article of a listing is older.
    Otherwise every listing page is followed while a "load more" block is
    present.
    """
    name = "noticias"

    def start_requests(self):
        """Yield one request per section, choosing the crawl mode."""
        self.tz = UTC()
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        if year is not None and month is not None and day is not None:
            self.stopDate = date(int(year), int(month), int(day))
        else:
            self.stopDate = None

        self.baseURL = "https://www.elheraldodechihuahua.com.mx/"
        section_list = ["local", "policiaca", "mexico", "republica", "mundo", "finanzas",
                        "analisis", "gossip", "circulos", "cultura", "doble-via", "deportes"]
        self.month_parser = dict(enero='01', febrero='02', marzo='03', abril='04', mayo='05', junio='06',
                                 julio='07', agosto='08', septiembre='09', octubre='10', noviembre='11', diciembre='12')

        use_stop_date = self.stopDate is not None
        callback = self.parse_with_stop_date if use_stop_date else self.parse
        for s in section_list:
            flow_info = ImportantData()
            if use_stop_date:
                flow_info['to_next_page'] = False
            flow_info['news_page'] = 8
            flow_info['news_section'] = s
            request = scrapy.Request(url=self.baseURL + s, callback=callback)
            request.meta['item'] = flow_info
            yield request

    def parse(self, response):
        """Follow every article link, then the "load more" page if present."""
        flow_info = response.meta['item']
        for link in self._extract_links(response):
            yield scrapy.Request(url=link, callback=self.parse_item)

        if response.css('div.load-more').extract_first() is not None:
            yield self._load_more_request(flow_info, self.parse)

    def parse_with_stop_date(self, response):
        """Listing callback for the stop-date mode.

        On the first visit (``to_next_page`` is False) it requests every
        article of the listing. It is visited again with ``to_next_page``
        set to True once the last article was accepted; it then requests
        the next "load more" page, if the listing has one.
        """
        flow_info = response.meta['item']
        if not flow_info['to_next_page']:
            link_list = self._extract_links(response)
            news_page = flow_info['news_page']
            news_section = flow_info['news_section']
            for link in link_list:
                link_info = ImportantData()
                link_info['return_url'] = response.url
                link_info['news_page'] = news_page
                link_info['news_section'] = news_section
                # Compared by value on purpose: a link equal to the last
                # one is flagged wherever it appears in the listing.
                link_info['is_last_link'] = (link == link_list[-1])
                request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
                request.meta['item'] = link_info
                yield request
        elif response.css('div.load-more').extract_first() is not None:
            flow_info['to_next_page'] = False
            yield self._load_more_request(flow_info, self.parse_with_stop_date)

    def parse_item(self, response):
        """Yield the article as a NoticiasItem when its date can be read."""
        published = self._parse_published_date(response)
        if published is not None:
            yield self._build_item(response, published)

    def parse_item_with_stop_date(self, response):
        """Yield the article if it is not older than the stop date.

        When the accepted article was the last link of its listing, the
        listing URL is requested again (bypassing the duplicate filter) so
        that parse_with_stop_date can move on to the next page.
        """
        published = self._parse_published_date(response)
        if published is None or published < self.stopDate:
            return

        flow_info = response.meta['item']
        yield self._build_item(response, published)

        if flow_info['is_last_link']:
            flow_info['to_next_page'] = True
            request = scrapy.Request(url=flow_info['return_url'],
                                     callback=self.parse_with_stop_date, dont_filter=True)
            request.meta['item'] = flow_info
            yield request

    def _extract_links(self, response):
        """Return the article links of a listing page."""
        links = response.css('div.hard-news-row').css('h4.title > a::attr(href)').extract()
        if not links:
            # Fallback selector for pages without the "hard-news-row" blocks.
            links = response.css('h4.title > a::attr(href)').extract()
        return links

    def _load_more_request(self, flow_info, callback):
        """Advance the offset by 4 and build the next "load more" request."""
        # urlencode lives in different modules on Python 2 and Python 3.
        try:
            from urllib import urlencode
        except ImportError:
            from urllib.parse import urlencode

        flow_info['news_page'] += 4
        params = OrderedDict()
        params['widgetContentId'] = '148'
        params['widgetName'] = 'viewPicker'
        params['offset'] = str(flow_info['news_page'])
        next_load = self.baseURL + flow_info['news_section'] + "/widget/?" + urlencode(params)
        request = scrapy.Request(url=next_load, callback=callback)
        request.meta['item'] = flow_info
        return request

    def _parse_published_date(self, response):
        """Return the article's publication date, or None if unavailable.

        The text of ``p.published-date`` is split on "/", and the second
        piece is expected to read "<day> de <month name> de <year>".
        Text that does not follow that shape is logged and skipped instead
        of raising.
        """
        raw = response.css('p.published-date').extract_first()
        if raw is None:
            return None

        pieces = remove_tags(raw).split('/')
        if len(pieces) < 2:
            self.logger.warning("Unexpected published-date text at %s", response.url)
            return None

        date_lst = pieces[1].strip().replace('\n', '').split(' de ')
        element = DAY_NUM.search(date_lst[0])
        if not element:
            return None

        try:
            day = int(element.group())
            month = int(self.month_parser[date_lst[1].lower()])
            year = int(date_lst[2])
            return date(year, month, day)
        except (IndexError, KeyError, ValueError):
            self.logger.warning("Could not parse published date at %s", response.url)
            return None

    def _build_item(self, response, published):
        """Build the NoticiasItem for an article page."""
        item = NoticiasItem()

        topic = response.css('div.breadcrumb > a').extract_first()
        if topic is not None:
            topic = remove_tags(topic).strip("\n")

        title = response.css('h1.title').extract_first()
        if title is not None:
            title = remove_tags(title).strip("\n")

        paragraphs = response.css('div.content-body').xpath(
            './div[contains(@id, "content-body")]/p').extract()
        text = "".join(remove_tags(p) + "\n" for p in paragraphs)

        ## News item info ##
        item['date'] = datetime(published.year, published.month, published.day,
                                tzinfo=self.tz).isoformat("T")
        item['topic'] = topic
        item['title'] = title
        item['text'] = text.strip()
        item['url'] = response.url
        return item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = heraldoChihuahua.settings
[deploy]
#url = http://localhost:6800/
project = heraldoChihuahua
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LajornadamayaItem(scrapy.Item):
    """Item class that declares no fields.

    Fields would be declared here as ``name = scrapy.Field()``.
    """
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class LajornadamayaSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined,
    Scrapy acts as if the spider middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to ``spider_opened``.

        This method is used by Scrapy to create the middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # The four hooks below take ``self`` as their first parameter. Without
    # it, calling them on an instance binds the instance to ``response`` /
    # ``start_requests`` and the call fails.

    def process_spider_input(self, response, spider):
        """Accept every response that goes into the spider.

        Should return None or raise an exception; this one returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, everything the spider returned for ``response``.

        Must return an iterable of Request, dict or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Leave exceptions raised by the spider (or by another middleware's
        process_spider_input()) unhandled by returning None.

        Should return either None or an iterable of Response, dict or Item
        objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works like process_spider_output(), except that no response is
        associated. Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log that the spider was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class LajornadamayaPipeline(object):
    """Pass-through pipeline: every item is handed back as received."""

    def process_item(self, item, spider):
        """Return ``item`` without modifying it."""
        unchanged = item
        return unchanged
# -*- coding: utf-8 -*-
# Scrapy settings for laJornadaMaya project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds the spiders.
BOT_NAME = 'laJornadaMaya'
SPIDER_MODULES = ['laJornadaMaya.spiders']
NEWSPIDER_MODULE = 'laJornadaMaya.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornadaMaya (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Enabled for this project.
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornadaMaya.middlewares.LajornadamayaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornadaMaya.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaMaya.pipelines.LajornadamayaPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, json, re
from datetime import datetime, date, timedelta, tzinfo
"""
This version downloads the news published on a date given on the command line.
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
Not recommended for dates more than one month old.
"""
# Matches one markup tag: "<", then one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed."""
    cleaned = re.sub(TAG_RE, '', text)
    return cleaned
class UTC(tzinfo):
    """Fixed-offset time zone for Yucatan (central Mexico): UTC-6.

    Despite its name, this class does not represent UTC itself; it always
    reports an offset of six hours behind UTC.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC: minus six hours."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the time zone name."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The offset is fixed, so no adjustment ever applies. Without this
        method, datetime operations that consult ``dst()`` (for example
        ``timetuple()``) raise NotImplementedError.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
    """Fields of one scraped news article."""
    # Headline of the article.
    title = scrapy.Field()
    # Body text of the article.
    text = scrapy.Field()
    # Publication date of the article.
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    # Section the article was listed under.
    topic = scrapy.Field()
    # Address the article was downloaded from.
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Spider for La Jornada Maya that collects the news of one given date.

    The date comes from the ``year``, ``month`` and ``day`` spider
    arguments. For every section the spider reads the section front page
    (parse_2) and then probes the paginated JSON listing (parse) until the
    requested date is reached, after which it downloads the pages around
    that point (parse_page).

    NOTE(review): ``self.flag``, ``self.found`` and ``self.page`` are
    written by the ``parse`` callback and read by ``start_requests``. This
    appears to rely on Scrapy consuming ``start_requests`` lazily while
    responses arrive -- confirm before changing the order of statements.
    """
    name = "noticias"

    def start_requests(self):
        """Yield the section requests for the requested date.

        Raises:
            ValueError: if year, month or day is missing or not a number.
        """
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        if self.year is None or self.month is None or self.day is None:
            raise ValueError("The year, month and day spider arguments are required, "
                             "e.g. -a year=2017 -a month=3 -a day=22")
        self.req_date = date(int(self.year), int(self.month), int(self.day))
        self.date_format = "%Y-%m-%d"
        self.baseURL = 'https://www.lajornadamaya.mx'
        section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional',
                        'internacional', 'opinion']

        for section in section_list:
            self.section = section

            # First the section front page (the newest news).
            yield scrapy.Request(url=self.baseURL+'/'+section, callback=self.parse_2)

            # Then the JSON listings.
            self.page = 0
            self.flag = False
            self.found = False

            if section == 'opinion':
                yield scrapy.Request(url=self.baseURL+'/notas?opinion', callback=self.parse_page)
                continue

            page = -1
            while True:
                # parse() raises the flag when the requested date was
                # reached or passed, or when a page came back empty.
                if self.flag:
                    self.flag = False
                    break
                page += 1
                yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(page), callback=self.parse)

            if self.found:
                self.found = False
                # Step back up to two pages from the page where the date
                # was found.
                self.page -= 1
                if self.page > 0:
                    self.page -= 1
                # Never request a negative page number (this happened when
                # the date was found on page 0).
                for pag in range(max(self.page, 0), self.page+6):
                    yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(pag),
                                         callback=self.parse_page, dont_filter=True)

    def parse_2(self, response):  # for the section front page
        """Request the front-page articles whose link carries the requested date."""
        path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
        link_list = []
        for path in path_list:
            link_list += response.xpath(path).extract()

        wanted = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
        for link in link_list:
            # The part of the link before its last "/" is the date.
            link_date = link[:link.rfind('/')]
            if link_date == wanted:
                item = NoticiasItem()
                item['date'] = self._to_iso(link_date)
                item['topic'] = response.url[response.url.rfind('/')+1:].title()
                request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
                request.meta['item'] = item
                yield request

    def parse(self, response):  # for the JSON listings
        """Probe one JSON page and update the paging flags.

        Sets ``self.flag`` when paging must stop, and ``self.found`` plus
        ``self.page`` when the requested date appears on this page.
        """
        json_response = json.loads(response.text)
        if not (response.url[response.url.rfind('/')+1:] == 'notas?opinion'):
            json_list = json_response
        else:
            json_list = json_response['articles']

        if not json_list:
            # An empty page means there is nothing further to look at.
            # Without this the paging loop in start_requests never ends.
            self.flag = True
            return

        for line in json_list:
            this_date = self._publish_day(line)
            if this_date == self.req_date:
                self.page = int(response.url[response.url.rfind('=')+1:])
                self.found = True
                self.flag = True
                break
            elif this_date < self.req_date:
                self.flag = True
                break

    def parse_item_2(self, response):  # for the section front page
        """Complete and yield the item of a front-page article."""
        item = response.meta['item']
        text = ''
        item['title'] = response.xpath('//article/h1/text()').extract_first()
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # Progress output; the parenthesised form also runs on Python 3.
        print(item['title'])
        yield item

    def parse_page(self, response):  # for the JSON listings
        """Request every article of a JSON page published on the requested date."""
        json_response = json.loads(response.text)
        is_opinion = response.url[response.url.rfind('/')+1:] == 'notas?opinion'
        if not is_opinion:
            # Text between the last "/" and the "?p=" query.
            topic = response.url[response.url.rfind('/')+1:response.url.rfind('=')-2].title()
            json_list = json_response
        else:
            json_list = json_response['articles']
            topic = 'Opinion'

        for line in json_list:
            if self._publish_day(line) == self.req_date:
                item = NoticiasItem()
                item['date'] = self._to_iso(line['publishDate'])
                item['topic'] = topic
                item['title'] = line['name']
                if not is_opinion:
                    url = self.baseURL+line['url']
                else:
                    url = (self.baseURL+'/'+line['publishDate'][:line['publishDate'].rfind(' ')]
                           + '/'+line['uriComponent'])
                request = scrapy.Request(url=url, callback=self.parse_item)
                request.meta['item'] = item
                yield request

    def parse_item(self, response):  # for the JSON listings
        """Complete and yield the item of an article found in a JSON page."""
        item = response.meta['item']
        text = ''
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # Progress output; the parenthesised form also runs on Python 3.
        print(item['title'])
        yield item

    def _publish_day(self, line):
        """Return the date of a JSON entry (the text before the last space
        of its ``publishDate``)."""
        stamp = line['publishDate']
        return datetime.strptime(stamp[:stamp.rfind(' ')], self.date_format).date()

    def _to_iso(self, stamp):
        """Convert "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" to ISO 8601 with
        this spider's time zone; any other text is returned unchanged."""
        if len(stamp) == 10:
            d = [int(part) for part in stamp.split('-')]
            return datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
        if len(stamp) == 19:
            day_part, time_part = stamp.split(' ')
            d = [int(part) for part in day_part.split('-')]
            t = [int(part) for part in time_part.split(':')]
            return datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
        return stamp
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornadaMaya.settings
[deploy]
#url = http://localhost:6800/
project = laJornadaMaya
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tintaFresca.settings
[deploy]
#url = http://localhost:6800/
project = tintaFresca
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields of one scraped news article."""
    # Each attribute below declares one item field.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TintafrescaSpiderMiddleware(object):
    """Pass-through spider middleware.

    A hook that is not defined is treated by Scrapy as "does not modify
    the passed objects"; the hooks defined here behave that way too.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to ``spider_opened``.

        Scrapy uses this factory to create the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response heading into the spider.

        The hook should return None or raise; this one returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each Request, dict or Item the spider produced, unchanged."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Leave spider exceptions unhandled by returning None.

        The hook may return None or an iterable of Response, dict or Item
        objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Same idea as process_spider_output(), but no response is
        associated and only requests (not items) may be returned.
        """
        for start_request in start_requests:
            yield start_request

    def spider_opened(self, spider):
        """Write an info-level log line naming the opened spider."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class TintafrescaDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    A hook that is not defined is treated by Scrapy as "does not modify
    the passed objects"; the hooks defined here behave that way too.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to ``spider_opened``.

        Scrapy uses this factory to create the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Let every request continue down the middleware chain.

        The hook may return None (continue), a Response, a Request, or
        raise IgnoreRequest; this one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back untouched.

        The hook may return a Response, return a Request, or raise
        IgnoreRequest; this one returns the object it received.
        """
        untouched = response
        return untouched

    def process_exception(self, request, exception, spider):
        """Leave download exceptions unhandled by returning None.

        Returning a Response or a Request instead would stop the
        process_exception() chain.
        """
        return None

    def spider_opened(self, spider):
        """Write an info-level log line naming the opened spider."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into a single JSON array file.

    The output path is read from the ``filename`` crawler setting (given on
    the command line with ``-s filename=...``). Items are written one per
    line, separated by commas, between an opening "[" and a closing "]".
    """

    # Keys copied from every item, in the order they appear in the output.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the ``filename`` crawler setting."""
        # The value comes from the "filename" command line parameter.
        filename = crawler.settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and write the start of the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the end of the JSON array and close the output file."""
        try:
            self.file.write("]")
        finally:
            # Release the file handle even if the final write fails.
            self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged.

        Only the keys listed in ``FIELD_ORDER`` are written, in that order;
        keys missing from the item are left out of the JSON object.

        Args:
            item: Mapping-like scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` that was passed in.
        """
        row = []
        for field in self.FIELD_ORDER:
            try:
                row.append((field, item[field]))
            except KeyError:
                # The field was never set on this item: skip it. Any other
                # error is a real problem and is allowed to surface.
                continue
        line = OrderedDict(row)

        self.counter += 1
        # Every record after the first one is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tintaFresca project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds the spiders.
BOT_NAME = 'tintaFresca'
SPIDER_MODULES = ['tintaFresca.spiders']
NEWSPIDER_MODULE = 'tintaFresca.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tintaFresca (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the line below is commented out, so this project does not set
# ROBOTSTXT_OBEY itself and Scrapy's own default applies -- confirm that this
# is intentional.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# This project overrides the default of 0 with a delay of 0.5.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# Cookies are switched off for this project.
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tintaFresca.middlewares.TintafrescaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tintaFresca.middlewares.TintafrescaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline is the only pipeline enabled; 300 is its order value.
ITEM_PIPELINES = {
'tintaFresca.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Tinta Fresca, Chiapas
USAGE
$ cd tintaFresca
------------------------------------------------------------------------------------------------------------
## Get all the news, from the most recent to the oldest. Afterwards, run the parse_date_files.py script
to split the news stored in noticias.json into one file per date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re, json
from datetime import datetime, date, tzinfo, timedelta
from tintaFresca.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag deleted."""
    # Splitting on the tag pattern keeps only the text between tags, so
    # joining the pieces back together is the same as deleting each tag.
    pieces = TAG_RE.split(text)
    return ''.join(pieces)
class UTC(tzinfo):
    """
    Fixed-offset time zone used for Chiapas news dates (UTC-6).

    Despite its name this class does not model UTC itself: every datetime
    carrying it is reported as six hours behind UTC, with no daylight
    saving adjustment.
    """

    def utcoffset(self, dt):
        ## Time zone for Chiapas: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        ## No daylight saving. Without this override the inherited
        ## tzinfo.dst() raises NotImplementedError (e.g. in timetuple()). ##
        return timedelta(0)
class ImportantData(scrapy.Item):
    """
    Useful data for the flow of the implementation
    """
    # Read by parse_with_stop_date: when falsy the article links of the
    # listing page are followed, otherwise its "next page" link is followed.
    to_next_page = scrapy.Field()
    # True when the request was built from a link equal to the last link of
    # its listing page.
    is_last_link = scrapy.Field()
    # Not assigned by the spider in this file (the code that filled it is
    # commented out).
    news_section = scrapy.Field()
    # URL of the listing page the article link was taken from.
    return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """
    Spider for the Tinta Fresca (Chiapas) news site.

    Without date arguments every listing page of every section is crawled.
    When the ``year``, ``month`` and ``day`` spider arguments are all given,
    only news dated on or after that day are emitted, and a section stops
    being paginated once the last article of a listing page is older than
    that day.
    """
    name = "noticias"

    def start_requests(self):
        """Yield the request for the first listing page of every section."""
        self.tz = UTC()

        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        if year is not None and month is not None and day is not None:
            self.stopDate = date(int(year), int(month), int(day))
        else:
            self.stopDate = None

        baseURL = "http://tintafresca.com.mx/"

        # section_list = ["letras_en_su_tinta/page1/", "tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/", "rafaga/page1/"]
        section_list = ["tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/"]

        # Spanish month name -> two-digit month number, as printed by the site.
        self.month_parser = dict(Enero='01', Febrero='02', Marzo='03', Abril='04', Mayo='05', Junio='06',
                                 Julio='07', Agosto='08', Septiembre='09', Octubre='10', Noviembre='11', Diciembre='12')

        if self.stopDate is None:
            for s in section_list:
                yield scrapy.Request(url=baseURL + s, callback=self.parse)
        else:
            for s in section_list:
                flow_info = ImportantData()
                flow_info['to_next_page'] = False
                request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
                request.meta['item'] = flow_info
                yield request

    def parse(self, response):
        """Follow every article of a listing page, then the next listing page."""
        link_list = response.css('ul.tintas').css('a.leer::attr(href)').extract()

        for link in link_list:
            request = scrapy.Request(url=link, callback=self.parse_item)
            request.meta['item'] = ImportantData()
            yield request

        next_page = response.xpath('//ul[@class="pagination"]/li[3]/a/@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_with_stop_date(self, response):
        """
        Handle a listing page while a stop date is active.

        The page is visited twice: first to follow its articles, and again
        (re-requested by parse_item_with_stop_date with ``to_next_page`` set)
        to follow its "next page" link.
        """
        flow_info = response.meta['item']

        if not flow_info['to_next_page']:
            link_list = response.css('ul.tintas').css('a.leer::attr(href)').extract()

            for link in link_list:
                flow_info = ImportantData()
                flow_info['return_url'] = response.url
                # Compared by value on purpose: if the last link repeats an
                # earlier one, the earlier request carries the flag too, so it
                # survives even when the repeated request is dropped as a
                # duplicate.
                flow_info['is_last_link'] = (link == link_list[-1])

                request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
                request.meta['item'] = flow_info
                yield request
        else:
            next_page = response.xpath('//ul[@class="pagination"]/li[3]/a/@href').extract_first()
            if next_page is not None:
                flow_info['to_next_page'] = False
                request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
                request.meta['item'] = flow_info
                yield request

    def parse_item(self, response):
        """Emit the news item of an article page (skipped if it has no readable date)."""
        news_date = self._parse_news_date(response)
        if news_date is None:
            return

        yield self._build_item(response, news_date)

    def parse_item_with_stop_date(self, response):
        """
        Emit the news item when it is dated on or after the stop date.

        When the emitted article was the last one of its listing page, the
        listing page is requested again so that its next page gets followed.
        """
        news_date = self._parse_news_date(response)
        if news_date is None or news_date < self.stopDate:
            return

        yield self._build_item(response, news_date)

        flow_info = response.meta['item']
        if flow_info['is_last_link']:
            flow_info['to_next_page'] = True
            # dont_filter: the listing URL was already requested once.
            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
            request.meta['item'] = flow_info
            yield request

    def _parse_news_date(self, response):
        """Return the article's publication ``date``, or None when it cannot be read."""
        date_str = response.xpath('//div[@class="balazo"]').extract_first()
        if date_str is None:
            self.logger.warning("No date block found in %s", response.url)
            return None

        # When the block contains a <br>, only what follows it is kept.
        br_pos = date_str.find('<br>')
        if br_pos > -1:
            date_str = date_str[br_pos:]

        # Expected shape after removing the tags: day/MonthName/year.
        parts = [part.strip() for part in remove_tags(date_str).split('/')]
        try:
            month = self.month_parser[parts[1]]
            return date(int(parts[2]), int(month), int(parts[0]))
        except (IndexError, KeyError, ValueError):
            self.logger.warning("Unreadable date %r in %s", date_str, response.url)
            return None

    def _build_item(self, response, news_date):
        """Build the NoticiasItem for an article page dated ``news_date``."""
        item = NoticiasItem()

        topic = response.css('div.seccion > h3.left > a').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        if topic is None:
            title = None
        elif topic == "Letras en su Tinta":
            # This section has no headline: the summary lines act as the title.
            lines = [remove_tags(line) for line in response.css('div.sumario > p').extract()]
            title = (". ".join(lines) + ".") if lines else ''
        else:
            title = response.css('div.titulo > h1').extract_first()
            if title is not None:
                title = remove_tags(title)

        text = "\n".join(remove_tags(p) for p in response.css('div.contenido > p').extract())

        ## News item info ##
        item['date'] = datetime(news_date.year, news_date.month, news_date.day, tzinfo=self.tz).isoformat("T")
        item['topic'] = topic
        item['title'] = title
        item['text'] = text.strip()
        item['url'] = response.url

        return item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    # Container for one scraped news article.
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # article headline
    text = scrapy.Field()      # article body
    date = scrapy.Field()      # publication date; the cuartoPoder spider stores an isoformat() string
    location = scrapy.Field()  # not set by the cuartoPoder spider
    author = scrapy.Field()    # not set by the cuartoPoder spider
    topic = scrapy.Field()     # topic label taken from the article page
    url = scrapy.Field()       # URL of the article page
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class CuartopoderSpiderMiddleware(object):
    """
    Pass-through spider middleware.

    Every hook hands its input on unchanged; hooks that are not needed may
    be removed, in which case Scrapy treats them as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Returns None (an exception could be raised instead).
        return None

    def process_spider_output(self, response, result, spider):
        # Runs on what the spider returned for a response; must produce an
        # iterable of Request, dict or Item objects.
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        # Runs when the spider, or another middleware's
        # process_spider_input(), raised. No custom handling: returns None.
        return None

    def process_start_requests(self, start_requests, spider):
        # Same idea as process_spider_output(), but for the start requests,
        # which have no response attached. Must yield only requests.
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class CuartopoderDownloaderMiddleware(object):
    """
    Pass-through downloader middleware.

    Requests, responses and exceptions travel through untouched; hooks that
    are not needed may be removed, in which case Scrapy treats them as
    no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Runs for each request on its way to the downloader. Allowed outcomes:
        #   - None: keep processing this request (what happens here)
        #   - a Response object
        #   - a Request object
        #   - raise IgnoreRequest: the process_exception() methods of the
        #     installed downloader middleware are then called
        return None

    def process_response(self, request, response, spider):
        # Runs on the downloader's response. Allowed outcomes:
        #   - a Response object (the same one is handed back here)
        #   - a Request object
        #   - raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Runs when a download handler, or another middleware's
        # process_request(), raised. Allowed outcomes:
        #   - None: keep processing this exception (what happens here)
        #   - a Response object: stops the process_exception() chain
        #   - a Request object: stops the process_exception() chain
        return None

    def spider_opened(self, spider):
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes every scraped item into one JSON array file.

    The output path is read from the ``filename`` setting, e.g.
    ``scrapy crawl noticias -s filename=2018-08-30.json``.
    """

    # Keys are written in this order; keys an item does not hold are left out.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Field not populated for this item. Only the missing-key
                # case is ignored; any other error is a real problem and
                # must surface.
                continue

        line = json.dumps(OrderedDict(row))

        # Every record after the first one is preceded by a separator.
        if self.counter > 0:
            line = ",\n" + line
        self.file.write(line)
        self.counter += 1

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for cuartoPoder project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package holding its spiders.
BOT_NAME = 'cuartoPoder'
SPIDER_MODULES = ['cuartoPoder.spiders']
NEWSPIDER_MODULE = 'cuartoPoder.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cuartoPoder (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: the line below is commented out, so this project does not set
# ROBOTSTXT_OBEY itself.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# This project overrides the default of 0 with 0.5.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'cuartoPoder.middlewares.CuartopoderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'cuartoPoder.middlewares.CuartopoderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline reads its output path from the "filename" setting,
# which the crawl command supplies with `-s filename=...`.
ITEM_PIPELINES = {
    'cuartoPoder.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Cuarto Poder, Chiapas
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd cuartoPoder/
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re
from cuartoPoder.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip every ``<...>`` markup tag from ``text`` and return the rest."""
    # Cutting the string at each tag and gluing the remaining pieces
    # together removes the tags and nothing else.
    fragments = TAG_RE.split(text)
    return ''.join(fragments)
class UTC(tzinfo):
    """
    Fixed-offset time zone used for Chiapas news dates (UTC-6).

    Despite its name this class does not model UTC itself: every datetime
    carrying it is reported as six hours behind UTC, with no daylight
    saving adjustment.
    """

    def utcoffset(self, dt):
        ## Time zone for Chiapas: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        ## No daylight saving. Without this override the inherited
        ## tzinfo.dst() raises NotImplementedError (e.g. in timetuple()). ##
        return timedelta(0)
class ImportantData(scrapy.Item):
    """
    Useful data for the flow of the implementation
    """
    # Flag read by QuotesSpider.parse to decide whether another results page
    # is requested; start_requests initialises it to False.
    to_next_page = scrapy.Field()
    # Number of the next results page to request; starts at 2 and is
    # incremented each time a page request is built.
    next_page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """
    Spider for Cuarto Poder (Chiapas): collects the news of one given day.

    The ``year``, ``month`` and ``day`` spider arguments are required
    (``-a year=2018 -a month=8 -a day=30``).
    """
    name = "noticias"

    def start_requests(self):
        """Yield the request for the first archive page of the requested day."""
        self.tz = UTC()

        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        if self.year is None or self.month is None or self.day is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError("the year, month and day spider arguments are required "
                             "(-a year=YYYY -a month=M -a day=D)")

        self.baseURL = "http://www.cuartopoder.mx"
        first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
        self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"

        flow_info = ImportantData()
        flow_info['to_next_page'] = False
        flow_info['next_page'] = 2

        request = scrapy.Request(url=first_URL, callback=self.parse)
        request.meta['item'] = flow_info
        yield request

    def parse(self, response):
        """
        Follow every article of an archive page, then the following page.

        The next page is requested only when the current one listed at least
        one article, so pagination ends at the first page without articles.
        """
        flow_info = response.meta['item']

        for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract():
            # The flag must be stored in flow_info: it is what the check
            # below reads (a plain local variable would be ignored).
            flow_info['to_next_page'] = True
            news_link = self.baseURL + link
            yield scrapy.Request(url=news_link, callback=self.parse_item)

        if flow_info['to_next_page']:
            page = flow_info['next_page']
            page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))

            # Reset for the next page, which has to earn the flag again.
            flow_info['to_next_page'] = False
            flow_info['next_page'] += 1

            request = scrapy.Request(url=page_URL, callback=self.parse)
            request.meta['item'] = flow_info
            yield request

    def parse_item(self, response):
        """Emit the news item of an article page, dated with the requested day."""
        item = NoticiasItem()

        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        title = response.css('div.post-title').css('h1').extract_first()
        if title is not None:
            # Keep the cleaned value; remove_tags() returns a new string.
            title = remove_tags(title)

        topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        text = "\n".join(remove_tags(p) for p in response.css('div.post-content').css('p').extract())

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = cuartoPoder.settings
[deploy]
#url = http://localhost:6800/
project = cuartoPoder
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    # Container for one scraped news article.
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # article headline
    text = scrapy.Field()      # article body
    date = scrapy.Field()      # publication date; the diarioPuntual spider stores an isoformat() string
    location = scrapy.Field()  # not set by the diarioPuntual spider
    author = scrapy.Field()    # not set by the diarioPuntual spider
    topic = scrapy.Field()     # the diarioPuntual spider always stores None here
    url = scrapy.Field()       # URL of the article page
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariopuntualSpiderMiddleware(object):
    """
    Pass-through spider middleware.

    Every hook hands its input on unchanged; hooks that are not needed may
    be removed, in which case Scrapy treats them as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Returns None (an exception could be raised instead).
        return None

    def process_spider_output(self, response, result, spider):
        # Runs on what the spider returned for a response; must produce an
        # iterable of Request, dict or Item objects.
        for entry in result:
            yield entry

    def process_spider_exception(self, response, exception, spider):
        # Runs when the spider, or another middleware's
        # process_spider_input(), raised. No custom handling: returns None.
        return None

    def process_start_requests(self, start_requests, spider):
        # Same idea as process_spider_output(), but for the start requests,
        # which have no response attached. Must yield only requests.
        for start_request in start_requests:
            yield start_request

    def spider_opened(self, spider):
        text = 'Spider opened: %s' % spider.name
        spider.logger.info(text)
class DiariopuntualDownloaderMiddleware(object):
    """
    Pass-through downloader middleware.

    Requests, responses and exceptions travel through untouched; hooks that
    are not needed may be removed, in which case Scrapy treats them as
    no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_request(self, request, spider):
        # Runs for each request on its way to the downloader. Allowed outcomes:
        #   - None: keep processing this request (what happens here)
        #   - a Response object
        #   - a Request object
        #   - raise IgnoreRequest: the process_exception() methods of the
        #     installed downloader middleware are then called
        return None

    def process_response(self, request, response, spider):
        # Runs on the downloader's response. Allowed outcomes:
        #   - a Response object (the same one is handed back here)
        #   - a Request object
        #   - raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Runs when a download handler, or another middleware's
        # process_request(), raised. Allowed outcomes:
        #   - None: keep processing this exception (what happens here)
        #   - a Response object: stops the process_exception() chain
        #   - a Request object: stops the process_exception() chain
        return None

    def spider_opened(self, spider):
        text = 'Spider opened: %s' % spider.name
        spider.logger.info(text)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes every scraped item into one JSON array file.

    The output path is read from the ``filename`` setting, e.g.
    ``scrapy crawl noticias -s filename=2018-09-05.json``.
    """

    # Keys are written in this order; keys an item does not hold are left out.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Field not populated for this item. Only the missing-key
                # case is ignored; any other error is a real problem and
                # must surface.
                continue

        line = json.dumps(OrderedDict(row))

        # Every record after the first one is preceded by a separator.
        if self.counter > 0:
            line = ",\n" + line
        self.file.write(line)
        self.counter += 1

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioPuntual project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package holding its spiders.
BOT_NAME = 'diarioPuntual'
SPIDER_MODULES = ['diarioPuntual.spiders']
NEWSPIDER_MODULE = 'diarioPuntual.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioPuntual (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: the line below is commented out, so this project does not set
# ROBOTSTXT_OBEY itself.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# This project overrides the default of 0 with 0.5.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioPuntual.middlewares.DiariopuntualSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioPuntual.middlewares.DiariopuntualDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline reads its output path from the "filename" setting,
# which the crawl command supplies with `-s filename=...`.
ITEM_PIPELINES = {
    'diarioPuntual.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Puntual, EDOMEX
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd diarioPuntual/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from diarioPuntual.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` without its ``<...>`` markup tags."""
    # The tag pattern acts as a separator: what remains between separators
    # is exactly the text with the tags taken out.
    kept = TAG_RE.split(text)
    return ''.join(kept)
class UTC(tzinfo):
    """
    Fixed-offset time zone used for EDOMEX news dates (UTC-6).

    Despite its name this class does not model UTC itself: every datetime
    carrying it is reported as six hours behind UTC, with no daylight
    saving adjustment.
    """

    def utcoffset(self, dt):
        ## Time zone for EDOMEX: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        ## No daylight saving. Without this override the inherited
        ## tzinfo.dst() raises NotImplementedError (e.g. in timetuple()). ##
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """
    Spider for Puntual (EDOMEX): collects the news archived under one day.

    The day comes from the ``year``, ``month`` and ``day`` spider arguments.
    """
    name = "noticias"

    def start_requests(self):
        """Yield the request for the archive page of the requested day."""
        self.tz = UTC()

        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        # Month and day are zero-padded to two digits in the archive URL.
        padded_month = self.month.zfill(2)
        padded_day = self.day.zfill(2)
        day_url = "http://diario-puntual.com.mx/{0}/{1}/{2}/".format(self.year, padded_month, padded_day)

        yield scrapy.Request(url=day_url, callback=self.parse)

    def parse(self, response):
        """Follow every article of an archive page, then the next archive page."""
        article_links = response.css('div.post-column').css('h2.posttitle > a::attr(href)').extract()
        for article_url in article_links:
            yield scrapy.Request(url=article_url, callback=self.parse_item)

        pagination = response.css('div.archive-pagination')
        following = pagination.xpath('./a[@class="next page-numbers"]/@href').extract_first()
        if following is None:
            return

        yield scrapy.Request(url=following, callback=self.parse)

    def parse_item(self, response):
        """Emit the news item of an article page, dated with the requested day."""
        headline = response.css('div.post-container').css('h1.post-title').extract_first()
        if headline is not None:
            headline = remove_tags(headline)

        # One line per paragraph, tags removed.
        paragraphs = response.css('div.post-column > article').css('p').extract()
        body = ''.join(remove_tags(paragraph) + "\n" for paragraph in paragraphs)

        midnight = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz)

        ## News item info ##
        item = NoticiasItem()
        item['date'] = midnight.isoformat("T")
        item['title'] = headline
        item['topic'] = None
        item['text'] = body.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioPuntual.settings
[deploy]
#url = http://localhost:6800/
project = diarioPuntual
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    # Container for one scraped news article.
    # NOTE(review): the spider that fills these fields is not in this file;
    # the descriptions below are inferred from the field names -- confirm.
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # presumably the article headline
    text = scrapy.Field()      # presumably the article body
    date = scrapy.Field()      # presumably the publication date
    location = scrapy.Field()  # presumably the place the news refers to
    author = scrapy.Field()    # presumably the article's author
    topic = scrapy.Field()     # presumably the section or topic label
    url = scrapy.Field()       # presumably the URL of the article page
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElcomentarioSpiderMiddleware(object):
    """
    Pass-through spider middleware.

    Every hook hands its input on unchanged; hooks that are not needed may
    be removed, in which case Scrapy treats them as no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        created = cls()
        crawler.signals.connect(created.spider_opened, signal=signals.spider_opened)
        return created

    def process_spider_input(self, response, spider):
        # Runs for each response on its way into the spider.
        # Returns None (an exception could be raised instead).
        return None

    def process_spider_output(self, response, result, spider):
        # Runs on what the spider returned for a response; must produce an
        # iterable of Request, dict or Item objects.
        for output in result:
            yield output

    def process_spider_exception(self, response, exception, spider):
        # Runs when the spider, or another middleware's
        # process_spider_input(), raised. No custom handling: returns None.
        return None

    def process_start_requests(self, start_requests, spider):
        # Same idea as process_spider_output(), but for the start requests,
        # which have no response attached. Must yield only requests.
        for initial in start_requests:
            yield initial

    def spider_opened(self, spider):
        notice = 'Spider opened: %s' % spider.name
        spider.logger.info(notice)
class ElcomentarioDownloaderMiddleware(object):
    """
    Pass-through downloader middleware.

    Requests, responses and exceptions travel through untouched; hooks that
    are not needed may be removed, in which case Scrapy treats them as
    no-ops.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        created = cls()
        crawler.signals.connect(created.spider_opened, signal=signals.spider_opened)
        return created

    def process_request(self, request, spider):
        # Runs for each request on its way to the downloader. Allowed outcomes:
        #   - None: keep processing this request (what happens here)
        #   - a Response object
        #   - a Request object
        #   - raise IgnoreRequest: the process_exception() methods of the
        #     installed downloader middleware are then called
        return None

    def process_response(self, request, response, spider):
        # Runs on the downloader's response. Allowed outcomes:
        #   - a Response object (the same one is handed back here)
        #   - a Request object
        #   - raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Runs when a download handler, or another middleware's
        # process_request(), raised. Allowed outcomes:
        #   - None: keep processing this exception (what happens here)
        #   - a Response object: stops the process_exception() chain
        #   - a Request object: stops the process_exception() chain
        return None

    def spider_opened(self, spider):
        notice = 'Spider opened: %s' % spider.name
        spider.logger.info(notice)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes every scraped item into one JSON array file.

    The output path is read from the ``filename`` setting
    (``scrapy crawl <spider> -s filename=<path>``).
    """

    # Keys are written in this order; keys an item does not hold are left out.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged."""
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Field not populated for this item. Only the missing-key
                # case is ignored; any other error is a real problem and
                # must surface.
                continue

        line = json.dumps(OrderedDict(row))

        # Every record after the first one is preceded by a separator.
        if self.counter > 0:
            line = ",\n" + line
        self.file.write(line)
        self.counter += 1

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for elComentario project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project's bot.
BOT_NAME = 'elComentario'
# Package where the project's spiders live, and where newly generated spiders are placed.
SPIDER_MODULES = ['elComentario.spiders']
NEWSPIDER_MODULE = 'elComentario.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elComentario (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the setting below is commented out, so this file does not turn
# robots.txt handling on and Scrapy's own default applies -- confirm this is intended.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# A delay of 0.5 is configured here instead of the default of 0.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elComentario.middlewares.ElcomentarioSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elComentario.middlewares.ElcomentarioDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only JsonWriterPipeline is enabled; 300 is its order value (see the item
# pipeline documentation linked above for how the values are interpreted).
ITEM_PIPELINES = {
'elComentario.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
El Comentario, Colima
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elComentario/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elComentario.items import NoticiasItem
# Matches one markup tag: "<", at least one character that is not ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class QuotesSpider(scrapy.Spider):
    """Spider that collects the news El Comentario published on one date.

    The date is passed through the spider arguments ``year``, ``month`` and
    ``day`` (``-a year=2018 -a month=9 -a day=5``).
    """

    name = "noticias"

    def start_requests(self):
        """Request the archive page of the requested date.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        if year is None or month is None or day is None:
            # A missing argument used to surface as AttributeError on None.zfill().
            raise ValueError("The spider arguments 'year', 'month' and 'day' are required, "
                             "e.g. -a year=2018 -a month=9 -a day=5")

        # str() lets the arguments also be given as integers.
        baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(
            year, str(month).zfill(2), str(day).zfill(2))
        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link of a listing page, then the "Next page" link."""
        links = response.css('div.articles').xpath('./article/div[@class="cnt"]/h3/a/@href').extract()
        for link in links:
            # urljoin() leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)

        next_page = response.css('div.post-pagination').xpath('./a[@title="Next page"]/@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_item(self, response):
        """Build a NoticiasItem (date, title, topic, text, url) from an article page."""
        item = NoticiasItem()

        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        title = response.xpath('//header/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('a.theme').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        # One paragraph per line, without leading/trailing whitespace.
        paragraphs = response.css('div.pf-content').css('p').extract()
        text = "\n".join(remove_tags(p) for p in paragraphs).strip()

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elComentario.settings
[deploy]
#url = http://localhost:6800/
project = elComentario
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article.

    The fields are declared in the order in which JsonWriterPipeline writes
    them to the output file.
    """
    date = scrapy.Field()
    topic = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    location = scrapy.Field()
    text = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElsurSpiderMiddleware(object):
    """Pass-through spider middleware for the elSur project.

    Scrapy treats every hook as optional; the hooks defined here forward what
    they receive without modifying it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider by returning ``None``."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each Request, dict or Item produced by the spider, unchanged."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Return ``None``; no replacement output is supplied for the exception."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only, no items)."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class ElsurDownloaderMiddleware(object):
    """Pass-through downloader middleware for the elSur project.

    Scrapy treats every hook as optional; the hooks defined here hand back
    requests, responses and exceptions without modifying them.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Return ``None`` so the request keeps being processed normally.

        Other outcomes Scrapy accepts from this hook: a Response object, a
        Request object, or raising IgnoreRequest.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Return ``None`` so the exception keeps propagating through the chain."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into a single JSON array file.

    The output path comes from the ``filename`` Scrapy setting (given on the
    command line as ``-s filename=...``).  The file starts with ``[``, holds
    one JSON object per item separated by ``,\\n``, and ends with ``]``.
    """

    # Keys are written in this order; keys that are not set on an item are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and write the opening bracket of the array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket and close the file.

        The file is closed even when the final write fails.
        """
        try:
            self.file.write("]")
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Only the keys in ``FIELD_ORDER`` are written.  A key missing from the
        item raises ``KeyError`` on lookup and is simply left out; any other
        error is no longer silenced.
        """
        row = []
        for key in self.FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                continue
        line = OrderedDict(row)
        self.counter += 1
        # Every object after the first is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for elSur project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project's bot.
BOT_NAME = 'elSur'
# Package where the project's spiders live, and where newly generated spiders are placed.
SPIDER_MODULES = ['elSur.spiders']
NEWSPIDER_MODULE = 'elSur.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elSur (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the setting below is commented out, so this file does not turn
# robots.txt handling on and Scrapy's own default applies -- confirm this is intended.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# A delay of 0.5 is configured here instead of the default of 0.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elSur.middlewares.ElsurSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elSur.middlewares.ElsurDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only JsonWriterPipeline is enabled; 300 is its order value (see the item
# pipeline documentation linked above for how the values are interpreted).
ITEM_PIPELINES = {
'elSur.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
# Matches one markup tag: "<", at least one character that is not ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
# Credit header "Texto: ... / Foto: ..." (or "... y Foto: ...") up to the end of its line,
# optionally followed, on the next line, by a lead-in that ends with a date written as
# "<1-2 digits> de <letters> de <4 digits>.".  parse_item() deletes every match from the text.
HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
# Same as HEAD_RE_1 for the combined credit "Texto y foto: ...".
HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
class QuotesSpider(scrapy.Spider):
    """Spider that collects the news El Sur (suracapulco.mx) published on one date.

    The date is passed through the spider arguments ``year``, ``month`` and
    ``day`` (``-a year=2018 -a month=9 -a day=5``).
    """

    name = "noticias"

    def start_requests(self):
        """Request the archive page of the requested date.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        if year is None or month is None or day is None:
            # A missing argument used to surface as AttributeError on None.zfill().
            raise ValueError("The spider arguments 'year', 'month' and 'day' are required, "
                             "e.g. -a year=2018 -a month=9 -a day=5")

        # str() lets the arguments also be given as integers.
        baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(
            year, str(month).zfill(2), str(day).zfill(2))
        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link of a listing page, then the next listing page."""
        links = response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract()
        for link in links:
            # urljoin() leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)

        pag_lst = response.css('div.dslc-pagination > ul > li')
        next_page = None
        # The first two entries are skipped, as before.  Slicing (instead of two
        # unconditional `del pag_lst[0]`) also copes with a list of fewer than two entries.
        for li_obj in pag_lst[2:]:
            li = remove_tags(li_obj.extract())
            # The first entry whose label is not a page number supplies the next-page link.
            if not li.isdigit():
                next_page = li_obj.xpath('./a/@href').extract_first()
                break

        if next_page is not None:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_item(self, response):
        """Build a NoticiasItem (date, title, topic, text, url) from an article page."""
        item = NoticiasItem()

        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        title = response.css('div.dslc-tp-title > h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        # Every paragraph keeps its trailing newline: HEAD_RE_1 / HEAD_RE_2 need it to
        # find the end of the credit line.
        paragraphs = response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract()
        text = "".join(remove_tags(p) + "\n" for p in paragraphs)

        # Drop the dateline, normalise non-breaking spaces and strip the credit header.
        dateline = response.css('span.dateline').extract_first()
        if dateline is not None:
            dateline = remove_tags(dateline)
            text = text.replace(dateline, '')
        text = text.replace(u'\u00a0', ' ')
        text = HEAD_RE_1.sub('', text)
        text = HEAD_RE_2.sub('', text)

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elSur.settings
[deploy]
#url = http://localhost:6800/
project = elSur
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import scrapy, re
from diarioCoLatino.items import NoticiasItem
""" """
MEDIO: MEDIA:
Diario Co Latino, El Salvador Diario Co Latino, El Salvador
USO:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23 USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd diarioCoLatino/
$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
""" """
import scrapy, re
from diarioCoLatino.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
...@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I) ...@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n') EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, "year", None) year = getattr(self, "year", None)
month = getattr(self, "month", None) month = getattr(self, "month", None)
...@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=self.baseURL, callback=self.parse) yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response): def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
...@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider): ...@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
for link in response.xpath('//div[@class="post-listing"]/article/h2/a/@href').extract(): for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
"La fecha obtenida ya incluye formato y zona horaria" # La fecha obtenida ya incluye formato y zona horaria
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip() news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
item['topic'] = None news_topic = None
for p in response.xpath('//div[@class="entry"]/p').extract(): for p in response.xpath('//div[@class="entry"]/p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n"
...@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider): ...@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text text = "\n" + text
""" Obtiene autor """ """ Obtiene autor """
news_author = None
res = AUTH_RE.match(text) res = AUTH_RE.match(text)
if res: if res:
m = res.group(0) m = res.group(0)
item['author'] = m[m.find('Por')+len('Por'):].strip() news_author = m[m.find('Por')+len('Por'):].strip()
text = text[text.find(m) + len(m):].strip() text = text[text.find(m) + len(m):].strip()
text = "\n" + text text = "\n" + text
""" Elimina twitter """ """ Elimina twitter """
news_twitter = None
res = TW_RE.search(text) res = TW_RE.search(text)
if res: if res:
m = res.group(0) m = res.group(0)
item['twitter'] = m.strip() news_twitter = m.strip()
text = text[text.find(m) + len(m):].strip() text = text[text.find(m) + len(m):].strip()
text = "\n" + text text = "\n" + text
""" Obtiene lugar """ """ Obtiene lugar """
news_loc = None
res = LOC_RE.match(text) res = LOC_RE.match(text)
if res: if res:
m = res.group(0) m = res.group(0)
if m[m.find('/') + 1:].strip().lower() != 'dpa': if m[m.find('/') + 1:].strip().lower() != 'dpa':
item['location'] = m[:m.find('/')].strip() news_loc = m[:m.find('/')].strip()
text = text[text.find(m) + len(m):].strip() text = text[text.find(m) + len(m):].strip()
text = "\n" + text text = "\n" + text
else: else:
...@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider): ...@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text text = "\n" + text
""" Elimina correo """ """ Elimina correo """
news_email = None
res = EM_RE.search(text) res = EM_RE.search(text)
if res: if res:
m = res.group(0) m = res.group(0)
item['email'] = m.strip() news_email = m.strip()
# text = text[text.find(m) + len(m):].strip() # text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip() text = text.replace(m, '').strip()
text = "\n" + text text = "\n" + text
...@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider):
res = EM_RE.search(text) res = EM_RE.search(text)
if res: if res:
m = res.group(0) m = res.group(0)
item['email'] = m.strip() news_email = m.strip()
# text = text[text.find(m) + len(m):].strip() # text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip() text = text.replace(m, '').strip()
text = "\n" + text text = "\n" + text
...@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider): ...@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text text = "\n" + text
text = text.replace("\nCo Latino\n", '').strip() text = text.replace("\nCo Latino\n", '').strip()
item['text'] = text.strip()
## News item info ##
item['date'] = news_date
item['title'] = news_title
item['topic'] = news_topic
item['author'] = news_author
item['twitter'] = news_twitter
item['location'] = news_loc
item['email'] = news_email
item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
yield item yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = surDeCampeche.settings
[deploy]
#url = http://localhost:6800/
project = surDeCampeche
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the data scraped from one news article.

    The fields are declared in the order in which JsonWriterPipeline writes
    them to the output file.
    """
    date = scrapy.Field()
    topic = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    location = scrapy.Field()
    text = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class SurdecampecheSpiderMiddleware(object):
    """Pass-through spider middleware for the surDeCampeche project.

    Scrapy treats every hook as optional; the hooks defined here forward what
    they receive without modifying it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider by returning ``None``."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each Request, dict or Item produced by the spider, unchanged."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Return ``None``; no replacement output is supplied for the exception."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only, no items)."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class SurdecampecheDownloaderMiddleware(object):
    """Pass-through downloader middleware for the surDeCampeche project.

    Scrapy treats every hook as optional; the hooks defined here hand back
    requests, responses and exceptions without modifying them.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Return ``None`` so the request keeps being processed normally.

        Other outcomes Scrapy accepts from this hook: a Response object, a
        Request object, or raising IgnoreRequest.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Return ``None`` so the exception keeps propagating through the chain."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into a single JSON array file.

    The output path comes from the ``filename`` Scrapy setting (given on the
    command line as ``-s filename=...``).  The file starts with ``[``, holds
    one JSON object per item separated by ``,\\n``, and ends with ``]``.
    """

    # Keys are written in this order; keys that are not set on an item are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and write the opening bracket of the array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket and close the file.

        The file is closed even when the final write fails.
        """
        try:
            self.file.write("]")
        finally:
            self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Only the keys in ``FIELD_ORDER`` are written.  A key missing from the
        item raises ``KeyError`` on lookup and is simply left out; any other
        error is no longer silenced.
        """
        row = []
        for key in self.FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                continue
        line = OrderedDict(row)
        self.counter += 1
        # Every object after the first is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for surDeCampeche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project's bot.
BOT_NAME = 'surDeCampeche'
# Package where the project's spiders live, and where newly generated spiders are placed.
SPIDER_MODULES = ['surDeCampeche.spiders']
NEWSPIDER_MODULE = 'surDeCampeche.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'surDeCampeche (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the setting below is commented out, so this file does not turn
# robots.txt handling on and Scrapy's own default applies -- confirm this is intended.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# A delay of 0.5 is configured here instead of the default of 0.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'surDeCampeche.middlewares.SurdecampecheSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'surDeCampeche.middlewares.SurdecampecheDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only JsonWriterPipeline is enabled; 300 is its order value (see the item
# pipeline documentation linked above for how the values are interpreted).
ITEM_PIPELINES = {
'surDeCampeche.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from surDeCampeche.items import NoticiasItem
"""
MEDIO:
El Sur de Campeche, Campeche
USO:
scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
# Pattern for a single markup tag: "<", one or more non-">" characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class QuotesSpider(scrapy.Spider):
    """Crawl the news El Sur de Campeche published on one given day.

    The day is chosen with the spider arguments ``year``, ``month`` and
    ``day`` (``-a year=2018 -a month=8 -a day=10``).
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily archive URL and request its first page.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
        """
        date_parts = [getattr(self, arg, None) for arg in ("year", "month", "day")]
        if None in date_parts:
            # Without this check a missing argument only showed up as an
            # opaque "str + None" TypeError while building the URL.
            raise ValueError(
                "The spider arguments year, month and day are required, "
                "e.g. -a year=2018 -a month=8 -a day=10"
            )
        self.baseURL = "http://www.elsur.mx/" + "/".join(str(part) for part in date_parts)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Handle the first archive page and schedule the remaining pages."""
        # The first page is already downloaded: extract its links directly
        # instead of requesting the same URL a second time.
        for request in self.parse_page(response):
            yield request

        pagination = response.xpath('//a[@class="page-numbers"]/@href').extract()
        if pagination:
            # The last pagination link ends in ".../page/<N>/"; N is the page count.
            last_page_url = pagination[-1].rstrip("/")
            last_page = int(last_page_url[last_page_url.rfind("/") + 1:])
            for page in range(2, last_page + 1):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page),
                                     callback=self.parse_page)

    def parse_page(self, response):
        """Yield one request per article link found on an archive page."""
        links = (response.css('div.news_box_inner_content')
                 .css('div.news_box_item_content')
                 .xpath('./h3/a/@href')
                 .extract())
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract date, title, topic, text and URL from an article page.

        ``title`` and ``topic`` are set to None when their element is not
        found, so a page lacking one of them still yields an item.
        """
        item = NoticiasItem()

        # The date obtained already includes format and time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = self._strip_or_none(
            response.xpath('//div[@class="post_title_wrapper"]/h1').extract_first())
        item['topic'] = self._strip_or_none(
            response.css('span.blog_meta_category').css('a').extract_first())

        # One line of output text per <p> element of the article body.
        paragraphs = response.css('div.entry-content').css('p').extract()
        item['text'] = "".join(remove_tags(p) + "\n" for p in paragraphs)
        item['url'] = response.url

        yield item

    @staticmethod
    def _strip_or_none(fragment):
        """Return ``fragment`` without markup tags, or None if it is None."""
        if fragment is None:
            return None
        return remove_tags(fragment)
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tribunaCampeche.settings
[deploy]
#url = http://localhost:6800/
project = tribunaCampeche
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article.

    Each attribute below is a ``scrapy.Field``; fields are optional and a
    spider fills in only the ones it can extract.
    """

    # Article headline.
    title = scrapy.Field()
    # Article body text.
    text = scrapy.Field()
    # Publication date string.
    date = scrapy.Field()
    # NOTE(review): not populated by the spider visible in this project;
    # presumably the place the article reports from - confirm.
    location = scrapy.Field()
    # NOTE(review): not populated by the spider visible in this project;
    # presumably the article's byline - confirm.
    author = scrapy.Field()
    # Category / section the article was filed under.
    topic = scrapy.Field()
    # URL the article was downloaded from.
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunacampecheSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards what it receives unchanged; the only side effect is
    an info log line emitted when the spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through to the spider (always returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each element of ``result`` unchanged, in order."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Ignore the exception and return None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield each start request unchanged, in order."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class TribunacampecheDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Requests and responses are forwarded unchanged and exceptions are left
    alone; the only side effect is an info log line emitted when the
    spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Return None for every request, leaving it untouched."""
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` exactly as it was received."""
        return response

    def process_exception(self, request, exception, spider):
        """Ignore the exception and return None."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item into one JSON array file.

    The output path is read from the ``filename`` setting, i.e. the value
    passed on the command line as ``-s filename=<path>``.
    """

    # Keys copied from each item, in the order they appear in the output.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file the items will be written to."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` crawler setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and write the opening bracket of the array.

        Raises:
            ValueError: if no output file name was configured.
        """
        if not self.filename:
            # Previously this surfaced as an opaque TypeError from open(None).
            raise ValueError(
                "JsonWriterPipeline needs an output file: "
                "pass '-s filename=<path>' to scrapy crawl"
            )
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket of the array and close the file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged.

        Only the keys listed in ``_FIELD_ORDER`` are written, in that order;
        keys the item does not contain are skipped.
        """
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # The spider did not populate this field.
                continue

        # Serialize before touching the counter or the file, so an item that
        # cannot be serialized does not leave a dangling separator behind.
        serialized = json.dumps(OrderedDict(row))

        self.counter += 1
        if self.counter > 1:
            self.file.write(",\n")
        self.file.write(serialized)

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaCampeche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity and the package in which spiders are looked up / created.
BOT_NAME = 'tribunaCampeche'
SPIDER_MODULES = ['tribunaCampeche.spiders']
NEWSPIDER_MODULE = 'tribunaCampeche.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaCampeche (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): the line below is commented out, so this project does not set
# ROBOTSTXT_OBEY itself - confirm that this is intended.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# This project overrides the default of 0 with a delay of 0.5.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaCampeche.middlewares.TribunacampecheSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaCampeche.middlewares.TribunacampecheDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The JSON writer is the only pipeline enabled for this project; 300 is the
# order value assigned to it.
ITEM_PIPELINES = {
'tribunaCampeche.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from tribunaCampeche.items import NoticiasItem
"""
MEDIO:
Tribuna, Campeche
USO:
scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
# Pattern for a single markup tag: "<", one or more non-">" characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class QuotesSpider(scrapy.Spider):
    """Crawl the news Tribuna (Campeche) published on one given day.

    The day is chosen with the spider arguments ``year``, ``month`` and
    ``day`` (``-a year=2018 -a month=8 -a day=10``).
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily archive URL and request its first page.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
        """
        date_parts = [getattr(self, arg, None) for arg in ("year", "month", "day")]
        if None in date_parts:
            # Without this check a missing argument only showed up as an
            # opaque "str + None" TypeError while building the URL.
            raise ValueError(
                "The spider arguments year, month and day are required, "
                "e.g. -a year=2018 -a month=8 -a day=10"
            )
        self.baseURL = "http://tribunacampeche.com/" + "/".join(str(part) for part in date_parts)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Handle the first archive page and schedule the remaining pages."""
        # The first page is already downloaded: extract its links directly
        # instead of requesting the same URL a second time.
        for request in self.parse_page(response):
            yield request

        pagination = response.xpath('//a[@class="page-numbers"]/@href').extract()
        if pagination:
            # The last pagination link ends in ".../page/<N>/"; N is the page count.
            last_page_url = pagination[-1].rstrip("/")
            last_page = int(last_page_url[last_page_url.rfind("/") + 1:])
            for page in range(2, last_page + 1):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page),
                                     callback=self.parse_page)

    def parse_page(self, response):
        """Yield one request per article link found on an archive page."""
        links = (response.css('div.vw-post-box')
                 .css('div.vw-post-box-inner')
                 .xpath('./h3/a/@href')
                 .extract())
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract date, title, topic, text and URL from an article page.

        ``title`` and ``topic`` are set to None when their element is not
        found, so a page lacking one of them still yields an item.
        """
        item = NoticiasItem()

        # The date obtained already includes format and time zone.
        item['date'] = response.xpath('//time[@itemprop="datePublished"]/@datetime').extract_first()
        item['title'] = self._strip_or_none(
            response.xpath('//h1[@class="entry-title"]').extract_first())
        item['topic'] = self._strip_or_none(
            response.css('article.vw-main-post')
            .xpath('./div[@class="vw-post-categories"]/div/a')
            .extract_first())

        # One line of output text per <p> element of the article body.
        paragraphs = response.css('div.vw-post-content').css('p').extract()
        item['text'] = "".join(remove_tags(p) + "\n" for p in paragraphs)
        item['url'] = response.url

        yield item

    @staticmethod
    def _strip_or_none(fragment):
        """Return ``fragment`` without markup tags, or None if it is None."""
        if fragment is None:
            return None
        return remove_tags(fragment)
...@@ -3,12 +3,18 @@ ...@@ -3,12 +3,18 @@
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html # https://doc.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
class ProcesoItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,52 +5,100 @@ ...@@ -5,52 +5,100 @@
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals import os, tempfile, time, sys, logging, dryscrape
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
logger = logging.getLogger(__name__)
class ProcesoSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
def from_crawler(cls, crawler): def __init__(self, settings):
# This method is used by Scrapy to create your spiders. super(ThreatDefenceRedirectMiddleware, self).__init__(settings)
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider): # start xvfb to support headless scraping
# Called for each response that goes through the spider if 'linux' in sys.platform:
# middleware and into the spider. dryscrape.start_xvfb()
# Should return None or raise an exception. self.dryscrape_session = dryscrape.Session(base_url='https://hemeroteca.proceso.com.mx/')
return None for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
# seems to be a bug with how webkit-server handles accept-encoding
if key.lower() != 'accept-encoding':
self.dryscrape_session.set_header(key, value)
def process_spider_output(response, result, spider): def _redirect(self, redirected, request, spider, reason):
# Called with the results returned from the Spider, after # act normally if this isn't a threat defense redirect
# it has processed the response. if not self.is_threat_defense_url(redirected.url):
return super(ThreatDefenceRedirectMiddleware, self)._redirect(redirected, request, spider, reason)
# Must return an iterable of Request, dict or Item objects. logger.debug('Proceso threat defense triggered for {0}'.format(request.url))
for i in result: request.cookies = self.bypass_threat_defense(redirected.url)
yield i request.dont_filter = True # prevents the original link being marked a dupe
return request
def process_spider_exception(response, exception, spider): def is_threat_defense_url(self, url):
# Called when a spider or process_spider_input() method return 'proceso.com.mx' in url
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict def bypass_threat_defense(self, url=None):
# or Item objects. # only navigate if any explicit url is provided
pass if url:
self.dryscrape_session.visit(url)
def process_start_requests(start_requests, spider): # solve the captcha if there is one
# Called with the start requests of the spider, and works # captcha_images = self.dryscrape_session.css('img[src *= captcha]')
# similarly to the process_spider_output() method, except # if len(captcha_images) > 0:
# that it doesn’t have a response associated. # return self.solve_captcha(captcha_images[0])
# Must return only requests (not items). # click on any explicit retry links
for r in start_requests: # retry_links = self.dryscrape_session.css('a[href *= threat_defence]')
yield r # if len(retry_links) > 0:
# return self.bypass_threat_defense(retry_links[0].get_attr('href'))
def spider_opened(self, spider): # otherwise, we're on a redirect page so wait for the redirect and try again
spider.logger.info('Spider opened: %s' % spider.name) self.wait_for_redirect()
return self.bypass_threat_defense()
def wait_for_redirect(self, url = None, wait = 0.1, timeout=10):
url = url or self.dryscrape_session.url()
for i in range(int(timeout//wait)):
time.sleep(wait)
if self.dryscrape_session.url() != url:
return self.dryscrape_session.url()
logger.error("Maybe {0} isn't a redirect URL?".format(self.dryscrape_session.url()))
raise Exception('Timed out on the zipru redirect page.')
# def solve_captcha(self, img, width=1280, height=800):
# # take a screenshot of the page
# self.dryscrape_session.set_viewport_size(width, height)
# filename = tempfile.mktemp('.png')
# self.dryscrape_session.render(filename, width, height)
# # inject javascript to find the bounds of the captcha
# js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
# rect = self.dryscrape_session.eval_script(js)
# box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))
# # solve the captcha in the screenshot
# image = Image.open(filename)
# os.unlink(filename)
# captcha_image = image.crop(box)
# captcha = pytesseract.image_to_string(captcha_image)
# logger.debug(f'Solved the Zipru captcha: "{captcha}"')
# # submit the captcha
# input = self.dryscrape_session.xpath('//input[@id = "solve_string"]')[0]
# input.set(captcha)
# button = self.dryscrape_session.xpath('//button[@id = "button_submit"]')[0]
# url = self.dryscrape_session.url()
# button.click()
# # try again if it we redirect to a threat defense URL
# if self.is_threat_defense_url(self.wait_for_redirect(url)):
# return self.bypass_threat_defense()
# # otherwise return the cookies as a dict
# cookies = {}
# for cookie_string in self.dryscrape_session.cookies():
# if 'domain=zipru.to' in cookie_string:
# key, value = cookie_string.split(';')[0].split('=')
# cookies[key] = value
# return cookies
...@@ -3,9 +3,73 @@ ...@@ -3,9 +3,73 @@
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class ProcesoPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -16,10 +16,10 @@ NEWSPIDER_MODULE = 'proceso.spiders' ...@@ -16,10 +16,10 @@ NEWSPIDER_MODULE = 'proceso.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'proceso (+http://www.yourdomain.com)' USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,22 +27,27 @@ ROBOTSTXT_OBEY = True ...@@ -27,22 +27,27 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
# Override the default request headers: # Override the default request headers:
#DEFAULT_REQUEST_HEADERS = { # DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'User-Agent': USER_AGENT,
# 'Connection': 'Keep-Alive',
# # 'Accept-Encoding': 'gzip, deflate',
# 'Accept-Language': 'en', # 'Accept-Language': 'en',
#} # }
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
...@@ -54,10 +59,11 @@ SPIDER_MIDDLEWARES = { ...@@ -54,10 +59,11 @@ SPIDER_MIDDLEWARES = {
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = { DOWNLOADER_MIDDLEWARES = {
# 'proceso.middlewares.MyCustomDownloaderMiddleware': 543, # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
'scrapy_splash.SplashCookiesMiddleware': 723, 'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725, 'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
} }
# Enable or disable extensions # Enable or disable extensions
...@@ -68,9 +74,9 @@ DOWNLOADER_MIDDLEWARES = { ...@@ -68,9 +74,9 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'proceso.pipelines.ProcesoPipeline': 300, 'proceso.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
# Scrapy settings for proceso project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity and the package in which spiders are looked up / created.
BOT_NAME = 'proceso'
SPIDER_MODULES = ['proceso.spiders']
NEWSPIDER_MODULE = 'proceso.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
# USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
# Obey robots.txt rules
# NOTE(review): the line below is commented out, so this project does not set
# ROBOTSTXT_OBEY itself - confirm that this is intended.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# This project overrides the default of 0 with a delay of 1.
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Only the scrapy-splash spider middleware is active; the project's own
# spider middleware entry is left commented out.
SPIDER_MIDDLEWARES = {
# 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# The two scrapy-splash downloader middlewares are enabled together with
# Scrapy's HttpCompressionMiddleware, which is given the order value 810.
DOWNLOADER_MIDDLEWARES = {
# 'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# The JSON writer is the only pipeline enabled for this project; 300 is the
# order value assigned to it.
ITEM_PIPELINES = {
'proceso.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# scrapy-splash integration: cache storage and duplicate filter classes taken
# from the scrapy_splash package, plus the address of the Splash service
# (here port 8050 on localhost).
# NOTE(review): HTTPCACHE_ENABLED is still commented out in this file, so the
# cache storage class presumably only matters once caching is turned on.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
import scrapy, re # -*- coding: utf-8 -*-
from datetime import datetime, date, timedelta, tzinfo
from scrapy_splash import SplashRequest
""" """
Para este sitio se hace uso de 'scrapy-splash' porque el contenido es cargado a traves de javascript MEDIA:
Proceso, CDMX
USAGE:
## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
## Read especs_sitio_proceso.txt file. ##
$ cd proceso/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. It's necessary to use the parse_date_files.py file
for the news contained in noticias.json being splitted into files by date. ##
USO: $ scrapy crawl noticias --nolog -s filename=noticias.json
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 ------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific year-month date. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
""" """
import scrapy, re, cfscrape
from proceso.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy.http.cookies import CookieJar
from scrapy_splash import SplashRequest, SplashFormRequest
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt): def utcoffset(self, dt):
# zona horaria para el centro de mexico: utc-6 ## Time zone for CDMX: UTC-6 ##
return timedelta(hours=-6) return timedelta(hours=-6)
def tzname(self, dt): def tzname(self, dt):
# nombre de la zona horaria ## Time zone name ##
return 'UTC-6' return 'UTC-6'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
class NoticiasItem(scrapy.Item): script = """
title = scrapy.Field() function main(splash)
text = scrapy.Field() splash:init_cookies(splash.args.cookies)
date = scrapy.Field() assert(splash:go{
location = scrapy.Field() splash.args.url,
author = scrapy.Field() headers=splash.args.headers,
topic = scrapy.Field() http_method=splash.args.http_method,
url = scrapy.Field() body=splash.args.body,
})
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
"""
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC() self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
# day = getattr(self, 'day', None)
parse_month = {'1': 'enero', '2': 'febrero', '3': 'marzo', '4': 'abril',
'5': 'mayo', '6': 'junio', '7': 'julio', '8': 'agosto',
'9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'}
self.date = parse_month[month]+' de '+year self.month_parser = {
'enero' : '01', 'febrero' : '02', 'marzo' : '03', 'abril' : '04',
'mayo' : '05', 'junio' : '06', 'julio' : '07', 'agosto' : '08',
'septiembre' : '09', 'octubre' : '10', 'noviembre' : '11', 'diciembre' : '12'
}
self.baseURL = "https://hemeroteca.proceso.com.mx/"
login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
if year is not None and month is not None:
self.stop_date = date(int(year), int(month), 15)
# yield scrapy.Request(url=login_url, callback=self.parse_with_stop_date)
else:
self.stop_date = None
# yield scrapy.Request(url=login_url, callback=self.parse)
token, agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
print token
print "\n"
yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
headers={'User-Agent' : agent})
def parse_login(self, response):
return SplashFormRequest.from_response(
response,
formdata = {
'log' : 'carlos_silvaforne@yahoo.com.mx',
'pwd' : 'carlos_silvaforne@'
},
callback = self.after_login,
dont_click = True
)
def after_login(self, response):
## Check login succeed before going on ##
print response.headers
print "\n"
print response.real_url
print "\n"
print response.request.headers
print "\n"
cookie_list = response.request.headers.getlist('Cookie')
cfc, cfd = cookie_list[0].split(';')
cfc = cfc.strip().split('=')
cfd = cfd.strip().split('=')
cookies = [cfc[1], cfd[1]]
cookies = {cfc[0]: cfc[1], cfd[0]: cfd[1]}
session_legend = response.css('div.topnav > a').extract()[-1]
print response.css('h1.entry-title').extract_first()
print "\n"
if session_legend is not None:
session_legend = remove_tags(session_legend)
self.baseURL='http://hemeroteca.proceso.com.mx/?page_id=111058&edicion=mexico&page=' if not "Cerrar" in session_legend:
print "Login failed."
yield scrapy.Request(url=self.baseURL+self.year, callback=self.parse) else:
print session_legend
print "\n"
token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
print token
print "\n"
if self.stop_date is None:
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response): else:
for post in response.css('div.catpor-box'): self.meta = response.request.meta
post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first() yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
post_date = post_date[post_date.find('d')+3:] meta=self.meta,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
# request = SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
# endpoint='execute',
# cache_args=['lua_source'],
# args={'lua_source': script},
# headers={'User-Agent': USER_AGENT}
# )
# request.meta['splash']['session_id'] = cookie_list[0]
# yield request
if post_date == self.date:
link = post.xpath('./div/div/a/@href').extract_first()
yield scrapy.Request(url=link, callback=self.parse_2)
def parse_2(self, response): # if "authentication failed" in response.body:
for link in response.xpath('//*[@class="post-container clearfix"]/h2/a/@href').extract(): # self.logger.error("Login failed.")
# yield scrapy.Request(url=link, callback=self.parse_item) # return
yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 })
# else:
# # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# if self.stop_date is None:
# yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
# else:
# yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
# yield scrapy.Request(
# url=self.baseURL,
# callback=self.parse_with_stop_date,
# cookies=token,
# headers={'User-Agent' : agent}
# )
def parse_with_stop_date(self, response):
print "parse_with_stop_date"
print "\n"
print response.css('h1.entry-title').extract_first()
# print "\n"
# print response.cookiejar
print "\n"
print response.headers
print "\n"
# session_legend = response.css('div.topnav > a').extract()[-1]
# if session_legend is not None :
# print remove_tags(session_legend)
# print "\n"
# else :
# print "No log in."
TO_NEXT_PAGE = True
for item in response.css('div.catpor-box > div'):
item_date = item.css('span.catpor-published').extract_first()
if item_date is not None:
item_date = remove_tags(item_date).replace(",", '')
item_date = item_date.split(' ')
item_date[1] = self.month_parser[item_date[1]]
item_date = map(int, item_date)
item_date = date(item_date[2], item_date[1], item_date[0])
if item_date >= self.stop_date:
item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
print item_link
# token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=cookies)
# yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=token,
# headers={'User-Agent' : agent})
yield SplashRequest(url=item_link, callback=self.parse_links,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
else:
TO_NEXT_PAGE = False
break
if TO_NEXT_PAGE:
next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
if next_page is not None:
# yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
def parse_links(self, response):
print "\n\n"
print response.headers
for link in response.css('div.post-container > h2 > a::attr(href)').extract():
# print link
# token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
# headers={'User-Agent' : agent})
yield SplashRequest(url=link, callback=self.parse_item,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
def parse_item(self, response): def parse_item(self, response):
# if response.url == "https://hemeroteca.proceso.com.mx/?page_id=278958&a51dc26366d99bb5fa29cea4747565fec=420203":
# print response.body
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
d = response.xpath('//div[@id="primary"]/div/div/div/div/span[@class="published"]/text()').extract_first() news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
d, t = d.split(' ') if news_date is not None:
d = map(int, d.split('-')) news_date = remove_tags(news_date)
t = map(int, t.split(':')) print news_date
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T') d, t = news_date.split(' ')
item['date'] = d d = map(int, d.split("-"))
t = map(int, t.split(":"))
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
item['title'] = response.xpath('//div[@id="primary"]/div/h1/text()').extract_first() title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
item['topic'] = response.xpath('//span[@class="entry-categories"]/text()').extract_first() if title is not None : title = remove_tags(title)
for paragraph in response.xpath('//div[@id="primary"]/div/div/div/div[@class="entry-content"]/div/p').extract(): topic = response.css('span.entry-categories').extract_first()
if topic is not None : topic = remove_tags(topic)
for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
# print item['title']
yield item yield item
# -*- coding: utf-8 -*-
"""
MEDIA:
Proceso, CDMX
USAGE:
## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
## Read especs_sitio_proceso.txt file. ##
$ cd proceso/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. Afterwards, run the parse_date_files.py script
to split the news contained in noticias.json into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific year-month date. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
"""
import scrapy, re, time, cfscrape
from proceso.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy_splash import SplashRequest, SplashFormRequest
# Matches one markup tag: '<', one or more characters other than '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by TAG_RE deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class UTC(tzinfo):
    """
    Fixed-offset time zone (six hours behind UTC) attached to the
    publication dates built by the spider.
    """

    def utcoffset(self, dt):
        ## Time zone for CDMX: UTC-6 ##
        return timedelta(hours=-6)

    def dst(self, dt):
        ## Fixed offset: no daylight-saving correction is applied. Without ##
        ## this method, datetime.timetuple() raises NotImplementedError.   ##
        return timedelta(0)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'
class QuotesSpider(scrapy.Spider):
    """
    Spider for the Proceso archive (hemeroteca.proceso.com.mx).

    Logs in through Splash, walks the issue listing down to a stop date and
    yields one NoticiasItem per article page.
    """
    # NOTE(review): the spider name is left commented out, exactly as found;
    # presumably this copy is intentionally disabled - confirm before enabling.
    # name = "noticias"

    def start_requests(self):
        """
        Read the optional ``year``/``month`` spider arguments, store the crawl
        state on the spider and request the login page through Splash.
        """
        self.tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)

        # Spanish month name -> two-digit month number, used on listing dates.
        self.month_parser = {
            'enero' : '01', 'febrero' : '02', 'marzo' : '03', 'abril' : '04',
            'mayo' : '05', 'junio' : '06', 'julio' : '07', 'agosto' : '08',
            'septiembre' : '09', 'octubre' : '10', 'noviembre' : '11', 'diciembre' : '12'
        }

        self.baseURL = "https://hemeroteca.proceso.com.mx/"
        login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"

        if year is not None and month is not None:
            # The day of the month (22) is hard-coded.
            self.stop_date = date(int(year), int(month), 22)
        else:
            self.stop_date = None

        yield SplashRequest(url=login_url, callback=self.parse_login,
                            endpoint='render.html', args={ 'wait': 0.5 })

    def parse_login(self, response):
        """Fill in the login form found in ``response`` and submit it."""
        print("parse_login")
        # NOTE(review): credentials are hard-coded in source control; they
        # should be supplied through spider arguments or settings instead.
        return SplashFormRequest.from_response(
            response,
            formdata = {
                'log' : 'carlos_silvaforne@yahoo.com.mx',
                'pwd' : 'carlos_silvaforne@'
            },
            callback = self.after_login,
            dont_click = True
        )

    def after_login(self, response):
        """
        Check the login response and, when a stop date was given, request the
        archive front page with the cookies and User-Agent from cfscrape.
        """
        print("after_login")
        if "authentication failed" in response.body:
            self.logger.error("Login failed.")
            return
        print("passed")

        if self.stop_date is None:
            # The crawl without a stop date was never implemented; say so
            # instead of finishing silently with no items.
            self.logger.warning(
                "No year/month arguments given: nothing to crawl.")
            return

        # The request below needs these two values; the assignment had been
        # commented out, which made the method fail with a NameError.
        token, agent = cfscrape.get_tokens(self.baseURL)
        yield scrapy.Request(
            url=self.baseURL,
            callback=self.parse_with_stop_date,
            cookies=token,
            headers={'User-Agent' : agent}
        )

    def parse_with_stop_date(self, response):
        """
        Follow every listing entry dated on or after ``self.stop_date``; stop
        at the first older entry, otherwise continue with the next page.
        """
        TO_NEXT_PAGE = True

        for entry in response.css('div.catpor-box > div'):
            raw_date = entry.css('span.catpor-published').extract_first()
            if raw_date is None:
                continue

            # Expected text is "<day> <month name>, <year>".
            parts = remove_tags(raw_date).replace(",", '').split(' ')
            parts[1] = self.month_parser[parts[1]]
            # A list comprehension keeps the result indexable (map() is lazy
            # on Python 3).
            numbers = [int(part) for part in parts]
            entry_date = date(numbers[2], numbers[1], numbers[0])

            if entry_date >= self.stop_date:
                item_link = entry.css('span.catpor-title > a::attr(href)').extract_first()
                yield scrapy.Request(url=item_link, callback=self.parse_links)
            else:
                TO_NEXT_PAGE = False
                break

        if TO_NEXT_PAGE:
            next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
            if next_page is not None:
                yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)

    def parse_links(self, response):
        """Request, through Splash, every article linked from an issue page."""
        for link in response.css('div.post-container > h2 > a::attr(href)').extract():
            yield SplashRequest(url=link, callback=self.parse_item,
                                endpoint='render.html', args={ 'wait': 0.5 })

    def parse_item(self, response):
        """Build a NoticiasItem (date, title, topic, text, url) from an article page."""
        item = NoticiasItem()
        text = ''

        news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
        if news_date is not None:
            news_date = remove_tags(news_date)
            print(news_date)
            # Expected text is "YYYY-MM-DD HH:MM:SS".
            d, t = news_date.split(' ')
            d = [int(part) for part in d.split("-")]
            t = [int(part) for part in t.split(":")]
            news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')

        title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('span.entry-categories').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
            text += remove_tags(paragraph) + '\n'

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article; every attribute is a plain scrapy.Field()."""

    # Article content.
    title = scrapy.Field()
    text = scrapy.Field()

    # Publication metadata.
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()

    # Address of the page the item was taken from.
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ProcesoPruebaSpiderMiddleware(object):
    """
    Pass-through spider middleware: every hook hands its input on unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response going into the spider (always returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each element of ``result`` unchanged, in order."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Do not handle the exception (always returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield each start request unchanged, in order."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class ProcesoPruebaDownloaderMiddleware(object):
    """
    Pass-through downloader middleware: requests, responses and exceptions
    are handed on unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Let the request continue through the chain (always returns None)."""
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response untouched."""
        return response

    def process_exception(self, request, exception, spider):
        """Do not handle the exception (always returns None)."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes every item into one file as a JSON array.

    The file name comes from the ``filename`` setting
    (``scrapy crawl ... -s filename=out.json``).
    """

    # Keys are written in this order; fields missing from an item are skipped.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """
        Append ``item`` to the output file as one JSON object and return it.

        Only a missing field (KeyError) is tolerated; any other error raised
        while reading the item now propagates instead of being swallowed.
        """
        row = []
        for field in self._FIELD_ORDER:
            try:
                row.append((field, item[field]))
            except KeyError:
                # The spider never populated this field: leave it out.
                continue

        line = OrderedDict(row)

        self.counter += 1
        # Every object after the first is preceded by a separator.
        separator = "" if self.counter == 1 else ",\n"
        self.file.write(separator + json.dumps(line))

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for proceso_prueba project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity and the package Scrapy searches for spiders.
BOT_NAME = 'proceso_prueba'
SPIDER_MODULES = ['proceso_prueba.spiders']
NEWSPIDER_MODULE = 'proceso_prueba.spiders'
# User-Agent header sent with requests. NOTE(review): this is a desktop Chrome
# string, not one that identifies the crawler.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
# Obey robots.txt rules (left commented out, so this file does not enable it)
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Cookies are explicitly kept enabled, and both cookie debug flags are turned on.
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Only the scrapy-splash middleware is active; the project's own spider
# middleware is commented out.
SPIDER_MIDDLEWARES = {
# 'proceso_prueba.middlewares.ProcesoPruebaSpiderMiddleware': 543,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# scrapy-splash cookie and request middlewares, plus HTTP compression.
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Items are written out by the project's JSON pipeline.
ITEM_PIPELINES = {
'proceso_prueba.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# scrapy-splash integration: cache storage class, address of the Splash
# server (a local instance on port 8050) and duplicate-filter class.
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Proceso, CDMX
USAGE:
## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
## Read especs_sitio_proceso.txt file. ##
$ cd proceso_prueba/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. Afterwards, run the parse_date_files.py script
to split the news contained in noticias.json into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific year-month date. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
"""
import scrapy, re, cfscrape
from proceso_prueba.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy.http.cookies import CookieJar
from scrapy_splash import SplashRequest, SplashFormRequest
# Matches one markup tag: '<', one or more characters other than '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by TAG_RE deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class UTC(tzinfo):
    """
    Fixed-offset time zone (six hours behind UTC) attached to the
    publication dates built by the spider.
    """

    def utcoffset(self, dt):
        ## Time zone for CDMX: UTC-6 ##
        return timedelta(hours=-6)

    def dst(self, dt):
        ## Fixed offset: no daylight-saving correction is applied. Without ##
        ## this method, datetime.timetuple() raises NotImplementedError.   ##
        return timedelta(0)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'
# Browser User-Agent string; the spider passes it to cfscrape.get_tokens()
# and sends it as the User-Agent header of its Splash requests.
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
# Lua script passed as 'lua_source' to Splash's 'execute' endpoint. It loads
# the cookies received in splash.args.cookies, opens the URL with the given
# headers, HTTP method and body, waits 0.5 s, and returns the final URL, the
# headers and status of the last response in the history, the cookies and the
# page HTML.
script = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
"""
class QuotesSpider(scrapy.Spider):
    """
    Experimental spider for the Proceso archive (hemeroteca.proceso.com.mx).

    Logs in through Splash and then requests one fixed page with the Lua
    ``script``, printing session diagnostics along the way. The ``*_org``
    methods and ``parse_with_stop_date`` hold the full crawl logic but are
    not reached from ``start_requests``.
    """
    name = "noticias"

    def start_requests(self):
        """
        Read the optional ``year``/``month`` spider arguments, obtain cookies
        and User-Agent from cfscrape, and request the login page via Splash.
        """
        self.tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)

        # Spanish month name -> two-digit month number, used on listing dates.
        self.month_parser = {
            'enero' : '01', 'febrero' : '02', 'marzo' : '03', 'abril' : '04',
            'mayo' : '05', 'junio' : '06', 'julio' : '07', 'agosto' : '08',
            'septiembre' : '09', 'octubre' : '10', 'noviembre' : '11', 'diciembre' : '12'
        }

        self.baseURL = "https://hemeroteca.proceso.com.mx/"
        login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"

        if year is not None and month is not None:
            # The day of the month (15) is hard-coded.
            self.stop_date = date(int(year), int(month), 15)
        else:
            self.stop_date = None

        self.token, self.agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
        print("token")
        print(self.token)

        yield SplashRequest(url=login_url, callback=self.parse_login,
                            endpoint='render.html', args={ 'wait': 0.5 },
                            cookies=self.token,
                            headers={'User-Agent' : self.agent})

    def parse_login(self, response):
        """Fill in the login form found in ``response`` and submit it by POST."""
        # NOTE(review): credentials are hard-coded in source control; they
        # should be supplied through spider arguments or settings instead.
        return SplashFormRequest.from_response(
            response,
            method="POST",
            formdata = {
                'log' : 'carlos_silvaforne@yahoo.com.mx',
                'pwd' : 'carlos_silvaforne@'
            },
            callback = self.after_login,
            dont_click = True
        )

    def _session_legend(self, response):
        """Return the tag-stripped text of the last 'div.topnav > a' link, or None."""
        anchors = response.css('div.topnav > a').extract()
        if not anchors:
            # Indexing [-1] on the empty list used to raise IndexError here.
            return None
        return remove_tags(anchors[-1])

    def after_login(self, response):
        """Print session diagnostics, then request a fixed page with the Lua script."""
        # The label says "headers" but the request meta is what gets printed.
        print("\nresponse.request.headers")
        print(response.request.meta)
        print("\nresponse.headers")
        print(response.headers)

        session_legend = self._session_legend(response)
        if session_legend is not None:
            print("\nsession_legend")
            print(session_legend)
        else:
            print("No log in.")

        item_link = "https://hemeroteca.proceso.com.mx/?page_id=420325"
        yield SplashRequest(url=item_link, callback=self.parse_links,
                            meta=response.request.meta,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script},
                            headers={'User-Agent': self.agent}
                            )

    def parse_links(self, response):
        """Print the body of ``response`` and the headers of its request."""
        print("\nresponse.body")
        print(response.body)
        print("\nresponse.request.headers")
        print(response.request.headers)
        print("\n")

    def after_login_org(self, response):
        """
        Earlier login check: rebuild the cookies from the request's Cookie
        header and, if the session legend contains "Cerrar", start the crawl.
        """
        ## Check login succeed before going on ##
        print(response.headers)
        print("\n")
        print(response.real_url)
        print("\n")
        print(response.request.headers)
        print("\n")

        # Turn the first Cookie header ("k1=v1; k2=v2; ...") into a dict.
        # Any number of cookies is accepted, not only exactly two.
        cookies = {}
        for header in response.request.headers.getlist('Cookie')[:1]:
            for pair in header.split(';'):
                key, _, value = pair.strip().partition('=')
                if key:
                    cookies[key] = value

        session_legend = self._session_legend(response)
        print(response.css('h1.entry-title').extract_first())
        print("\n")

        # NOTE(review): the block nesting was reconstructed so that requests
        # are only issued after a successful login - confirm.
        if session_legend is None:
            return
        if "Cerrar" not in session_legend:
            print("Login failed.")
            return

        print(session_legend)
        print("\n")
        token = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)[0]
        print(token)
        print("\n")

        if self.stop_date is None:
            yield scrapy.Request(url=self.baseURL, callback=self.parse)
        else:
            yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
                                cookies=cookies,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'User-Agent': USER_AGENT}
                                )

    def parse_with_stop_date(self, response):
        """
        Print diagnostics and the link of every listing entry dated on or
        after ``self.stop_date``; stop at the first older entry, otherwise
        request the next listing page.
        """
        print("parse_with_stop_date")
        print("\n")
        print(response.css('h1.entry-title').extract_first())
        print("\n")
        print(response.cookiejar)
        print("\n")
        print(response.headers)
        print("\n")

        session_legend = self._session_legend(response)
        if session_legend is not None:
            print(session_legend)
            print("\n")
        else:
            print("No log in.")

        TO_NEXT_PAGE = True

        for entry in response.css('div.catpor-box > div'):
            raw_date = entry.css('span.catpor-published').extract_first()
            if raw_date is None:
                continue

            # Expected text is "<day> <month name>, <year>".
            parts = remove_tags(raw_date).replace(",", '').split(' ')
            parts[1] = self.month_parser[parts[1]]
            # A list comprehension keeps the result indexable (map() is lazy
            # on Python 3).
            numbers = [int(part) for part in parts]
            entry_date = date(numbers[2], numbers[1], numbers[0])

            if entry_date >= self.stop_date:
                item_link = entry.css('span.catpor-title > a::attr(href)').extract_first()
                # Only printed: the request to parse_links is disabled here.
                print(item_link)
            else:
                TO_NEXT_PAGE = False
                break

        if TO_NEXT_PAGE:
            next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
            if next_page is not None:
                yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
                                    endpoint='execute',
                                    cache_args=['lua_source'],
                                    args={'lua_source': script},
                                    headers={'User-Agent': USER_AGENT}
                                    )

    def parse_links_org(self, response):
        """Request, with the Lua script, every article linked from an issue page."""
        print("\n\n")
        print(response.headers)

        for link in response.css('div.post-container > h2 > a::attr(href)').extract():
            yield SplashRequest(url=link, callback=self.parse_item,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'User-Agent': USER_AGENT}
                                )

    def parse_item_org(self, response):
        """Build a NoticiasItem (date, title, topic, text, url) from an article page."""
        item = NoticiasItem()
        text = ''

        news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
        if news_date is not None:
            news_date = remove_tags(news_date)
            print(news_date)
            # Expected text is "YYYY-MM-DD HH:MM:SS".
            d, t = news_date.split(' ')
            d = [int(part) for part in d.split("-")]
            t = [int(part) for part in t.split(":")]
            news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')

        title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('span.entry-categories').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
            text += remove_tags(paragraph) + '\n'

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = proceso_prueba.settings
[deploy]
#url = http://localhost:6800/
project = proceso_prueba
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""
MEDIA:
El Financiero, CDMX
USAGE:
## Get the news from RSS. ##
---------------------------------------------------------------------------------------------
$ cd elFinanciero/
$ scrapy crawl noticias --nolog -s filename=2017-12-20.json
"""
import scrapy, re, json import scrapy, re, json
from elFinanciero.items import NoticiasItem from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
"""
MEDIO:
El Financiero, CDMX
USO:
scrapy crawl noticias --nolog -s filename=2017-12-20.json
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
...@@ -16,65 +22,73 @@ def remove_tags(text): ...@@ -16,65 +22,73 @@ def remove_tags(text):
class UTC(tzinfo):
    """
    Class for Time Zone.

    Fixed-offset ``tzinfo`` that always reports six hours behind UTC.
    Despite the class name, it does not represent UTC itself.
    """

    def utcoffset(self, dt):
        ## Time zone for CDMX: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        ## No daylight-saving adjustment: the offset is constant. ##
        # Implemented because the base ``tzinfo.dst`` raises
        # NotImplementedError, which breaks ``datetime.timetuple()`` and
        # ``datetime.astimezone()`` on datetimes that use this zone.
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class.

    Requests the El Financiero RSS page, follows every link found in it
    (except the first one) and yields one ``NoticiasItem`` per article.
    """

    name = "noticias"

    def start_requests(self):
        """Set up the time zone and base URL, then request the RSS page."""
        self.tz = UTC()
        self.baseURL = "http://www.elfinanciero.com.mx/rss"

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule a request for each ``<link>`` text found in the response."""
        # The first link is skipped -- presumably it is the feed's own
        # address rather than an article; confirm against the live feed.
        for link in response.xpath('//link/text()').extract()[1:]:
            yield scrapy.Request(url=link, callback=self.parse_item)

    def _extract_date(self, response):
        """
        Return the publication date as an ISO-8601 string, or None.

        The date is read from the ``datePublished`` key of the JSON held in
        the ``<script data-schema="NewsArticle">`` element and is expected to
        look like ``YYYY-MM-DD HH:MM:SS``. None is returned when the script
        is absent or its content cannot be parsed, so that one malformed
        page does not abort the whole item.
        """
        raw = response.xpath('//script[@data-schema="NewsArticle"]').extract_first()
        if raw is None:
            return None

        try:
            published = json.loads(remove_tags(raw))['datePublished']
            day, clock = published.split()
            # List comprehensions instead of indexing into map(): map()
            # returns a non-subscriptable iterator on Python 3.
            year, month, dom = [int(part) for part in day.split("-")]
            hour, minute, second = [int(part) for part in clock.split(":")]
            stamp = datetime(year, month, dom, hour, minute, second, tzinfo=self.tz)
        except (KeyError, ValueError):
            # Covers invalid JSON, a missing key and an unexpected date layout.
            self.logger.warning("Could not read publication date from %s", response.url)
            return None

        return stamp.isoformat("T")

    def parse_item(self, response):
        """Build and yield the news item for a single article page."""
        item = NoticiasItem()

        news_date = self._extract_date(response)

        # Every extract_first() below may return None when the page lacks the
        # element; tags are only stripped when something was found.
        title = response.css('div.column').css('div.column').css('h1').extract_first()
        if title is not None:
            title = remove_tags(title).strip()

        topic = response.xpath('//div[@class="section-line"]').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        author = response.xpath('//div[@class="note-author"]/a').extract_first()
        if author is not None:
            author = remove_tags(author)

        paragraphs = response.css('div.content').css('p').extract()
        text = '\n'.join(remove_tags(p) for p in paragraphs)

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['author'] = author
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment