#!/usr/bin/python3 #!/usr/bin/python3
# -*- coding: utf-8 -*- #Author: Mario Chirinos Colunga
import datetime
import json
import os
import subprocess
import sys
from pathlib import Path
#from myModule import myModule
#=============================================================================== #===============================================================================
def findLastDate(directory):
    """Return the date of the most recent scraped JSON file under ``directory``.

    ``directory`` is scanned for sub-directories; they are visited from the
    lexicographically last name to the first, and the first one that holds
    at least one ``*.json`` file decides the result.  An empty newest
    sub-directory therefore does not hide older data.

    Args:
        directory: Root folder of one crawler's output (``str`` or ``Path``,
            with or without a trailing slash).

    Returns:
        ``datetime.datetime`` parsed from the greatest ``YYYY-MM-DD.json``
        file name, or ``None`` when no JSON file exists in any sub-directory.

    Raises:
        ValueError: if the greatest file name is not a ``YYYY-MM-DD`` date.
    """
    root = Path(directory)
    subdirs = sorted(
        (entry for entry in root.iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
        reverse=True,
    )
    for subdir in subdirs:
        # Strip the ".json" suffix; ISO dates sort chronologically as text.
        stems = sorted(f.name[:-5] for f in subdir.glob("*.json"))
        if stems:
            return datetime.datetime.strptime(stems[-1], '%Y-%m-%d')
    return None
#=============================================================================== #===============================================================================
def updateDir(directory, cfg, endDate=None):
    """Crawl every day from the last stored date up to ``endDate``.

    The start day is the newest date already present under ``directory``
    (that day is crawled again); when nothing has been stored yet,
    ``cfg["startDate"]`` is used instead.  For each day the ``noticias``
    spider is run with ``-O <directory>/<year>/<YYYY-MM-DD>.json``.

    Args:
        directory: Root output folder of the crawler.
        cfg: Mapping loaded from ``settings.json``; must provide
            ``"startDate"`` as ``YYYY-MM-DD`` when no data exists yet.
        endDate: Last day to crawl (``datetime.datetime``).  ``None`` means
            "now", evaluated on every call rather than once at import time.

    Raises:
        FileNotFoundError: if the ``scrapy`` executable is not on ``PATH``.
    """
    if endDate is None:
        endDate = datetime.datetime.now()
    startDate = findLastDate(directory)
    print(startDate, endDate)
    if startDate is None:
        startDate = datetime.datetime.strptime(cfg["startDate"], '%Y-%m-%d')

    delta = endDate - startDate
    for i in range(delta.days + 1):
        day = startDate + datetime.timedelta(days=i)
        yeardir = os.path.join(directory, str(day.year))
        # The feed exporter needs the target folder to exist.
        os.makedirs(yeardir, exist_ok=True)
        print(day)
        outfile = os.path.join(yeardir, day.strftime('%Y-%m-%d') + ".json")
        # Argument list instead of a shell string: paths with spaces or
        # shell metacharacters cannot alter the command.
        subprocess.run(
            [
                "scrapy", "crawl", "noticias", "--nolog",
                "-O", outfile,
                "-a", "year=" + str(day.year),
                "-a", "month=" + str(day.month),
                "-a", "day=" + str(day.day),
            ],
            check=False,  # a failed crawl must not abort the remaining days
        )
#=============================================================================== #===============================================================================
def main(argv):
    """Command-line entry point: update one crawler output directory.

    Args:
        argv: ``[program, directory]`` or ``[program, directory, endDate]``
            where ``endDate`` is ``YYYY-MM-DD``.  ``directory`` must contain
            a ``settings.json`` file, which is loaded and handed to
            ``updateDir``.
    """
    if len(argv) not in (2, 3):
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
        return

    directory = argv[1]
    with open(os.path.join(directory, 'settings.json')) as json_file:
        cfg = json.load(json_file)

    if len(argv) == 3:
        updateDir(directory, cfg, datetime.datetime.strptime(argv[2], '%Y-%m-%d'))
    else:
        updateDir(directory, cfg)


if __name__ == "__main__":
    main(sys.argv)
import sys
import os
import json
from pathlib import Path
import chardet
#from myModule import myModule
def ascii2utf8(inputfilename, outputfilename):
    """Rewrite a JSON file so non-ASCII characters are stored as UTF-8.

    The input is parsed (turning ``\\uXXXX`` escapes into real characters)
    and dumped again with ``ensure_ascii=False``.  Both files are opened
    explicitly as UTF-8 so the result does not depend on the platform's
    default encoding.

    Args:
        inputfilename: Path of the JSON file to read.
        outputfilename: Path of the JSON file to write (overwritten).
    """
    with open(inputfilename, encoding="utf-8") as json_file:
        data = json.load(json_file)

    with open(outputfilename, 'w', encoding="utf-8") as outfile:
        json.dump(data, outfile, ensure_ascii=False, indent=1)
def copyDirStructure(indir, outdir):
    """Mirror ``indir`` under ``outdir``, converting each JSON file to UTF-8.

    For every sub-directory ``d`` of ``indir`` the folder
    ``<outdir>/<indir name>/<d>/`` is created if needed, and every
    ``*.json`` file found in ``<indir>/<d>`` is converted with
    ``ascii2utf8`` into it.

    Args:
        indir: Source folder holding one sub-directory per group of files.
        outdir: Destination root; the source folder's name is appended to it.
    """
    source_root = Path(indir)
    target_root = Path(outdir) / source_root.name
    # Create the destination folders up front; writing into a missing
    # folder would fail.
    os.makedirs(target_root, exist_ok=True)

    subdir_names = [e.name for e in source_root.iterdir() if e.is_dir()]
    for d in subdir_names:
        print(source_root.name, d)
        target_dir = target_root / d
        os.makedirs(target_dir, exist_ok=True)
        for json_path in (source_root / d).glob("*.json"):
            ascii2utf8(str(json_path), str(target_dir / json_path.name))
def main(argv):
    """Command-line entry point for the ASCII-to-UTF-8 conversion.

    Args:
        argv: ``[program, input dir, output dir]``.  With any other number
            of arguments the usage line is printed and nothing is converted.
    """
    # Validate the list that was passed in, not the global sys.argv.
    if len(argv) != 3:
        print("Usage: " + argv[0] + " <input dir> <output dir>")
        return
    copyDirStructure(argv[1], argv[2])


if __name__ == "__main__":
    main(sys.argv)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields collected for one scraped news article."""

    # Headline and body text of the article.
    title = scrapy.Field()
    text = scrapy.Field()
    # Publication date as produced by the spider.
    date = scrapy.Field()
    # Optional metadata.
    location = scrapy.Field()
    author = scrapy.Field()
    # Section/category and source address of the article.
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class AlchileSpiderMiddleware(object):
    """Pass-through spider middleware for the alChile project.

    Not all methods need to be defined.  If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response entering the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results unchanged.

        Must return an iterable of Request, dict or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when the spider (or another middleware) raises.

        Should return either None or an iterable of Response, dict or Item
        objects; this implementation returns None.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield the start requests unchanged (requests only, no items)."""
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log that ``spider`` has been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that stores all scraped items in one JSON array file.

    The target file name is read from the ``filename`` setting
    (``-s filename=...`` on the command line).
    """

    # Keys written for every item, in this order.
    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting."""
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Fields the spider did not set are written as an empty string
        instead of raising ``KeyError``.
        """
        line = OrderedDict((key, item.get(key, "")) for key in self.FIELDS)

        self.counter += 1
        if self.counter > 1:
            # Every record after the first is preceded by a separator.
            self.file.write(",\n")
        self.file.write(json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for alChile project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project.
BOT_NAME = 'alChile'
# Package(s) that contain the spiders of this project.
SPIDER_MODULES = ['alChile.spiders']
# NOTE(review): per the Scrapy settings docs this is where newly generated
# spiders are created — confirm against the Scrapy version in use.
NEWSPIDER_MODULE = 'alChile.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'alChile (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# 'alChile.middlewares.AlchileSpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# 'alChile.middlewares.MyCustomDownloaderMiddleware': 543,
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Maps each enabled pipeline class path to its integer order value.
ITEM_PIPELINES = {
    'alChile.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
# Source: Al Chile, Yucatan
# Usage:
#   scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
# Matches anything shaped like an HTML/XML tag: "<", one or more non-">"
# characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every tag-like ``<...>`` sequence removed.

    Args:
        text: Markup string, or ``None`` (as returned by a selector's
            ``extract_first()`` when nothing matched).

    Returns:
        The text without tags; an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ''
    return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
    """Collect the articles alchile.com.mx published on a single day.

    The day is supplied through spider arguments, e.g.
    ``-a year=2017 -a month=3 -a day=22``.
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily archive URL and request it.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
        """
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        if year is None or month is None or day is None:
            raise ValueError(
                "spider arguments year, month and day are required "
                "(-a year=YYYY -a month=M -a day=D)")

        self.baseURL = 'http://alchile.com.mx/' + str(year) + '/' + str(month) + '/' + str(day)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first archive page and every additional page."""
        # The first page was already fetched to get here, so it is requested
        # again with the duplicate filter disabled to extract its links.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
        # The second-to-last link is read as the highest page number
        # (NOTE(review): presumably the last link is the "next" arrow).
        if len(pagination) >= 2:
            last_link = pagination[-2].strip('/')
            last_segment = last_link[last_link.rfind('/') + 1:]
            if last_segment.isdigit():
                pages = int(last_segment)
                for page in range(1, pages):
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1),
                                         callback=self.parse_page)

    def parse_page(self, response):
        """Follow every article link found on an archive page."""
        links = response.css('div.td-block-span6').css('h3.entry-title').css('a::attr(href)').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract title, date, topic, text and url from an article page."""
        item = NoticiasItem()

        item['title'] = response.css('header.td-post-title').css('h1.entry-title::text').extract_first()

        d = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
        # '-06:00' is UTC-6, the time zone of Yucatan (central Mexico);
        # any other trailing offset is replaced by it.
        if d is not None and d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d

        item['topic'] = response.css('div.td-post-header').css('a::text').extract_first()

        paragraphs = response.css('div.td-post-content').css('p').extract()
        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = alChile.settings

[deploy]
#url = http://localhost:6800/
project = alChile
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields collected for one scraped news article."""

    # Headline and body text of the article.
    title = scrapy.Field()
    text = scrapy.Field()
    # Publication date as produced by the spider.
    date = scrapy.Field()
    # Optional metadata.
    location = scrapy.Field()
    author = scrapy.Field()
    # Section/category and source address of the article.
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class CampechehoySpiderMiddleware(object):
    """Pass-through spider middleware for the campecheHoy project.

    Not all methods need to be defined.  If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response entering the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results unchanged.

        Must return an iterable of Request, dict or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when the spider (or another middleware) raises.

        Should return either None or an iterable of Response, dict or Item
        objects; this implementation returns None.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield the start requests unchanged (requests only, no items)."""
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log that ``spider`` has been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that stores all scraped items in one JSON array file.

    The target file name is read from the ``filename`` setting
    (``-s filename=...`` on the command line).
    """

    # Keys written for every item, in this order.
    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting."""
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Fields the spider did not set are written as an empty string
        instead of raising ``KeyError``.
        """
        line = OrderedDict((key, item.get(key, "")) for key in self.FIELDS)

        self.counter += 1
        if self.counter > 1:
            # Every record after the first is preceded by a separator.
            self.file.write(",\n")
        self.file.write(json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for campecheHoy project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project.
BOT_NAME = 'campecheHoy'
# Package(s) that contain the spiders of this project.
SPIDER_MODULES = ['campecheHoy.spiders']
# NOTE(review): per the Scrapy settings docs this is where newly generated
# spiders are created — confirm against the Scrapy version in use.
NEWSPIDER_MODULE = 'campecheHoy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'campecheHoy (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# 'campecheHoy.middlewares.CampechehoySpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# 'campecheHoy.middlewares.MyCustomDownloaderMiddleware': 543,
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Maps each enabled pipeline class path to its integer order value.
ITEM_PIPELINES = {
    'campecheHoy.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
# Source: Campeche Hoy, Campeche
# Usage:
#   scrapy crawl noticias --nolog -s filename=2018-01-17.json -a year=2018 -a month=1 -a day=17
# Matches anything shaped like an HTML/XML tag: "<", one or more non-">"
# characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every tag-like ``<...>`` sequence removed.

    Args:
        text: Markup string, or ``None`` (as returned by a selector's
            ``extract_first()`` when nothing matched).

    Returns:
        The text without tags; an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ''
    return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
    """Collect the articles campechehoy.mx published on a single day.

    The day is supplied through spider arguments, e.g.
    ``-a year=2018 -a month=1 -a day=17``.
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily archive URL and request it.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        if year is None or month is None or day is None:
            raise ValueError(
                "spider arguments year, month and day are required "
                "(-a year=YYYY -a month=M -a day=D)")

        self.baseURL = "http://campechehoy.mx/" + str(year) + "/" + str(month) + "/" + str(day)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first archive page and every additional page."""
        # The first page was already fetched to get here, so it is requested
        # again with the duplicate filter disabled to extract its links.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
        # The second-to-last link is read as the highest page number
        # (NOTE(review): presumably the last link is the "next" arrow).
        if pagination is not None and len(pagination) >= 2:
            last_link = pagination[-2].rstrip("/")
            last_segment = last_link[last_link.rfind("/") + 1:]
            if last_segment.isdigit():
                pages = int(last_segment)
                for page in range(1, pages):
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1),
                                         callback=self.parse_page)

    def parse_page(self, response):
        """Follow every article link found on an archive page."""
        links = response.xpath('//*[@class="td-pb-span8 td-main-content"]').css('h3').css('a::attr(href)').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract date, title, topic, text and url from an article page."""
        item = NoticiasItem()

        # The extracted date already includes its format and time zone.
        item['date'] = response.xpath('//span[@class="td-post-date"]/time/@datetime').extract_first()

        raw_title = response.xpath('//header/h1[@class="entry-title"]').extract_first()
        # A page without the title element yields an empty title instead of
        # failing inside the regular expression.
        item['title'] = remove_tags(raw_title) if raw_title is not None else ''

        item['topic'] = response.xpath(
            '//*[@class="td-post-source-tags td-pb-padding-side"]/ul/li/a/text()').extract_first()

        chunks = []
        for p in response.xpath('//*[@class="td-post-content td-pb-padding-side"]/p').extract():
            p = remove_tags(p)
            p = p.replace("&lt;", "<")
            p = p.replace("&gt;", ">")
            # Second pass strips tags that were entity-escaped in the source.
            chunks.append(remove_tags(p) + "\n")
        item['text'] = ''.join(chunks)
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = campecheHoy.settings

[deploy]
#url = http://localhost:6800/
project = campecheHoy
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields collected for one scraped news article."""

    # Headline and body text of the article.
    title = scrapy.Field()
    text = scrapy.Field()
    # Publication date as produced by the spider.
    date = scrapy.Field()
    # Optional metadata.
    location = scrapy.Field()
    author = scrapy.Field()
    # Section/category and source address of the article.
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ChiapashoySpiderMiddleware(object):
    """Pass-through spider middleware for the chiapasHoy project.

    Not all methods need to be defined.  If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response entering the spider.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results unchanged.

        Must return an iterable of Request, dict or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when the spider (or another middleware) raises.

        Should return either None or an iterable of Response, dict or Item
        objects; this implementation returns None.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield the start requests unchanged (requests only, no items)."""
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log that ``spider`` has been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that stores all scraped items in one JSON array file.

    The target file name is read from the ``filename`` setting
    (``-s filename=...`` on the command line).
    """

    # Keys written for every item, in this order.
    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting."""
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Fields the spider did not set are written as an empty string
        instead of raising ``KeyError``.
        """
        line = OrderedDict((key, item.get(key, "")) for key in self.FIELDS)

        self.counter += 1
        if self.counter > 1:
            # Every record after the first is preceded by a separator.
            self.file.write(",\n")
        self.file.write(json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for chiapasHoy project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project.
BOT_NAME = 'chiapasHoy'
# Package(s) that contain the spiders of this project.
SPIDER_MODULES = ['chiapasHoy.spiders']
# NOTE(review): per the Scrapy settings docs this is where newly generated
# spiders are created — confirm against the Scrapy version in use.
NEWSPIDER_MODULE = 'chiapasHoy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chiapasHoy (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# 'chiapasHoy.middlewares.ChiapashoySpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# 'chiapasHoy.middlewares.MyCustomDownloaderMiddleware': 543,
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Maps each enabled pipeline class path to its integer order value.
ITEM_PIPELINES = {
    'chiapasHoy.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from chiapasHoy.items import NoticiasItem
import datetime
# Source: Chiapas Hoy, Chiapas
# Usage:
#   scrapy crawl noticias --nolog -s filename=2018-01-25.json -a year=2018 -a month=1 -a day=25
# Matches anything shaped like an HTML/XML tag: "<", one or more non-">"
# characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every tag-like ``<...>`` sequence removed.

    Args:
        text: Markup string, or ``None`` (as returned by a selector's
            ``extract_first()`` when nothing matched).

    Returns:
        The text without tags; an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ''
    return TAG_RE.sub('', text)
# Anchored at the start of the string: lazily consumes text, then an optional
# "<1-2 digits> <letters> <4 digits>" group (parts separated by whitespace or
# "-"), up to and including a ".-" separator with optional spaces around it.
# NOTE(review): presumably meant to strip a leading location/dateline; it is
# not referenced in the visible code — confirm before relying on it.
LOC_RE = re.compile(r'\A.+?(\d{1,2}[\s-][a-zA-Z]+[\s-]\d{4})?\s?\.\s?-\s?')
# Optional "," or ";", then an optional "[a ]<1-2 digits> de <letters> de
# <4 digits>" group, followed by the same ".-" separator.
# NOTE(review): presumably matches a Spanish dateline date; also unused in
# the visible code — confirm.
DAT_RE = re.compile(r'[,;]?(\sa?\s?\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}\s?)?\.\s?-\s?')
class QuotesSpider(scrapy.Spider):
    """Collect the articles chiapashoy.com.mx published on a single day.

    The day is supplied through spider arguments, e.g.
    ``-a year=2018 -a month=1 -a day=25``.
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily archive URL and request it.

        The requested date travels with every request through ``cb_kwargs``
        so that ``parse_item`` can stamp it on each item.

        Raises:
            ValueError: if ``year``, ``month`` or ``day`` is missing or does
                not form a valid calendar date.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        if year is None or month is None or day is None:
            raise ValueError(
                "spider arguments year, month and day are required "
                "(-a year=YYYY -a month=M -a day=D)")
        year, month, day = str(year), str(month), str(day)

        date = datetime.date(int(year), int(month), int(day))
        # The archive path uses zero-padded month and day.
        self.baseURL = ("http://www.chiapashoy.com.mx/notashoy/" + year + "/"
                        + month.zfill(2) + "/" + day.zfill(2))
        yield scrapy.Request(url=self.baseURL, callback=self.parse, cb_kwargs={"date": date})

    def parse(self, response, **kwargs):
        """Follow every article link on a listing page, then the next page."""
        links = response.css('article').css('h3').css('a::attr(href)').extract()
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_item, cb_kwargs=kwargs)

        nextPage = response.xpath(
            '//*[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
        if nextPage is not None and nextPage != '':
            yield scrapy.Request(url=nextPage, callback=self.parse, cb_kwargs=kwargs)

    def parse_item(self, response, **kwargs):
        """Extract date, title, topic, author, text and url from an article."""
        item = NoticiasItem()

        # The date comes from the crawl arguments, not from the page.
        item['date'] = kwargs["date"].strftime('%Y/%m/%d')
        item['title'] = response.css("h1.entry-title::text").extract_first()

        topic = response.css('li.meta-category').css('a::text').extract_first()
        if topic is not None:
            topic = topic.replace(" ", "").replace("\n", "")
        item['topic'] = topic

        paragraphs = response.css("article").css("div.entry-content").css("p").extract()
        if paragraphs:
            # The last paragraph is stored as the author.
            item['author'] = remove_tags(paragraphs[-1])

        text = "".join(remove_tags(p) + "\n" for p in paragraphs)
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = chiapasHoy.settings

[deploy]
#url = http://localhost:6800/
project = chiapasHoy
# -*- coding: utf-8 -*-
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
class CuartopoderItem(scrapy.Item):
    """Fields collected for one scraped cuartoPoder news article."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()


# Previous name of the item class, kept so existing imports keep working.
NoticiasItem = CuartopoderItem
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware # Define here the models for your spider middleware
# #
# See documentation in: # See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CuartopoderSpiderMiddleware(object): class CuartopoderSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
...@@ -31,7 +32,7 @@ class CuartopoderSpiderMiddleware(object): ...@@ -31,7 +32,7 @@ class CuartopoderSpiderMiddleware(object):
# Called with the results returned from the Spider, after # Called with the results returned from the Spider, after
# it has processed the response. # it has processed the response.
# Must return an iterable of Request, dict or Item objects. # Must return an iterable of Request, or item objects.
for i in result: for i in result:
yield i yield i
...@@ -39,8 +40,7 @@ class CuartopoderSpiderMiddleware(object): ...@@ -39,8 +40,7 @@ class CuartopoderSpiderMiddleware(object):
# Called when a spider or process_spider_input() method # Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict # Should return either None or an iterable of Request or item objects.
# or Item objects.
pass pass
def process_start_requests(self, start_requests, spider): def process_start_requests(self, start_requests, spider):
...@@ -56,7 +56,7 @@ class CuartopoderSpiderMiddleware(object): ...@@ -56,7 +56,7 @@ class CuartopoderSpiderMiddleware(object):
spider.logger.info('Spider opened: %s' % spider.name) spider.logger.info('Spider opened: %s' % spider.name)
class CuartopoderDownloaderMiddleware(object): class CuartopoderDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the # scrapy acts as if the downloader middleware does not modify the
# passed objects. # passed objects.
# -*- coding: utf-8 -*-
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name # useful for handling different item types with a single interface
return cls(filename) from itemadapter import ItemAdapter
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
def close_spider(self, spider):
class CuartopoderPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
row.append(("date", item['date']))
row.append(("topic", item['topic']))
row.append(("title", item['title']))
row.append(("author", item['author']))
row.append(("location", item['location']))
row.append(("text", item['text']))
row.append(("url", item['url']))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
# -*- coding: utf-8 -*-
# Scrapy settings for cuartoPoder project # Scrapy settings for cuartoPoder project
# #
# For simplicity, this file contains only settings considered important or # For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation: # commonly used. You can find more settings consulting the documentation:
# #
# https://doc.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'cuartoPoder' BOT_NAME = 'cuartoPoder'
SPIDER_MODULES = ['cuartoPoder.spiders'] SPIDER_MODULES = ['cuartoPoder.spiders']
NEWSPIDER_MODULE = 'cuartoPoder.spiders' NEWSPIDER_MODULE = 'cuartoPoder.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cuartoPoder (+http://www.yourdomain.com)' #USER_AGENT = 'cuartoPoder (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False ...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#} #}
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# 'cuartoPoder.middlewares.CuartopoderSpiderMiddleware': 543, # 'cuartoPoder.middlewares.CuartopoderSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# 'cuartoPoder.middlewares.CuartopoderDownloaderMiddleware': 543, # 'cuartoPoder.middlewares.CuartopoderDownloaderMiddleware': 543,
#} #}
# Enable or disable extensions # Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'cuartoPoder.pipelines.JsonWriterPipeline': 300, # 'cuartoPoder.pipelines.CuartopoderPipeline': 300,
} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay # The initial download delay
...@@ -82,7 +80,7 @@ ITEM_PIPELINES = { ...@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_DIR = 'httpcache'
# -*- coding: utf-8 -*- import scrapy
import json
""" import datetime
MEDIA: from cuartoPoder.items import CuartopoderItem
Cuarto Poder, Chiapas #-------------------------------------------------------------------------------
import re
## Get all the news from a specific date. ##
$ cd cuartoPoder/
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
import scrapy, re
from cuartoPoder.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = 'noticias'
allowed_domains = ['cuartopoder.mx']
start_urls = ['https://cuartopoder.mx/']
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped news dates.

    Despite its name this class does not represent UTC itself; it
    represents the constant UTC-6 offset used for Chiapas.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero DST adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``; defining it
        keeps operations that consult DST (e.g. ``timetuple()``) working.
        """
        return timedelta(0)
class ImportantData(scrapy.Item):
    """Useful data for the flow of the implementation.

    Carried in ``request.meta['item']`` between callbacks; it is crawl
    bookkeeping, not a scraped news record.
    """

    # Flag read by parse(): set to True after the last article of a
    # listing page has been handled.
    to_next_page = scrapy.Field()
    # True only for the final article link of a listing page.
    is_last_link = scrapy.Field()
    # Number of the listing page to request next.
    next_page = scrapy.Field()
    # URL of the listing page an article request originated from.
    return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
Basic Scrapy Spider class
name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
self.year = getattr(self, "year", None) self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None) self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None) self.day = getattr(self, "day", None)
self.stop_date = date(int(self.year), int(self.month), int(self.day)) date = self.month.zfill(2)+"-"+self.day.zfill(2)+"-"+self.year
self.baseURL = "http://www.cuartopoder.mx" url = self.start_urls[0]+"XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&eids=&fd="+date+"&fh="+date+"&id=portada&p=1"
first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2)) yield scrapy.Request(url=url, callback=self.parsePage)
self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&" #-----------------------------------------------------------------------
def parsePage(self, response):
self.month_parser = {"Enero" : 1, "Mayo" : 5, "Septiembre" : 9, i = response.url.index("&p=")
"Febrero" : 2, "Junio" : 6, "Octubre" : 10, url = response.url[:i+3]+str(int(response.url[i+3:])+1)
"Marzo" : 3, "Julio" : 7, "Noviembre" : 11, links = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
"Abril" : 4, "Agosto" : 8, "Diciembre" : 12} print(response.url)
flow_info = ImportantData() if len(links)>0:
flow_info['to_next_page'] = False for l in links:
flow_info['next_page'] = 2 yield scrapy.Request(url=self.start_urls[0][:-1]+l, callback=self.parse)
yield scrapy.Request(url=url, callback=self.parsePage)
request = scrapy.Request(url=first_URL, callback=self.parse) #-----------------------------------------------------------------------
request.meta['item'] = flow_info
yield request
def parse(self, response): def parse(self, response):
flow_info = response.meta['item'] item = CuartopoderItem()
page = flow_info['next_page']
date = self.year+"-"+self.month.zfill(2)+"-"+self.day.zfill(2)
if not flow_info['to_next_page']: item["date"]= datetime.datetime.strptime(date, '%Y-%m-%d').isoformat()
link_list = response.css('ul.news-list').xpath('./li/h5/a/@href').extract() item["title"] =response.xpath("//meta[@property='og:title']/@content").extract_first()
item["topic"] = response.css('div.big-title').xpath('./h2/a/span//text()').extract_first()
for link in link_list: item["text"] =response.xpath("//meta[@name='Description']/@content").extract_first()
flow_info = ImportantData() item["url"] = response.url
flow_info['next_page'] = page print(item["title"])
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
news_link = self.baseURL + link
request = scrapy.Request(url=news_link, callback=self.parse_item)
request.meta['item'] = flow_info
yield request
page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))
flow_info['to_next_page'] = False
flow_info['next_page'] += 1
request = scrapy.Request(url=page_URL, callback=self.parse)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
news_date = response.css('ul.metas-list > li > p').extract_first()
news_date = remove_tags(news_date)
news_date = news_date.split(u'\xa0')
news_date[1] = news_date[1].strip().replace(",", '')
news_date = date(int(self.year), self.month_parser[news_date[0]], int(news_date[1]))
if news_date == self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
title = response.css('div.post-title').css('h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.post-content').css('p').extract():
p = remove_tags(p)
text += p + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title.strip()
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse, dont_filter=True)
request.meta['item'] = flow_info
yield request
\ No newline at end of file
...@@ -41,6 +41,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -41,6 +41,7 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract() pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
...@@ -78,6 +79,6 @@ class QuotesSpider(scrapy.Spider): ...@@ -78,6 +79,6 @@ class QuotesSpider(scrapy.Spider):
item['url'] = response.url item['url'] = response.url
# print item['title'] print (item['title'])
yield item yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DespertaroaxacaSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the spider middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        This method is used by Scrapy to create the middleware instance,
        so it must be a classmethod.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response unchanged.

        Called for each response that goes through the spider
        middleware and into the spider.
        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every spider result unchanged.

        Called with the results returned from the Spider, after
        it has processed the response.
        Must return an iterable of Request, dict or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Decline to handle the exception (returns None).

        Called when a spider or process_spider_input() method
        (from other spider middleware) raises an exception.
        Should return either None or an iterable of Request, dict
        or Item objects.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Called with the start requests of the spider, and works
        similarly to the process_spider_output() method, except
        that it doesn't have a response associated.
        Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item into one JSON array file.

    The output path is read from the ``filename`` Scrapy setting, i.e. the
    value passed on the command line as ``-s filename=...``.
    """

    # Keys copied from each item, in the order they are written out.
    _FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file the items will be written to."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting."""
        # Here you get whatever value was passed through the "filename"
        # command line parameter
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Fields the spider did not populate are left out of the record
        instead of raising ``KeyError``.
        """
        row = []
        for key in self._FIELDS:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Not every spider fills every field (e.g. author/location)
                continue
        line = OrderedDict(row)

        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            # Later records are separated from the previous one
            self.file.write(",\n" + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for despertarOaxaca project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'despertarOaxaca'
SPIDER_MODULES = ['despertarOaxaca.spiders']
NEWSPIDER_MODULE = 'despertarOaxaca.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'despertarOaxaca (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# 'despertarOaxaca.middlewares.DespertaroaxacaSpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# 'despertarOaxaca.middlewares.MyCustomDownloaderMiddleware': 543,
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
'despertarOaxaca.pipelines.JsonWriterPipeline': 300,
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from despertarOaxaca.items import NoticiasItem
El Despertar de Oaxaca
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
# Matches one markup tag: "<", then anything except ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag removed; other text is kept."""
    return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
class QuotesSpider(scrapy.Spider):
    """Crawl the despertardeoaxaca.com archive of one calendar day.

    The day is selected with the ``year``, ``month`` and ``day`` spider
    arguments (``-a year=2018 -a month=2 -a day=4``).
    """

    name = "noticias"

    def start_requests(self):
        """Request the first archive page of the requested day.

        Raises:
            ValueError: if any of the three date arguments is missing.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        if year is None or month is None or day is None:
            # Fail with an actionable message instead of a TypeError from
            # concatenating None into the URL.
            raise ValueError(
                "year, month and day spider arguments are required "
                "(-a year=YYYY -a month=M -a day=D)"
            )

        self.baseURL = ("http://despertardeoaxaca.com/" + str(year) + "/"
                        + str(month).zfill(2) + "/" + str(day).zfill(2))

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule ``parse_page`` for the first page and every further page."""
        # The first page is this very response; dont_filter keeps the
        # duplicate filter from dropping the repeated URL.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.xpath('//div[@class="post-pagination cat-"]/a/@href').extract()
        if pagination:
            # The last path segment of the last pagination link is read as
            # the total number of pages.
            last_href = pagination[-1].strip('/')
            last_segment = last_href[last_href.rfind('/') + 1:]
            if last_segment.isdigit():
                # Page 1 was scheduled above, so continue from page 2.
                for page in range(2, int(last_segment) + 1):
                    yield scrapy.Request(url=self.baseURL + '/page/' + str(page), callback=self.parse_page)
            else:
                self.logger.warning("Unexpected pagination link: %s", last_href)

    def parse_page(self, response):
        """Yield one article request per post listed on an archive page."""
        for post in response.xpath('//div[@class="articles"]').css('div.cnt'):
            item = NoticiasItem()

            topic = post.css('span.category').xpath('./a').extract_first()
            if topic is not None:
                item['topic'] = remove_tags(topic)

            link = post.css('h3').xpath('./a/@href').extract_first()
            if link is None:
                # scrapy.Request rejects a None url; skip this post so the
                # remaining posts of the page are still crawled.
                continue

            request = scrapy.Request(url=link, callback=self.parse_item)
            # Hand the partially filled item over to parse_item.
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        """Complete the item started in ``parse_page`` from the article page."""
        item = response.meta['item']

        # The date obtained here already includes format and time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        title = response.css('article.article-post').xpath('./header/h1').extract_first()
        # extract_first() gives None when the page has no such header.
        item['title'] = remove_tags(title).strip() if title is not None else None

        paragraphs = response.xpath('//div[@class="article-post-content"]').css('p').extract()
        text = ''.join(remove_tags(p) + "\n" for p in paragraphs)

        # NOTE: location extraction and e-mail/Twitter clean-up are disabled;
        # their regexes are commented out next to TAG_RE.
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
default = despertarOaxaca.settings
#url = http://localhost:6800/
project = despertarOaxaca
# -*- coding: utf-8 -*-
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
class NoticiasItem(scrapy.Item): class EdomexdiaItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
date = scrapy.Field()
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware # Define here the models for your spider middleware
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class EdomexdiaSpiderMiddleware(object): class EdomexdiaSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
...@@ -20,30 +21,29 @@ class EdomexdiaSpiderMiddleware(object): ...@@ -20,30 +21,29 @@ class EdomexdiaSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s return s
def process_spider_input(response, spider): def process_spider_input(self, response, spider):
# Called for each response that goes through the spider # Called for each response that goes through the spider
# middleware and into the spider. # middleware and into the spider.
# Should return None or raise an exception. # Should return None or raise an exception.
return None return None
def process_spider_output(response, result, spider): def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after # Called with the results returned from the Spider, after
# it has processed the response. # it has processed the response.
# Must return an iterable of Request, dict or Item objects. # Must return an iterable of Request, or item objects.
for i in result: for i in result:
yield i yield i
def process_spider_exception(response, exception, spider): def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method # Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict # Should return either None or an iterable of Request or item objects.
# or Item objects.
pass pass
def process_start_requests(start_requests, spider): def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works # Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except # similarly to the process_spider_output() method, except
# that it doesn’t have a response associated. # that it doesn’t have a response associated.
...@@ -54,3 +54,50 @@ class EdomexdiaSpiderMiddleware(object): ...@@ -54,3 +54,50 @@ class EdomexdiaSpiderMiddleware(object):
def spider_opened(self, spider): def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name) spider.logger.info('Spider opened: %s' % spider.name)
class EdomexdiaDownloaderMiddleware:
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined,
    scrapy acts as if the downloader middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        This method is used by Scrapy to create the middleware instance,
        so it must be a classmethod.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Let every request continue through the downloader chain."""
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response unchanged."""
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        """Decline to handle the exception (returns None)."""
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name # useful for handling different item types with a single interface
return cls(filename) from itemadapter import ItemAdapter
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
def close_spider(self, spider):
class EdomexdiaPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
row.append(("date", item['date']))
row.append(("topic", item['topic']))
row.append(("title", item['title']))
row.append(("author", item['author']))
row.append(("location", item['location']))
row.append(("text", item['text']))
row.append(("url", item['url']))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
# -*- coding: utf-8 -*-
# Scrapy settings for edoMexDia project # Scrapy settings for edoMexDia project
# #
# For simplicity, this file contains only settings considered important or # For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation: # commonly used. You can find more settings consulting the documentation:
# #
# http://doc.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'edoMexDia' BOT_NAME = 'edoMexDia'
SPIDER_MODULES = ['edoMexDia.spiders'] SPIDER_MODULES = ['edoMexDia.spiders']
NEWSPIDER_MODULE = 'edoMexDia.spiders' NEWSPIDER_MODULE = 'edoMexDia.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'edoMexDia (+http://www.yourdomain.com)' #USER_AGENT = 'edoMexDia (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False ...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#} #}
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# 'edoMexDia.middlewares.EdomexdiaSpiderMiddleware': 543, # 'edoMexDia.middlewares.EdomexdiaSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# 'edoMexDia.middlewares.MyCustomDownloaderMiddleware': 543, # 'edoMexDia.middlewares.EdomexdiaDownloaderMiddleware': 543,
#} #}
# Enable or disable extensions # Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'edoMexDia.pipelines.JsonWriterPipeline': 300, # 'edoMexDia.pipelines.EdomexdiaPipeline': 300,
} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay # The initial download delay
...@@ -82,7 +80,7 @@ ITEM_PIPELINES = { ...@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_DIR = 'httpcache'
File mode changed from 100755 to 100644
# -*- coding: utf-8 -*- """
Spider for edomexaldia.com
Author: Mario Chirinos Colunga
Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
import scrapy, re import scrapy, re
from edoMexDia.items import NoticiasItem from edoMexDia.items import EdomexdiaItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
EDOMEX al Día, Estado de México
scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone six hours behind UTC.

    The spider attaches an instance to the article dates it builds, so the
    ISO strings it emits carry the Estado de México offset.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC (-6 hours)."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment (the offset is fixed).

        Without this override the base ``tzinfo.dst`` raises
        ``NotImplementedError``, which breaks ``datetime.dst()`` and
        ``datetime.timetuple()`` on any datetime using this zone.
        """
        return timedelta(0)
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
self.year = getattr(self, "year", None) self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None) self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None) self.day = getattr(self, "day", None)
self.date_parser = {'january': 1, 'february': 2, 'march': 3, 'april': 4,
'may': 5, 'june': 6, 'july': 7, 'august': 8,
'september': 9, 'october': 10, 'november': 11, 'december': 12}
self.baseURL = "http://www.edomexaldia.com.mx/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
self.baseURL = "http://edomexaldia.com/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2)
def parse(self, response): yield scrapy.Request(url=self.baseURL, callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
lastPage = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-last"]/@href').extract_first()
if lastPage is None:
lastPage = response.xpath('//div[@class="numbered-pagination"]/a/@href').extract()[-1]
if lastPage is not None and lastPage != '':
lastPage = lastPage.strip('/')
lastPage = int(lastPage[lastPage.rfind('/')+1:])
for page in range(1, lastPage):
yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
for link in response.xpath('//div[@id="main"]/div/h2[@class="entry_title"]/a/@href').extract(): print("parse page", response.url)
for link in response.xpath('//main[@id="main"]/article/header/h2[@class="entry-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
next = response.css('a.next::attr(href)').extract_first()
nextPage = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-next"]/@href').extract_first() if next is not None:
if nextPage is not None and nextPage != '': yield scrapy.Request(url=next, callback=self.parse_page)
yield scrapy.Request(url=nextPage, callback=self.parse) #-----------------------------------------------------------------------
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = EdomexdiaItem()
text = ''
item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
try: item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first().replace(" - Edomex Al Día","")
d = remove_tags(response.xpath('//span[@class="post_author_create"]').extract_first()) text=""
d = d.replace("el ", '').replace(",", '').replace(".", '').split() for p in response.xpath('//div[@class="entry-content"]/p/text()').extract():
dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T") nt = remove_tags(p).replace("\n","").replace("\r","").strip()
except: text+=nt
dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T") if len(nt)>0:
item['date'] = dat text+="\n"
item['title'] = remove_tags(response.xpath('//div[@id="main"]/div/h1').extract_first()).strip()
item['topic'] = None
author = response.xpath('//span[@class="post_author_author"]').extract_first()
if author is not None and author != '':
author = remove_tags(author).strip()
author = author.replace(" Publicado:", '')
item['author'] = author
for p in response.xpath('//div[@id="main"]/div/p').extract():
text += remove_tags(p) + "\n"
item['text'] = text.strip() item['text'] = text.strip()
item['topic'] = ", ".join(response.xpath('//span[@class="tag-links"]/a/text()').extract())
item['url'] = response.url item['url'] = response.url
yield item yield item
# Automatically created by: scrapy startproject # Automatically created by: scrapy startproject
# #
# For more information about the [deploy] section see: # For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html # https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings] [settings]
default = edoMexDia.settings default = edoMexDia.settings
# -*- coding: utf-8 -*-
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
...@@ -11,14 +9,12 @@ import scrapy ...@@ -11,14 +9,12 @@ import scrapy
class ElfinancieroItem(scrapy.Item): class ElfinancieroItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass date = scrapy.Field()
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
media = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware # Define here the models for your spider middleware
# #
# See documentation in: # See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ElfinancieroSpiderMiddleware(object): class ElfinancieroSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
...@@ -31,7 +32,7 @@ class ElfinancieroSpiderMiddleware(object): ...@@ -31,7 +32,7 @@ class ElfinancieroSpiderMiddleware(object):
# Called with the results returned from the Spider, after # Called with the results returned from the Spider, after
# it has processed the response. # it has processed the response.
# Must return an iterable of Request, dict or Item objects. # Must return an iterable of Request, or item objects.
for i in result: for i in result:
yield i yield i
...@@ -39,8 +40,7 @@ class ElfinancieroSpiderMiddleware(object): ...@@ -39,8 +40,7 @@ class ElfinancieroSpiderMiddleware(object):
# Called when a spider or process_spider_input() method # Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict # Should return either None or an iterable of Request or item objects.
# or Item objects.
pass pass
def process_start_requests(self, start_requests, spider): def process_start_requests(self, start_requests, spider):
...@@ -56,7 +56,7 @@ class ElfinancieroSpiderMiddleware(object): ...@@ -56,7 +56,7 @@ class ElfinancieroSpiderMiddleware(object):
spider.logger.info('Spider opened: %s' % spider.name) spider.logger.info('Spider opened: %s' % spider.name)
class ElfinancieroDownloaderMiddleware(object): class ElfinancieroDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the # scrapy acts as if the downloader middleware does not modify the
# passed objects. # passed objects.
# -*- coding: utf-8 -*-
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
class ElfinancieroPipeline(object):
def __init__(self, filename):
self.filename = filename
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider): # useful for handling different item types with a single interface
self.itemList=[] from itemadapter import ItemAdapter
def close_spider(self, spider):
# print(len(self.itemList))
with open(self.filename, 'w') as fp:
json.dump(self.itemList, fp)
class ElfinancieroPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
return item return item
# -*- coding: utf-8 -*-
# Scrapy settings for elFinanciero project # Scrapy settings for elFinanciero project
# #
# For simplicity, this file contains only settings considered important or # For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation: # commonly used. You can find more settings consulting the documentation:
# #
# https://doc.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elFinanciero' BOT_NAME = 'elFinanciero'
SPIDER_MODULES = ['elFinanciero.spiders'] SPIDER_MODULES = ['elFinanciero.spiders']
NEWSPIDER_MODULE = 'elFinanciero.spiders' NEWSPIDER_MODULE = 'elFinanciero.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)' #USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'
...@@ -25,7 +23,7 @@ ROBOTSTXT_OBEY = True ...@@ -25,7 +23,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
...@@ -45,31 +43,31 @@ ROBOTSTXT_OBEY = True ...@@ -45,31 +43,31 @@ ROBOTSTXT_OBEY = True
#} #}
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# 'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543, # 'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# 'elFinanciero.middlewares.ElfinancieroDownloaderMiddleware': 543, # 'elFinanciero.middlewares.ElfinancieroDownloaderMiddleware': 543,
#} #}
# Enable or disable extensions # Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'elFinanciero.pipelines.ElfinancieroPipeline': 300, # 'elFinanciero.pipelines.ElfinancieroPipeline': 300,
} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay # The initial download delay
...@@ -82,7 +80,7 @@ ITEM_PIPELINES = { ...@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_DIR = 'httpcache'
# -*- coding: utf-8 -*-
""" """
MEDIA: Spider for elfinanciero.com.mx
El Financiero Author: Mario Chirinos Colunga
USAGE: Usage:scrapy crawl noticias --nolog -O 2021-03-18.json -a year=2021 -a month=3 -a day=18
## Get all the news from a specific date. ##
$ cd elFinanciero/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
""" """
import scrapy
import json
import re
from elFinanciero.items import ElfinancieroItem
import scrapy, re, json #-------------------------------------------------------------------------------
from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
allSections = [{"name":"Economía","slug":"economia"},{"name":"Empresas","slug":"empresas"},{"name":"Mercados","slug":"mercados"},{"name":"Pyme","slug":"pyme"},{"name":"Franquicias","slug":"franquicias"},{"name":"Nacional","slug":"nacional"},{"name":"Tech","slug":"tech"},{"name":"Mundo","slug":"mundo"},{"name":"Deportes","slug":"deportes"},{"name":"Culturas","slug":"culturas"},{"name":"Buena Vida","slug":"buena-vida"},{"name":"Reflector","slug":"reflector"},{"name":"Ciencia","slug":"ciencia"},{"name":"Mis Finanzas","slug":"mis-finanzas"},{"name":"Opinión","slug":"opinion"},{"name":"Interactivos","slug":"interactivos"},{"name":"Blogs","slug":"blogs"},{"name":"Fotogalerías","slug":"fotogalerias"},{"name":"Financial Times","slug":"financial-times"},{"name":"Power Tools","slug":"power-tools"},{"name":"Bajío","slug":"bajio"},{"name":"Monterrey","slug":"monterrey"},{"name":"Universidades","slug":"universidades"},{"name":"Mundo empresa","slug":"mundo-empresa"},{"name":"Texas","slug":"texas"},{"name":"Suplementos","slug":"suplementos"},{"name":"Archivo","slug":"archivo"},{"name":"Pages","slug":"pages"},{"name":"Licitaciones","slug":"licitaciones"},{"name":"Bloomberg","slug":"bloomberg"},{"name":"Startup","slug":"startup"},{"name":"Mercados - Acciones","slug":"mercados/acciones"},{"name":"Mercados - IPC","slug":"mercados/ipc"},{"name":"Mercados - Divisas","slug":"mercados/divisas"},{"name":"Mercados - Dinero","slug":"mercados/dinero"},{"name":"Mercados - Commodities","slug":"mercados/commodities"},{"name":"TLCAN","slug":"tlcan"},{"name":"Blogs - Territorio Viral","slug":"blogs/territorio-viral"},{"name":"Blogs - Templo del Morbo","slug":"blogs/templo-del-morbo"},{"name":"Sponsor","slug":"sponsor"},{"name":"Bloomberg Businessweek","slug":"bloomberg-businessweek"},{"name":"Millonarios","slug":"millonarios"},{"name":"Management","slug":"management"},{"name":"Viajes","slug":"viajes"},{"name":"Cartones","slug":"cartones"},{"name":"EF Eventos","slug":"ef-eventos"},{"name":"Blogs - 
Efecto Jazz","slug":"blogs/efecto-jazz"},{"name":"Blogs - Visión CFA","slug":"blogs/vision-cfa"},{"name":"Pages - Eventos","slug":"pages/eventos"},{"name":"Pages - Interactivos","slug":"pages/interactivos"},{"name":"Pages - PDF","slug":"pages/pdf"},{"name":"Pages - Documentos","slug":"pages/documentos"},{"name":"Pages - Docs","slug":"pages/docs"},{"name":"TV","slug":"tv"},{"name":"Tv - Al sonar la campana","slug":"tv/al-sonar-la-campana"},{"name":"Tv - Espresso Doble","slug":"tv/espresso-doble"},{"name":"Tv - Ganadores & Perdedores","slug":"tv/ganadores-y-perdedores"},{"name":"Tv - Entre Mercados","slug":"tv/entre-mercados"},{"name":"Tv - Mesa Central","slug":"tv/mesa-central"},{"name":"Tv - Bitácora Política","slug":"tv/bitacora-politica"},{"name":"Tv - Sin Línea","slug":"tv/sin-linea"},{"name":"Tv - Al Cierre","slug":"tv/al-cierre"},{"name":"Tv - Tiempo de Toros","slug":"tv/tiempo-de-toros"},{"name":"Tv - Nación 321","slug":"tv/nacion321"},{"name":"Tv - El mundo según...","slug":"tv/el-mundo-segun"},{"name":"Tv - En EF y por Adela","slug":"tv/en-ef-y-por-adela"},{"name":"Tv - La Nota Dura","slug":"tv/la-nota-dura"},{"name":"Tv - La Silla Roja","slug":"tv/la-silla-roja"},{"name":"Tv - Personajes","slug":"tv/personajes"},{"name":"Tv - Tech","slug":"tv/tech"},{"name":"Tv - Mundo","slug":"tv/mundo"},{"name":"Tv - Finanzas Personales","slug":"tv/finanzas-personales"},{"name":"Tv - Estilo de Vida","slug":"tv/estilo-de-vida"},{"name":"Tv - Bloomberg","slug":"tv/bloomberg"},{"name":"Tv - Viral","slug":"tv/viral"},{"name":"Tv - Nacional","slug":"tv/nacional"},{"name":"Tv - Empresas","slug":"tv/empresas"},{"name":"Tv - Economía","slug":"tv/economia"},{"name":"Tv - Reflector","slug":"tv/reflector"},{"name":"Tv - Sponsor","slug":"tv/sponsor"},{"name":"Rankings","slug":"rankings"},{"name":"Trivias","slug":"trivias"},{"name":"Elecciones 2018","slug":"elecciones-2018"},{"name":"Pages - Businessweek 
México","slug":"pages/businessweek-mexico"},{"name":"Fibras","slug":"fibras"},{"name":"After Office","slug":"after-office"},{"name":"New York Times Syndicate","slug":"new-york-times-syndicate"},{"name":"México en Hannover","slug":"mexico-en-hannover"},{"name":"Tv - Opinión","slug":"tv/opinion"},{"name":"Pages - Central Política","slug":"pages/central-politica"},{"name":"Relojes","slug":"relojes"},{"name":"Autos","slug":"autos"},{"name":"Sibarita","slug":"sibarita"},{"name":"Letras Libres","slug":"letras-libres"},{"name":"Rusia 2018","slug":"rusia-2018"},{"name":"Tv - Especiales","slug":"tv/especiales"},{"name":"Tv - Bloomberg Businessweek","slug":"tv/bloomberg-businessweek"},{"name":"Tv - Gabinete de Seguridad","slug":"tv/gabinete-de-seguridad"},{"name":"Transición","slug":"transicion"},{"name":"Emprendedores","slug":"emprendedores"},{"name":"Blogs - Monoblock","slug":"blogs/monoblock"},{"name":"Península","slug":"peninsula"},{"name":"ESPN","slug":"espn"},{"name":"Tv - La Cuarta Transformación","slug":"tv/la-cuarta-transformacion"},{"name":"Primeros 100 días","slug":"primeros-100-dias"}]
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = 'noticias'
allowed_domains = ['elfinanciero.com']
start_urls = ['http://elfinanciero.com/']
def start_requests(self):
yield scrapy.Request(url=self.start_urls[0]+"search/", callback=self.parseSections)
#------------------------------------------------------------------------------------------------ def parseSections(self, response):
class QuotesSpider(scrapy.Spider): print(response.url)
""" sections = json.loads(re.findall("var allSections = (.+?);\n", response.body.decode("utf-8"), re.S)[0])
Basic Scrapy Spider class
name = "noticias"
def start_requests(self): for i in sections:
year = getattr(self, "year", None) year = getattr(self, "year", None)
month = getattr(self, "month", None) month = getattr(self, "month", None)
self.day = getattr(self, "day", None) day = getattr(self, "day", None)
self.date = year + "-" + month.zfill(2) + "-" + day.zfill(2)
self.this_date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2) url = 'https://api.elfinanciero.com/public/search/typed/?_format:json&json={"categoriesslug":"'+i["slug"]+'","min_date":"'+self.date+'","max_date":"'+self.date+'"}'
self.baseURL1 = "https://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22" yield scrapy.Request(url=url, callback=self.parse)
self.baseURL2 = "%22,%22min_date%22:%22"+self.this_date+"%22,%22max_date%22:%22"+self.this_date+"%22}&type=page&page=1&size=10000"
# print(self.baseURL)
for i in allSections:
yield scrapy.Request(url=self.baseURL1+i["slug"]+self.baseURL2, callback=self.parse)
def parse(self, response): def parse(self, response):
data = json.loads(response.text)["data"][1] data = json.loads(response.text)["data"][1]
for d in data: for d in data:
item = NoticiasItem() item = ElfinancieroItem()
item["title"] = d["_source"]["title"]
item["date"] = d["_source"]["createdAt"] item["date"] = d["_source"]["createdAt"]
item["title"] = d["_source"]["title"]
item["text"]=remove_tags(d["_source"]["html"]) item["text"]=remove_tags(d["_source"]["html"])
item["topic"]=d["_source"]["categoryId"]["slug"] item["topic"]=d["_source"]["categoryId"]["slug"]
item["author"]=d["_source"]["author"][0]["name"]+" "+d["_source"]["author"][0]["aPaterno"]+" "+d["_source"]["author"][0]["aMaterno"] item["author"]=d["_source"]["author"][0]["name"]+" "+d["_source"]["author"][0]["aPaterno"]+" "+d["_source"]["author"][0]["aMaterno"]
item["url"]="https://elfinanciero.com.mx/"+d["_source"]["slug"] item["url"]="https://elfinanciero.com/"+d["_source"]["slug"]
if item["url"] not in self.urllist and len(item["text"])>0:
yield item yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the fields of one scraped news article."""

    # Headline and body of the article.
    title = scrapy.Field()
    text = scrapy.Field()
    # Publication date as delivered by the site's metadata.
    date = scrapy.Field()
    # Byline details extracted from the article body, when present.
    location = scrapy.Field()
    author = scrapy.Field()
    # Section or tag of the article.
    topic = scrapy.Field()
    # Address the article was downloaded from.
    url = scrapy.Field()
    # Contact details found in the article body, when present.
    twitter = scrapy.Field()
    email = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariocolatinoSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook forwards what it receives unchanged; the only side effect is
    a log line when a spider is opened. Not all methods need to be defined:
    when one is missing Scrapy behaves as if the middleware did not modify
    the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Declared as a classmethod because it receives the class itself as
        ``cls`` and only the crawler as argument.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider.

        Should return None or raise an exception; this one always
        returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced for ``response``.

        Must yield Request, dict or Item objects.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Decline to handle exceptions raised by the spider or by
        ``process_spider_input`` of another middleware.

        Returning None leaves the exception to whoever handles it next.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged.

        Works like ``process_spider_output`` but has no response
        associated; must yield only requests (not items).
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s', spider.name)
class DiariocolatinoDownloaderMiddleware:
    """Pass-through downloader middleware.

    Requests and responses are forwarded untouched; the only side effect is
    a log line when a spider is opened. Not all methods need to be defined:
    when one is missing Scrapy behaves as if the middleware did not modify
    the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Declared as a classmethod because it receives the class itself as
        ``cls`` and only the crawler as argument.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Let every request continue through the downloader.

        A hook of this kind must either:
        - return None: continue processing this request
        - return a Response object
        - return a Request object
        - raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response unchanged.

        A hook of this kind must return a Response object, return a
        Request object, or raise IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Decline to handle download errors.

        Called when a download handler or a process_request() of another
        downloader middleware raises. Returning None continues processing
        the exception; returning a Response or Request object would stop
        the process_exception() chain.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams every scraped item into one JSON file.

    The output is a JSON array with one object per item, written
    incrementally so the items never have to be held in memory together.
    The target path comes from the ``filename`` setting
    (``scrapy crawl ... -s filename=out.json``).
    """

    # Keys copied from each item, in the order they appear in the output.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location",
                    "twitter", "email", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file to write."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and pass it on unchanged.

        Raises:
            KeyError: if the item lacks one of the expected fields.
        """
        line = OrderedDict((key, item[key]) for key in self._FIELD_ORDER)
        self.counter += 1
        serialized = json.dumps(line)
        if self.counter > 1:
            # Every object after the first is preceded by a separator so
            # the file stays a valid JSON array.
            serialized = ",\n" + serialized
        self.file.write(serialized)
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioCoLatino project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioCoLatino'
SPIDER_MODULES = ['diarioCoLatino.spiders']
NEWSPIDER_MODULE = 'diarioCoLatino.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# 'diarioCoLatino.middlewares.DiariocolatinoSpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# 'diarioCoLatino.middlewares.DiariocolatinoDownloaderMiddleware': 543,
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
'diarioCoLatino.pipelines.JsonWriterPipeline': 300,
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
Diario Co Latino, El Salvador
## Get all the news from a specific date. ##
$ cd diarioCoLatino/
$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
import scrapy, re
from diarioCoLatino.items import NoticiasItem
# Matches one markup tag: "<", at least one character other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` tag removed.

    Only the tags themselves are dropped; the text between them is kept
    as is.
    """
    return TAG_RE.sub('', text)
# All four patterns are anchored on newlines, i.e. each one targets a whole
# line of the article text.

# Byline: a line that starts with "Por" followed by the author credit.
AUTH_RE = re.compile(r'\nPor.+?\n')
# A line holding a Twitter handle ("@name"), optionally introduced by
# "Twitter:" or "| Twitter:"; case-insensitive.
TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
# Dateline: a line ending in "/PL", "/AFP", "/DPA" or "/SIGNIS ALC";
# whatever precedes the slash is on the same line. Case-insensitive.
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
# A line holding an e-mail address, optionally introduced by "Email:",
# "Correo electr...nico:" (1-3 arbitrary characters where the accented
# letter goes) or "Comentario(s):".
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
class QuotesSpider(scrapy.Spider):
Basic Scrapy Spider class
name = "noticias"
def start_requests(self):
    """Request the archive page of the date given as spider arguments.

    Reads the ``year``, ``month`` and ``day`` attributes
    (``-a year=2018 -a month=2 -a day=23``), stores the archive URL in
    ``self.baseURL`` and yields a single request handled by ``parse``.

    Raises:
        ValueError: if any of the three arguments was not supplied.
    """
    names = ("year", "month", "day")
    supplied = {name: getattr(self, name, None) for name in names}
    missing = [name for name in names if supplied[name] is None]
    if missing:
        # Fail with a readable message instead of a "NoneType + str" error.
        raise ValueError("missing spider argument(s): " + ", ".join(missing))

    # str() also accepts integers, e.g. when the spider is started from code.
    year, month, day = (str(supplied[name]) for name in names)
    self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Process the first archive page and schedule the remaining ones.

    The article links of ``response`` are extracted right away, then one
    request per additional listing page is yielded, each handled by
    ``parse_page``.
    """
    # The first page is already downloaded: hand it to parse_page directly
    # instead of requesting the same URL a second time.
    yield from self.parse_page(response)

    pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
    if pagination:
        last_href = pagination[-1].strip('/')
        # The last path segment of the final pagination link is read as the
        # number of pages.
        pages = int(last_href[last_href.rfind('/') + 1:])
        for page in range(2, pages + 1):
            yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)
def parse_page(self, response):
    """Yield one request per article linked from a listing page.

    Each request is handled by ``parse_item``.
    """
    listing = response.css('div.content').css('div.post-listing')
    for link in listing.xpath('./article/h2/a/@href').extract():
        yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
# La fecha obtenida ya incluye formato y zona horaria
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
news_topic = None
for p in response.xpath('//div[@class="entry"]/p').extract():
text += remove_tags(p) + "\n"
if text == '':
for p in response.xpath('//div[@class="entry"]/div/span').extract():
text += remove_tags(p) + "\n"
text = "\n" + text
""" Obtiene autor """
news_author = None
res = AUTH_RE.match(text)
if res:
m = res.group(0)
news_author = m[m.find('Por')+len('Por'):].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina twitter """
news_twitter = None
res = TW_RE.search(text)
if res:
m = res.group(0)
news_twitter = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Obtiene lugar """
news_loc = None
res = LOC_RE.match(text)
if res:
m = res.group(0)
if m[m.find('/') + 1:].strip().lower() != 'dpa':
news_loc = m[:m.find('/')].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina correo """
news_email = None
res = EM_RE.search(text)
if res:
m = res.group(0)
news_email = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
res = EM_RE.search(text)
if res:
m = res.group(0)
news_email = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
text = text.replace("\n@Diario Co Latino\n", '').strip()
text = "\n" + text
text = text.replace("\nDiario Co Latino\n", '').strip()
text = "\n" + text
text = text.replace("\nCo Latino\n", '').strip()
## News item info ##
item['date'] = news_date
item['title'] = news_title
item['topic'] = news_topic
item['author'] = news_author
item['twitter'] = news_twitter
item['location'] = news_loc
item['email'] = news_email
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
default = diarioCoLatino.settings
#url = http://localhost:6800/
project = diarioCoLatino
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Fields stored for one scraped news article."""

    # Headline and body text
    title = scrapy.Field()
    text = scrapy.Field()
    # Publication metadata
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    # Address the article was downloaded from
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class SanpedrosunSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    Every hook below forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Declared as a classmethod because the body instantiates ``cls``.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response that goes through the middleware into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, every request/dict/item the spider produced."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or ``process_spider_input()`` raises.

        Should return either None or an iterable of Response, dict or Item
        objects; returning None leaves the exception to other handlers.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only, no items)."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
class SanpedrosunDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Declared as a classmethod because the body instantiates ``cls``.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Called for each request that goes through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response returned from the downloader.

        Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest

        This implementation returns the response untouched.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or ``process_request()`` raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped article into one JSON array file.

    The output path is read from the ``filename`` setting, i.e. the value
    passed on the command line with ``-s filename=2018-02-23.json``.
    """

    # Keys written for each article, in output order.
    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file the items will be written to."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting of *crawler*."""
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and pass it on unchanged.

        Fields the spider did not populate are left out of the written
        object rather than raising ``KeyError``.
        """
        row = [(key, item[key]) for key in self.FIELDS if key in item]
        line = OrderedDict(row)

        self.counter += 1
        if self.counter == 1:
            # The first object directly follows the opening bracket.
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for sanPedroSun project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'sanPedroSun'
# Modules in which Scrapy looks for spider classes.
SPIDER_MODULES = ['sanPedroSun.spiders']
# Module in which newly generated spiders are created.
NEWSPIDER_MODULE = 'sanPedroSun.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sanPedroSun (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# 'sanPedroSun.middlewares.SanpedrosunSpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# 'sanPedroSun.middlewares.SanpedrosunDownloaderMiddleware': 543,
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The integer is the pipeline's order value.
ITEM_PIPELINES = {
    'sanPedroSun.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from sanPedroSun.items import NoticiasItem
"""
Spider for The San Pedro Sun, Belize.

Usage:

    $ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
# One markup tag: an opening '<', one or more non-'>' characters, a closing '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip all ``<...>`` tags from *text* and return what is left."""
    plain = TAG_RE.sub('', text)
    return plain
# Dateline such as "- Monday, February 23, 2018 - X": optional leading dash,
# optional capitalised word plus comma, capitalised word, optional day number,
# comma, four-digit year, optional " -" or newline, then the first capital
# letter of the following text (that last character is part of the match).
DATE_RE1 = re.compile(r'(-|- )?([A-Z][a-z]+, ?)?[A-Z][a-z]+( \d{1,2})?, \d{4}( -|\n)? ?[A-Z]')
# Shorter dateline fragment of the form ", <word> <1-2 digits> -" (optionally " - -").
DATE_RE2 = re.compile(r', [a-zA-Z]+ \d{1,2} -( -)?')
# By-line: a line starting with "By", optionally preceded by "- ".
AUTH_RE = re.compile(r'\n(- )?By.+\n')
class importantData(scrapy.Item):
    """Small item carrying the pagination counter between requests."""

    # Number of the results page requested last.
    page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Spider that collects the articles The San Pedro Sun published on one day.

    The day is selected with the ``year``, ``month`` and ``day`` spider
    arguments; results are read from the site's ``GetResults.php`` endpoint,
    one page of eight posts at a time.
    """

    name = "noticias"

    # Article links already scheduled; replaced by a per-run set in start_requests.
    globalSet = set()

    def start_requests(self):
        """Request the first results page for the given date."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        self.baseURL = "https://www.sanpedrosun.com/wp-content/themes/sunbase/GetResults.php?year=" + year + "&monthnum=" + month + "&day=" + day + "&posts_per_page=8&n_view=standard&n_style=list"

        # Use a fresh set so the class-level default is never mutated.
        self.globalSet = set()

        searchData = importantData()
        searchData['page'] = 0

        request = scrapy.Request(url=self.baseURL, callback=self.parse)
        request.meta['item'] = searchData
        yield request

    def parse(self, response):
        """Follow the new article links of a results page, then ask for the next page.

        Pagination stops as soon as a page contributes no link that has not
        been seen before.
        """
        localSet = set(response.css('div.entry').xpath('./h2/a/@href').extract())
        resultSet = localSet - self.globalSet

        if len(resultSet) > 0:
            # Record what was scheduled; without this the subtraction above
            # never filters anything and repeated pages would loop forever.
            self.globalSet.update(resultSet)

            searchData = response.meta['item']

            for link in resultSet:
                yield scrapy.Request(url=link, callback=self.parse_item)

            searchData['page'] += 1
            page = searchData['page']

            # Build from the base URL so "n_more" parameters do not pile up.
            request = scrapy.Request(url=self.baseURL + "&n_more=" + str(page), callback=self.parse)
            request.meta['item'] = searchData
            yield request

    def parse_page(self, response):
        """Follow article links found in ``div.item-details`` headings."""
        for link in response.css('div.td-ss-main-content').css('div.item-details').xpath('./h3/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract one article and yield it as a ``NoticiasItem``."""
        item = NoticiasItem()

        # The published date already carries its format and time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        # Guard against pages without a title instead of crashing in remove_tags(None).
        title_html = response.xpath('//div[@class="post"]/h1').extract_first()
        item['title'] = remove_tags(title_html).strip() if title_html is not None else None

        # NOTE(review): the original listed three consecutive assignments to
        # ``topic`` (breadcrumb, category list, None) with their structure
        # lost; they are read here as a fallback chain -- confirm.
        breadcrumb = response.xpath('//div[@class="breadcrumb"]/p/a/text()').extract()
        if breadcrumb:
            topic = breadcrumb[-1]
        else:
            # extract_first() yields None when the category list is absent.
            topic = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
        item['topic'] = topic

        text = "".join(
            remove_tags(p) + "\n"
            for p in response.xpath('//div[@class="entry"]').css('p').extract()
        )
        text = text.strip()
        text = "\n" + text
        text = text.replace(u'\u2013', "-")
        text = text.replace(u'\u00a0', '')  ## Removes no-break spaces

        # Drop everything up to the first dateline; the match's final
        # character (first capital of the body) is kept in the text.
        res = DATE_RE1.search(text)
        if res:
            m = res.group(0)[:-1]
            text = text[text.find(m) + len(m):].strip()
            text = "\n" + text

        res = DATE_RE2.search(text)
        if res:
            m = res.group(0)[:-1]
            text = text[text.find(m) + len(m):].strip()
            text = "\n" + text

        # Drop a by-line when it is the first line of the remaining text.
        res = AUTH_RE.match(text)
        if res:
            m = res.group(0)
            text = text[text.find(m) + len(m):].strip()
            text = "\n" + text

        text = text.replace("Follow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", '')

        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
default = sanPedroSun.settings
#url = http://localhost:6800/
project = sanPedroSun
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
default = tiempoDigitalHn.settings
#url = http://localhost:6800/
project = tiempoDigitalHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Data container describing one scraped news article."""

    # Article content
    title = scrapy.Field()
    text = scrapy.Field()
    # Article metadata
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    # Source address of the article
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TiempodigitalhnSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    Every hook below forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Declared as a classmethod because the body instantiates ``cls``.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response that goes through the middleware into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, every request/dict/item the spider produced."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or ``process_spider_input()`` raises.

        Should return either None or an iterable of Response, dict or Item
        objects; returning None leaves the exception to other handlers.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only, no items)."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
class TiempodigitalhnDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Declared as a classmethod because the body instantiates ``cls``.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Called for each request that goes through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response returned from the downloader.

        Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest

        This implementation returns the response untouched.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or ``process_request()`` raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped article into one JSON array file.

    The output path is read from the ``filename`` setting, i.e. the value
    passed on the command line with ``-s filename=2018-02-23.json``.
    """

    # Keys written for each article, in output order.
    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file the items will be written to."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting of *crawler*."""
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and pass it on unchanged.

        Fields the spider did not populate are left out of the written
        object rather than raising ``KeyError``.
        """
        row = [(key, item[key]) for key in self.FIELDS if key in item]
        line = OrderedDict(row)

        self.counter += 1
        if self.counter == 1:
            # The first object directly follows the opening bracket.
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tiempoDigitalHn project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'tiempoDigitalHn'
# Modules in which Scrapy looks for spider classes.
SPIDER_MODULES = ['tiempoDigitalHn.spiders']
# Module in which newly generated spiders are created.
NEWSPIDER_MODULE = 'tiempoDigitalHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tiempoDigitalHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# 'tiempoDigitalHn.middlewares.TiempodigitalhnSpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# 'tiempoDigitalHn.middlewares.TiempodigitalhnDownloaderMiddleware': 543,
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The integer is the pipeline's order value.
ITEM_PIPELINES = {
    'tiempoDigitalHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from tiempoDigitalHn.items import NoticiasItem
"""
Spider for Tiempo Digital, Honduras.

Usage:

    $ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
# A single markup tag, from '<' to the nearest '>' with at least one character between.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Delete every ``<...>`` tag found in *text* and return the result."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
# Dateline made only of upper-case words (each followed by a space) and
# closed by "." or "-", at the very start of a line.
LOC_RE1 = re.compile(r'\n([A-Z]+ )+ ?[.-]')
# Looser dateline: shortest leading text closed by ".-", "." or "-" and
# followed by a capital letter (that letter is part of the match).
LOC_RE2 = re.compile(r'\n.+?,? ?.+? ?(\. ?-|\.|-) ?[A-Z]')
# Trailing "Fuente:" (source) credit running to the end of the text.
SOURCE_RE = re.compile(r'\n ?Fuente:.+$')
class QuotesSpider(scrapy.Spider):
    """Spider that collects the articles Tiempo Digital published on one day.

    The day is selected with the ``year``, ``month`` and ``day`` spider
    arguments (``-a year=2018 -a month=2 -a day=23``).
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily-archive URL and request its first page."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        # Month and day are zero-padded to two digits: /YYYY/MM/DD
        self.baseURL = "https://tiempo.hn/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule every listing page of the daily archive.

        The first page is requested again with ``dont_filter=True`` so it is
        processed by ``parse_page`` like the remaining pages.
        """
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.page-nav').xpath('./a/@href').extract()
        if len(pagination) > 0:
            # Prefer the second-to-last link and fall back to the last one
            # when only a single link exists. The original applied [-2] and
            # then [-1] in sequence, which indexed a character of the URL.
            # NOTE(review): presumably the final link is a "next" arrow -- confirm.
            last_link = pagination[-2] if len(pagination) >= 2 else pagination[-1]
            last_link = last_link.strip('/')
            pages = int(last_link[last_link.rfind('/') + 1:])

            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        """Follow the link of every article listed on an archive page."""
        for link in response.css('div.td-ss-main-content').css('div.td_module_1').xpath('./h3/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract one article and yield it as a ``NoticiasItem``."""
        item = NoticiasItem()

        # The published date already carries its format and time zone.
        item['date'] = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()

        # Guard against pages without a title instead of crashing in remove_tags(None).
        title_html = response.xpath('//header[@class="td-post-title"]/h1').extract_first()
        item['title'] = remove_tags(title_html).strip() if title_html is not None else None

        # Topic is the last category entry, or None when the list is missing.
        # NOTE(review): the original computed the topic and then assigned
        # None unconditionally; read here as a lost fallback -- confirm.
        categories = response.xpath('//ul[@class="td-category"]/li').extract()
        item['topic'] = remove_tags(categories[-1]) if categories else None

        author = response.xpath('//div[@class="td-post-author-name"]/a').extract_first()
        if author is not None:
            item['author'] = remove_tags(author)

        text = "".join(
            remove_tags(p) + "\n"
            for p in response.xpath('//div[@class="td-post-content"]').css('p').extract()
        )
        text = text.strip()
        text = "\n" + text
        text = text.replace(u'\u2013', "-")
        text = text.replace(u'\u00a0', '')  ## Removes no-break spaces

        # Dateline location, first the strict all-caps form, then the loose one.
        # A candidate longer than 25 characters is not treated as a location
        # and is left in the text.
        # NOTE(review): nesting reconstructed from unindented source -- confirm
        # that the text is only trimmed when the location is accepted.
        res = LOC_RE1.match(text)
        if res:
            m = res.group(0)[:-1]
            location = m.replace("-", '').strip()
            if len(location) <= 25:
                item['location'] = location
                text = text.replace(m, '').strip()
                text = "\n" + text

        res = LOC_RE2.match(text)
        if res:
            m = res.group(0)[:-1]
            location = m.replace("-", '').replace(".", '').strip()
            if len(location) <= 25:
                item['location'] = location
                text = text.replace(m, '').strip()
                text = "\n" + text

        # Remove the trailing "Fuente:" credit.
        res = SOURCE_RE.search(text)
        if res:
            m = res.group(0)
            text = text.replace(m, '').strip()
            text = "\n" + text

        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
default = tribunaHn.settings
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Record holding the data scraped from one news article."""

    # What the article says
    title = scrapy.Field()
    text = scrapy.Field()
    # When, where, by whom and about what
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    # Where the article was fetched from
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunahnSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.
    Every hook below forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Declared as a classmethod because the body instantiates ``cls``.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response that goes through the middleware into the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield, unchanged, every request/dict/item the spider produced."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or ``process_spider_input()`` raises.

        Should return either None or an iterable of Response, dict or Item
        objects; returning None leaves the exception to other handlers.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only, no items)."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
class TribunahnDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Declared as a classmethod because the body instantiates ``cls``.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Called for each request that goes through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Called with the response returned from the downloader.

        Must either:
        - return a Response object
        - return a Request object
        - or raise IgnoreRequest

        This implementation returns the response untouched.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Called when a download handler or ``process_request()`` raises.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped article into one JSON array file.

    The output path is read from the ``filename`` setting, i.e. the value
    passed on the command line with ``-s filename=noticias.json``.
    """

    # Keys written for each article, in output order.
    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file the items will be written to."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting of *crawler*."""
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and release the file handle."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and pass it on unchanged.

        Fields the spider did not populate are left out of the written
        object rather than raising ``KeyError``.
        """
        row = [(key, item[key]) for key in self.FIELDS if key in item]
        line = OrderedDict(row)

        self.counter += 1
        if self.counter == 1:
            # The first object directly follows the opening bracket.
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'tribunaHn'
# Modules in which Scrapy looks for spider classes.
SPIDER_MODULES = ['tribunaHn.spiders']
# Module in which newly generated spiders are created.
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
# Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default)
# Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# 'tribunaHn.middlewares.TribunahnDownloaderMiddleware': 543,
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The integer is the pipeline's order value.
ITEM_PIPELINES = {
    'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
Spider for La Tribuna, Honduras.

Usage -- get all the news published on a specific date:

    $ cd tribunaHn/
    $ scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=28
"""
import scrapy, re
from tribunaHn.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
# Pattern for one markup tag: '<' ... '>' with no '>' in between.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return a copy of *text* from which all ``<...>`` tags were removed."""
    tagless = TAG_RE.sub('', text)
    return tagless
class UTC(tzinfo):
    """Fixed-offset time zone for Honduras (UTC-6)."""

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        ## Time zone for Honduras: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the zone."""
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero adjustment: the offset is fixed, so there is no DST.

        Defined so datetime methods that consult ``dst()`` (for example
        ``timetuple()``) work instead of raising ``NotImplementedError``.
        """
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """Spider that collects the articles La Tribuna published on one day.

    The day is selected with the ``year``, ``month`` and ``day`` spider
    arguments; every item is stamped with that date at midnight, UTC-6.
    """

    name = "noticias"

    def start_requests(self):
        """Compute the item date and request the daily archive page."""
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        # The date comes from the spider arguments, not from the article pages.
        self.news_date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')

        baseURL = 'http://www.latribuna.hn/' + year + '/' + month + '/' + day

        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        """Follow every article link of a listing page, then the next page."""
        for link in response.xpath('//div[@id="main"]').css('h3 > a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

        next_page = response.css('span.next > a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        """Extract one article and yield it as a ``NoticiasItem``."""
        item = NoticiasItem()

        title = response.css('article.article-post').css('h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        # Topic is the first tag listed for the article, when there is one.
        topic = None
        topic_list = response.css('aside.tags').css('li > a').extract()
        if topic_list:
            topic = remove_tags(topic_list[0])

        text = ''.join(
            remove_tags(p) + '\n'
            for p in response.css('div.article-post-content').css('p').extract()
        )

        ## News item info ##
        item['date'] = self.news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
class NoticiasItem(scrapy.Item): class HeraldoagsItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
date = scrapy.Field()
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware # Define here the models for your spider middleware
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class HeraldoagsSpiderMiddleware(object): class HeraldoagsSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
...@@ -20,30 +21,29 @@ class HeraldoagsSpiderMiddleware(object): ...@@ -20,30 +21,29 @@ class HeraldoagsSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s return s
def process_spider_input(response, spider): def process_spider_input(self, response, spider):
# Called for each response that goes through the spider # Called for each response that goes through the spider
# middleware and into the spider. # middleware and into the spider.
# Should return None or raise an exception. # Should return None or raise an exception.
return None return None
def process_spider_output(response, result, spider): def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after # Called with the results returned from the Spider, after
# it has processed the response. # it has processed the response.
# Must return an iterable of Request, dict or Item objects. # Must return an iterable of Request, or item objects.
for i in result: for i in result:
yield i yield i
def process_spider_exception(response, exception, spider): def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method # Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict # Should return either None or an iterable of Request or item objects.
# or Item objects.
pass pass
def process_start_requests(start_requests, spider): def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works # Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except # similarly to the process_spider_output() method, except
# that it doesn’t have a response associated. # that it doesn’t have a response associated.
...@@ -54,3 +54,50 @@ class HeraldoagsSpiderMiddleware(object): ...@@ -54,3 +54,50 @@ class HeraldoagsSpiderMiddleware(object):
def spider_opened(self, spider): def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name) spider.logger.info('Spider opened: %s' % spider.name)
class HeraldoagsDownloaderMiddleware:
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Scrapy calls this on the class itself, so it must be a classmethod.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Let every request continue through the downloader unchanged."""
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded response unchanged."""
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        """Return None so that exception processing continues."""
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name # useful for handling different item types with a single interface
return cls(filename) from itemadapter import ItemAdapter
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
def close_spider(self, spider):
class HeraldoagsPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
row.append(("date", item['date']))
row.append(("topic", item['topic']))
row.append(("title", item['title']))
row.append(("author", item['author']))
row.append(("location", item['location']))
row.append(("text", item['text']))
row.append(("url", item['url']))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
# -*- coding: utf-8 -*-
# Scrapy settings for heraldoAgs project # Scrapy settings for heraldoAgs project
# #
# For simplicity, this file contains only settings considered important or # For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation: # commonly used. You can find more settings consulting the documentation:
# #
# http://doc.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'heraldoAgs' BOT_NAME = 'heraldoAgs'
SPIDER_MODULES = ['heraldoAgs.spiders'] SPIDER_MODULES = ['heraldoAgs.spiders']
NEWSPIDER_MODULE = 'heraldoAgs.spiders' NEWSPIDER_MODULE = 'heraldoAgs.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)' #USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False ...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#} #}
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# 'heraldoAgs.middlewares.HeraldoagsSpiderMiddleware': 543, # 'heraldoAgs.middlewares.HeraldoagsSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# 'heraldoAgs.middlewares.MyCustomDownloaderMiddleware': 543, # 'heraldoAgs.middlewares.HeraldoagsDownloaderMiddleware': 543,
#} #}
# Enable or disable extensions # Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
'heraldoAgs.pipelines.JsonWriterPipeline': 300, # 'heraldoAgs.pipelines.HeraldoagsPipeline': 300,
} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay # The initial download delay
...@@ -82,7 +80,7 @@ ITEM_PIPELINES = { ...@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_DIR = 'httpcache'
