update

parent b3970bbd
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#Author: Mario Chirinos Colunga
#===============================================================================
import sys
import os
from pathlib import Path
import datetime
#from myModule import myModule
import json
#===============================================================================
def findLastDate(directory):
    """Return the date of the newest ``YYYY-MM-DD.json`` file below *directory*.

    Every sub-directory of *directory* is inspected, from the highest name to
    the lowest, and the first one holding at least one ``*.json`` file decides
    the result.  Empty sub-directories are skipped.

    Args:
        directory: Root folder to inspect, with or without a trailing slash.

    Returns:
        A ``datetime.datetime`` parsed from the newest file name, or ``None``
        when no sub-directory contains a JSON file.

    Raises:
        ValueError: If the newest JSON file name is not a ``%Y-%m-%d`` date.
    """
    print(directory)
    root = Path(directory)
    year_dirs = sorted(e.name for e in root.iterdir() if e.is_dir())
    # Walk from the last folder backwards; a bounded loop avoids running off
    # the front of the list when every folder is empty.
    for year in reversed(year_dirs):
        stems = sorted(e.stem for e in (root / year).glob("*.json"))
        if stems:
            return datetime.datetime.strptime(stems[-1], '%Y-%m-%d')
    return None
#===============================================================================
def updateDir(directory, cfg=None, endDate=None):
    """Run the ``noticias`` spider once per day from the last stored date to *endDate*.

    The start date is whatever ``findLastDate(directory)`` reports; that day
    is crawled again.  When nothing has been downloaded yet the start date is
    read from ``cfg["startDate"]``.  Each day's output goes to
    ``<directory>/<year>/<YYYY-MM-DD>.json``.

    Args:
        directory: Root output folder.
        cfg: Mapping with a ``"startDate"`` entry (``YYYY-MM-DD``); only
            needed when *directory* holds no JSON files yet.
        endDate: Last day to crawl (``datetime.datetime``).  Defaults to the
            moment of the call.

    Raises:
        ValueError: If no start date can be determined because *cfg* is None.
        FileNotFoundError: If the ``scrapy`` executable cannot be found.
    """
    # Local import keeps this block self-contained.
    import subprocess

    # Resolve the default here; a default in the signature would be frozen at
    # import time.
    if endDate is None:
        endDate = datetime.datetime.now()
    startDate = findLastDate(directory)
    print(startDate, endDate)
    if startDate is None:
        if cfg is None:
            raise ValueError("no previous download found and no cfg['startDate'] given")
        startDate = datetime.datetime.strptime(cfg["startDate"], '%Y-%m-%d')
    delta = endDate - startDate
    for i in range(delta.days + 1):
        day = startDate + datetime.timedelta(days=i)
        yeardir = os.path.join(directory, str(day.year))
        os.makedirs(yeardir, exist_ok=True)
        print(day)
        outfile = os.path.join(yeardir, day.strftime('%Y-%m-%d') + ".json")
        # Argument list (no shell) so paths with spaces or shell
        # metacharacters are passed through verbatim.  The exit status is
        # ignored, as it was with os.system.
        subprocess.run([
            "scrapy", "crawl", "noticias", "--nolog",
            "-O", outfile,
            "-a", "year=" + str(day.year),
            "-a", "month=" + str(day.month),
            "-a", "day=" + str(day.day),
        ])
#===============================================================================
def main(argv):
    """Command-line entry point: ``<directory> [endDate:YYYY-MM-DD]``.

    Loads ``settings.json`` from the given directory and hands it to
    ``updateDir``; the optional second argument limits the crawl to that end
    date.  Prints a usage line when the argument count is wrong.

    Args:
        argv: Full argument vector, program name first.
    """
    if len(argv) not in (2, 3):
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
        return
    with open(os.path.join(argv[1], 'settings.json')) as json_file:
        cfg = json.load(json_file)
    if len(argv) == 3:
        updateDir(argv[1], cfg, datetime.datetime.strptime(argv[2], '%Y-%m-%d'))
    else:
        updateDir(argv[1], cfg)


if __name__ == "__main__":
    main(sys.argv)
#!/usr/bin/python3
import sys
import os
import json
from pathlib import Path
import chardet
#from myModule import myModule
#===============================================================================
def ascii2utf8(inputfilename, outputfilename):
    """Rewrite a JSON file so non-ASCII characters are stored as UTF-8 text.

    The input is parsed (``\\uXXXX`` escapes are decoded by the JSON parser)
    and dumped again with ``ensure_ascii=False`` and an indent of 1.

    Args:
        inputfilename: Path of the JSON file to read.
        outputfilename: Path of the JSON file to write (overwritten).
    """
    print(inputfilename)
    with open(inputfilename, encoding="utf-8") as json_file:
        data = json.load(json_file)
    print(data)
    # Explicit encoding: without it the platform default is used, which may
    # not be able to represent the unescaped characters.
    with open(outputfilename, 'w', encoding="utf-8") as outfile:
        json.dump(data, outfile, ensure_ascii=False, indent=1)
#===============================================================================
def copyDirStructure(indir, outdir):
    """Mirror *indir* below *outdir*, converting every JSON file with ``ascii2utf8``.

    A folder named like *indir* is created inside *outdir*; each direct
    sub-directory of *indir* is recreated there and its ``*.json`` files are
    converted into it.  Files directly inside *indir* and deeper nesting
    levels are not processed.

    Args:
        indir: Source folder, with or without a trailing slash.
        outdir: Existing destination folder, with or without a trailing slash.
    """
    print(indir)
    source = Path(indir)
    # List the sub-directories before creating anything, as the original did.
    subdirs = [e.name for e in source.iterdir() if e.is_dir()]
    target = Path(outdir) / source.name
    target.mkdir(exist_ok=True)
    for d in subdirs:
        print(source.name, d)
        yeardir = target / d
        yeardir.mkdir(exist_ok=True)
        for f in (source / d).glob("*.json"):
            ascii2utf8(str(f), str(yeardir / f.name))
#===============================================================================
def main(argv):
    """Command-line entry point: ``<input dir> <output dir>``.

    Args:
        argv: Full argument vector, program name first.
    """
    # Validate the vector that was passed in, not the global sys.argv.
    if len(argv) != 3:
        print("Usage: " + argv[0] + " <input dir> <output dir>")
    else:
        copyDirStructure(argv[1], argv[2])


if __name__ == "__main__":
    main(sys.argv)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article; each attribute below declares one item field."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class AlchileSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged; the only side effect is a log
    line when a spider opens.  The hooks are instance methods and therefore
    take ``self`` as their first parameter.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write scraped items as a JSON array to the file named by the ``filename`` setting.

    The file is opened when the spider opens and closed when it closes; items
    are written one per line, comma-separated, between ``[`` and ``]``.  When
    no ``filename`` setting is supplied the pipeline writes nothing and simply
    passes items through.
    """

    # Keys are emitted in this order; keys an item does not carry are skipped.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.file = None
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``filename`` setting (e.g. ``-s filename=...``)."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = None
        if self.filename is None:
            # Nothing to write to; stay inert instead of failing on open(None).
            return
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged."""
        if self.file is None:
            return item
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Field not populated for this item: leave it out.
                continue
        line = OrderedDict(row)
        self.counter += 1
        # Every record after the first is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for alChile project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds this project's
# spiders (used both for discovery and as the target for new spiders).
BOT_NAME = 'alChile'
SPIDER_MODULES = ['alChile.spiders']
NEWSPIDER_MODULE = 'alChile.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'alChile (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle: delay of 0.5 between requests to the same website.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'alChile.middlewares.AlchileSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'alChile.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Route every scraped item through JsonWriterPipeline (order value 300).
# That pipeline reads its output path from the `filename` setting.
ITEM_PIPELINES = {
'alChile.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
"""
MEDIO:
Al Chile, Yucatan
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Matches one markup tag: '<', at least one non-'>' character, then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped
class QuotesSpider(scrapy.Spider):
    """Crawl the alchile.com.mx archive of one day and emit its articles.

    The ``year``, ``month`` and ``day`` spider arguments (``-a year=...
    -a month=... -a day=...``) are concatenated into the archive URL as
    given, so all three must be supplied.
    """

    name = "noticias"

    def start_requests(self):
        """Build the day's archive URL and request it."""
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first listing page and every further paginated page."""
        # The same URL is requested again for parse_page, hence dont_filter.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
        # The page count is read from the second-to-last pagination link
        # (presumably the last numbered page), so two links are required.
        if len(pagination) > 1:
            last_link = pagination[-2].strip('/')
            pages = int(last_link[last_link.rfind('/') + 1:])
            for page in range(2, pages + 1):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Request every article linked from a listing page."""
        for link in response.css('div.td-block-span6').css('h3.entry-title').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract title, date, topic, text and URL from an article page."""
        item = NoticiasItem()
        item['title'] = response.css('header.td-post-title').css('h1.entry-title::text').extract_first()
        d = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
        # '-06:00' is UTC-6, the time zone of Yucatan (central Mexico).
        # A missing date is stored as None instead of aborting the item.
        if d is not None and d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['topic'] = response.css('div.td-post-header').css('a::text').extract_first()
        paragraphs = response.css('div.td-post-content').css('p').extract()
        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
        item['url'] = response.url
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = alChile.settings
[deploy]
#url = http://localhost:6800/
project = alChile
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article; each attribute below declares one item field."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class CampechehoySpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged; the only side effect is a log
    line when a spider opens.  The hooks are instance methods and therefore
    take ``self`` as their first parameter.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write scraped items as a JSON array to the file named by the ``filename`` setting.

    The file is opened when the spider opens and closed when it closes; items
    are written one per line, comma-separated, between ``[`` and ``]``.  When
    no ``filename`` setting is supplied the pipeline writes nothing and simply
    passes items through.
    """

    # Keys are emitted in this order; keys an item does not carry are skipped.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.file = None
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``filename`` setting (e.g. ``-s filename=...``)."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = None
        if self.filename is None:
            # Nothing to write to; stay inert instead of failing on open(None).
            return
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged."""
        if self.file is None:
            return item
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Field not populated for this item: leave it out.
                continue
        line = OrderedDict(row)
        self.counter += 1
        # Every record after the first is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for campecheHoy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds this project's
# spiders (used both for discovery and as the target for new spiders).
BOT_NAME = 'campecheHoy'
SPIDER_MODULES = ['campecheHoy.spiders']
NEWSPIDER_MODULE = 'campecheHoy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'campecheHoy (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle: delay of 0.5 between requests to the same website.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'campecheHoy.middlewares.CampechehoySpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'campecheHoy.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Route every scraped item through JsonWriterPipeline (order value 300).
# That pipeline reads its output path from the `filename` setting.
ITEM_PIPELINES = {
'campecheHoy.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
"""
MEDIO:
Campeche Hoy, Campeche
USO:
scrapy crawl noticias --nolog -s filename=2018-01-17.json -a year=2018 -a month=1 -a day=17
"""
# Matches one markup tag: '<', at least one non-'>' character, then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped
class QuotesSpider(scrapy.Spider):
    """Crawl the campechehoy.mx archive of one day and emit its articles.

    The ``year``, ``month`` and ``day`` spider arguments (``-a year=...
    -a month=... -a day=...``) are concatenated into the archive URL as
    given, so all three must be supplied.
    """

    name = "noticias"

    def start_requests(self):
        """Build the day's archive URL and request it."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "http://campechehoy.mx/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first listing page and every further paginated page."""
        # The same URL is requested again for parse_page, hence dont_filter.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
        # The page count is read from the second-to-last pagination link
        # (presumably the last numbered page), so two links are required.
        if pagination is not None and len(pagination) > 1:
            last_link = pagination[-2].rstrip("/")
            pages = int(last_link[last_link.rfind("/") + 1:])
            for page in range(2, pages + 1):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Request every article linked from a listing page."""
        for link in response.xpath('//*[@class="td-pb-span8 td-main-content"]').css('h3').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract date, title, topic, text and URL from an article page."""
        item = NoticiasItem()
        # The scraped date already includes its format and time zone.
        item['date'] = response.xpath('//span[@class="td-post-date"]/time/@datetime').extract_first()
        title_html = response.xpath('//header/h1[@class="entry-title"]').extract_first()
        # A missing heading is stored as None instead of aborting the item.
        item['title'] = remove_tags(title_html) if title_html is not None else None
        # Prefer the second tag link; fall back to the first, then to None.
        topics = response.xpath('//*[@class="td-post-source-tags td-pb-padding-side"]/ul/li/a/text()').extract()
        if len(topics) > 1:
            item['topic'] = topics[1]
        else:
            item['topic'] = topics[0] if topics else None
        text = ''
        for p in response.xpath('//*[@class="td-post-content td-pb-padding-side"]/p').extract():
            p = remove_tags(p)
            # Unescape angle brackets, then strip again so that markup which
            # was entity-escaped in the page is removed as well.
            p = p.replace("&lt;", "<")
            p = p.replace("&gt;", ">")
            text += remove_tags(p) + "\n"
        item['text'] = text
        item['url'] = response.url
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = campecheHoy.settings
[deploy]
#url = http://localhost:6800/
project = campecheHoy
This diff is collapsed.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article; each attribute below declares one item field."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ChiapashoySpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged; the only side effect is a log
    line when a spider opens.  The hooks are instance methods and therefore
    take ``self`` as their first parameter.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write scraped items as a JSON array to the file named by the ``filename`` setting.

    The file is opened when the spider opens and closed when it closes; items
    are written one per line, comma-separated, between ``[`` and ``]``.  When
    no ``filename`` setting is supplied the pipeline writes nothing and simply
    passes items through.
    """

    # Keys are emitted in this order; keys an item does not carry are skipped.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename
        self.file = None
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``filename`` setting (e.g. ``-s filename=...``)."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = None
        if self.filename is None:
            # Nothing to write to; stay inert instead of failing on open(None).
            return
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the file."""
        if self.file is None:
            return
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged."""
        if self.file is None:
            return item
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Field not populated for this item: leave it out.
                continue
        line = OrderedDict(row)
        self.counter += 1
        # Every record after the first is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for chiapasHoy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name and the package that holds this project's
# spiders (used both for discovery and as the target for new spiders).
BOT_NAME = 'chiapasHoy'
SPIDER_MODULES = ['chiapasHoy.spiders']
NEWSPIDER_MODULE = 'chiapasHoy.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'chiapasHoy (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle: delay of 0.5 between requests to the same website.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'chiapasHoy.middlewares.ChiapashoySpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'chiapasHoy.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Route every scraped item through JsonWriterPipeline (order value 300).
# That pipeline reads its output path from the `filename` setting.
ITEM_PIPELINES = {
'chiapasHoy.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from chiapasHoy.items import NoticiasItem
import datetime
"""
MEDIO:
Chiapas Hoy, Chiapas
USO:
scrapy crawl noticias --nolog -s filename=2018-01-25.json -a year=2018 -a month=1 -a day=25
"""
# Matches one markup tag: '<', at least one non-'>' character, then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every ``<...>`` tag deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped
# LOC_RE: anchored at the start of the string; lazily matches leading text,
# an optional "<day> <word> <year>" date (space- or hyphen-separated), then
# a ".-" separator with optional surrounding whitespace.
# NOTE(review): presumably the dateline prefix of an article; not used in
# the code visible here - confirm before removing.
LOC_RE = re.compile(r'\A.+?(\d{1,2}[\s-][a-zA-Z]+[\s-]\d{4})?\s?\.\s?-\s?')
# DAT_RE: an optional ',' or ';', an optional "[a] <day> de <word> de <year>"
# date, then the same ".-" separator.  Also unused in the visible code.
DAT_RE = re.compile(r'[,;]?(\sa?\s?\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}\s?)?\.\s?-\s?')
class QuotesSpider(scrapy.Spider):
    """Crawl the chiapashoy.com.mx archive of one day and emit its articles.

    The ``year``, ``month`` and ``day`` spider arguments (``-a year=...
    -a month=... -a day=...``) select the archive URL; month and day are
    zero-padded to two digits.  The resulting date is forwarded to every
    callback through ``cb_kwargs``.
    """

    name = "noticias"

    def start_requests(self):
        """Build the day's archive URL and request it."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        date = datetime.date(int(year), int(month), int(day))
        self.baseURL = "http://www.chiapashoy.com.mx/notashoy/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
        yield scrapy.Request(url=self.baseURL, callback=self.parse, cb_kwargs={"date": date})

    def parse(self, response, **kwargs):
        """Request every article on a listing page, then follow the next page."""
        links = response.css('article').css('h3').css('a::attr(href)').extract()
        print(links)
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_item, cb_kwargs=kwargs)
        nextPage = response.xpath('//*[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
        if nextPage:
            yield scrapy.Request(url=nextPage, callback=self.parse, cb_kwargs=kwargs)

    def parse_item(self, response, **kwargs):
        """Extract date, title, topic, author, text and URL from an article page."""
        item = NoticiasItem()
        # The date comes from the crawl arguments, not from the page.
        item['date'] = kwargs["date"].strftime('%Y/%m/%d')
        item['title'] = response.css("h1.entry-title::text").extract_first()
        topic = response.css('li.meta-category').css('a::text').extract_first()
        # A missing category is stored as None instead of aborting the item.
        if topic is not None:
            topic = topic.replace(" ", "").replace("\n", "")
        item['topic'] = topic
        paragraphs = response.css("article").css("div.entry-content").css("p").extract()
        # The last paragraph is taken as the author (presumably the byline);
        # with no paragraphs the field is simply left unset.
        if paragraphs:
            item['author'] = remove_tags(paragraphs[-1])
        item['text'] = "".join(remove_tags(p) + "\n" for p in paragraphs).strip()
        item['url'] = response.url
        print(item)
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = chiapasHoy.settings
[deploy]
#url = http://localhost:6800/
project = chiapasHoy
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scraped news article; each attribute below declares one item field."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class CuartopoderItem(scrapy.Item):
    """Scraped news article; each attribute below declares one item field."""

    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class CuartopoderSpiderMiddleware(object):
class CuartopoderSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
......@@ -31,7 +32,7 @@ class CuartopoderSpiderMiddleware(object):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
......@@ -39,8 +40,7 @@ class CuartopoderSpiderMiddleware(object):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
......@@ -56,7 +56,7 @@ class CuartopoderSpiderMiddleware(object):
spider.logger.info('Spider opened: %s' % spider.name)
class CuartopoderDownloaderMiddleware(object):
class CuartopoderDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
......
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
    # Path of the output file; it is only opened later, in open_spider().
    self.filename = filename
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
@classmethod
def from_crawler(cls, crawler):
    """Build the pipeline from the crawler's settings.

    The output path is read from the ``filename`` setting (the value given
    on the command line); it is ``None`` when that setting was not supplied.
    """
    return cls(crawler.settings.get('filename'))
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
def open_spider(self, spider):
    """Reset the item counter, open the output file and start the JSON array."""
    self.counter = 0
    self.file = output = open(self.filename, 'w')
    output.write("[")
def close_spider(self, spider):
    """Write the closing bracket of the JSON array and close the output file."""
    output = self.file
    output.write("]")
    output.close()
class CuartopoderPipeline:
def process_item(self, item, spider):
    """Append one scraped item to the output file as a JSON object.

    Fields are written in a fixed order (date, topic, title, author,
    location, text, url); a field the spider never populated is left out of
    the object. Objects after the first are preceded by ``",\\n"`` so the
    file forms a JSON array together with the brackets written when the
    spider opens and closes.

    Returns the item unchanged so later pipelines can still process it.
    """
    row = []
    for key in ("date", "topic", "title", "author", "location", "text", "url"):
        try:
            row.append((key, item[key]))
        except KeyError:
            # Only "field not set" is expected here; any other error is a
            # real bug and must not be silently swallowed.
            continue

    line = OrderedDict(row)

    self.counter += 1
    if self.counter == 1:
        self.file.write(json.dumps(line))
    elif self.counter > 1:
        self.file.write(",\n" + json.dumps(line))

    return item
# -*- coding: utf-8 -*-
# Scrapy settings for cuartoPoder project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'cuartoPoder'
SPIDER_MODULES = ['cuartoPoder.spiders']
NEWSPIDER_MODULE = 'cuartoPoder.spiders'
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cuartoPoder (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'cuartoPoder.middlewares.CuartopoderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'cuartoPoder.middlewares.CuartopoderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'cuartoPoder.pipelines.JsonWriterPipeline': 300,
}
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'cuartoPoder.pipelines.CuartopoderPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
# -*- coding: utf-8 -*-
"""
MEDIA:
Cuarto Poder, Chiapas
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd cuartoPoder/
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re
from cuartoPoder.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
import scrapy
import json
import datetime
from cuartoPoder.items import CuartopoderItem
#-------------------------------------------------------------------------------
import re
# Matches one markup tag: "<", at least one non-">" character, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed."""
    stripped = TAG_RE.sub('', text)
    return stripped
class UTC(tzinfo):
    """
    Fixed-offset time zone (UTC-6) attached to the scraped dates.

    Daylight saving time is not modelled: the offset is always six hours
    behind UTC and ``dst()`` always reports zero.
    """
    def utcoffset(self, dt):
        ## Time zone for Chiapas: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        ## Without this override tzinfo.dst() raises NotImplementedError,
        ## which breaks datetime methods such as timetuple(). ##
        return timedelta(0)
class ImportantData(scrapy.Item):
    """
    Bookkeeping passed between requests (in ``request.meta['item']``) to
    drive the listing/pagination flow of the spider.
    """
    # True when the next call to parse() must request the following listing page.
    to_next_page = scrapy.Field()
    # True for the request built from the last link of a listing page.
    is_last_link = scrapy.Field()
    # Number of the listing page to request next.
    next_page = scrapy.Field()
    # URL of the listing page the article link was found on.
    return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
    """Initialise the crawl state and request the first listing page.

    Reads the ``year``, ``month`` and ``day`` spider arguments, stores the
    derived helpers on the spider (time zone, target date, base URLs and the
    Spanish month-name table) and yields the request for the first archive
    page of that day.
    """
    self.tz = UTC()
    self.year = getattr(self, "year", None)
    self.month = getattr(self, "month", None)
    self.day = getattr(self, "day", None)
    # parse_item() only keeps articles dated exactly on this day.
    self.stop_date = date(int(self.year), int(self.month), int(self.day))

    self.baseURL = "http://www.cuartopoder.mx"

    padded_month = self.month.zfill(2)
    padded_day = self.day.zfill(2)
    first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, padded_month, padded_day)
    self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"

    # Spanish month name -> month number, used to parse the article date.
    self.month_parser = {
        "Enero": 1, "Mayo": 5, "Septiembre": 9,
        "Febrero": 2, "Junio": 6, "Octubre": 10,
        "Marzo": 3, "Julio": 7, "Noviembre": 11,
        "Abril": 4, "Agosto": 8, "Diciembre": 12,
    }

    # The first page is fetched here, so pagination resumes at page 2.
    initial_state = ImportantData(to_next_page=False, next_page=2)
    yield scrapy.Request(url=first_URL, callback=self.parse, meta={'item': initial_state})
def parse(self, response):
    """Either follow every article link of a listing page or request the next page.

    The ``ImportantData`` carried in ``response.meta['item']`` selects the
    mode: when ``to_next_page`` is set, the following listing page is
    requested; otherwise one request per article link is yielded, each with
    its own flow record.
    """
    state = response.meta['item']
    page = state['next_page']

    if state['to_next_page']:
        page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))
        state['to_next_page'] = False
        state['next_page'] += 1
        yield scrapy.Request(url=page_URL, callback=self.parse, meta={'item': state})
        return

    link_list = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
    for link in link_list:
        link_state = ImportantData()
        link_state['next_page'] = page
        link_state['return_url'] = response.url
        # Compared by value, so any link equal to the last one is flagged too.
        link_state['is_last_link'] = link == link_list[-1]

        yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item, meta={'item': link_state})
def parse_item(self, response):
news_date = response.css('ul.metas-list > li > p').extract_first()
news_date = remove_tags(news_date)
news_date = news_date.split(u'\xa0')
news_date[1] = news_date[1].strip().replace(",", '')
news_date = date(int(self.year), self.month_parser[news_date[0]], int(news_date[1]))
if news_date == self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
title = response.css('div.post-title').css('h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.post-content').css('p').extract():
p = remove_tags(p)
text += p + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title.strip()
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
return TAG_RE.sub('', text)
#-------------------------------------------------------------------------------
class NoticiasSpider(scrapy.Spider):
    """Spider that collects the cuartopoder.mx articles listed for one day.

    The day comes from the ``year``, ``month`` and ``day`` spider arguments.
    """
    name = 'noticias'
    allowed_domains = ['cuartopoder.mx']
    start_urls = ['https://cuartopoder.mx/']

    #-----------------------------------------------------------------------
    def start_requests(self):
        """Request page 1 of the paginated archive listing for the day."""
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        # The archive endpoint expects the date as MM-DD-YYYY.
        date = "-".join((self.month.zfill(2), self.day.zfill(2), self.year))
        print(date)

        listing_url = (
            self.start_urls[0]
            + "XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&eids=&fd="
            + date + "&fh=" + date + "&id=portada&p=1"
        )
        yield scrapy.Request(url=listing_url, callback=self.parsePage)

    #-----------------------------------------------------------------------
    def parsePage(self, response):
        """Follow every article on a listing page, then request the next page.

        Pagination stops at the first listing page that has no article links.
        """
        # The page number is everything after "&p=" at the end of the URL.
        cut = response.url.index("&p=") + 3
        next_url = response.url[:cut] + str(int(response.url[cut:]) + 1)

        links = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
        print(response.url)
        print(len(links))

        if not links:
            return

        site_root = self.start_urls[0][:-1]  # start URL without its trailing "/"
        for href in links:
            yield scrapy.Request(url=site_root + href, callback=self.parse)
        yield scrapy.Request(url=next_url, callback=self.parsePage)

    #-----------------------------------------------------------------------
    def parse(self, response):
        """Build one CuartopoderItem from an article page."""
        item = CuartopoderItem()

        # The item is stamped with the requested day, not a date read from the page.
        day_text = self.year + "-" + self.month.zfill(2) + "-" + self.day.zfill(2)
        item["date"] = datetime.datetime.strptime(day_text, '%Y-%m-%d').isoformat()
        item["title"] = response.xpath("//meta[@property='og:title']/@content").extract_first()
        item["topic"] = response.css('div.big-title').xpath('./h2/a/span//text()').extract_first()
        item["text"] = response.xpath("//meta[@name='Description']/@content").extract_first()
        item["url"] = response.url

        print(item["title"])
        yield item
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse, dont_filter=True)
request.meta['item'] = flow_info
yield request
\ No newline at end of file
......@@ -41,6 +41,7 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
print(response.url)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
......@@ -78,6 +79,6 @@ class QuotesSpider(scrapy.Spider):
item['url'] = response.url
# print item['title']
print (item['title'])
yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scrapy item describing one scraped news article."""
    # Each attribute below declares one field of the item; a field that a
    # spider never assigns is simply absent from the resulting item.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DespertaroaxacaSpiderMiddleware(object):
    """Pass-through spider middleware: every hook leaves its input untouched."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # The hooks below are called on the instance created in from_crawler(),
    # so each one must accept ``self`` as its first parameter.

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams every scraped item into one JSON array file.

    The file is opened when the spider starts, receives one JSON object per
    item, and is closed (with the closing bracket) when the spider finishes.
    """

    # Fields written for each item, in output order.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        # Path of the output file; it is only opened in open_spider().
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline using the ``filename`` setting as output path."""
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the file as a JSON object and return it unchanged.

        Fields the spider never populated are omitted from the object.
        """
        row = []
        for key in self._FIELD_ORDER:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Only "field not set" is expected here; any other error is
                # a real bug and must not be silently swallowed.
                continue

        line = OrderedDict(row)

        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            # Every object after the first is separated from the previous one.
            self.file.write(",\n" + json.dumps(line))

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for despertarOaxaca project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'despertarOaxaca'
SPIDER_MODULES = ['despertarOaxaca.spiders']
NEWSPIDER_MODULE = 'despertarOaxaca.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'despertarOaxaca (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'despertarOaxaca.middlewares.DespertaroaxacaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'despertarOaxaca.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'despertarOaxaca.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from despertarOaxaca.items import NoticiasItem
"""
MEDIO:
El Despertar de Oaxaca
USO:
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
"""
# Matches one markup tag: "<", at least one non-">" character, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed."""
    stripped = TAG_RE.sub('', text)
    return stripped
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
class QuotesSpider(scrapy.Spider):
    """Spider that collects the articles published on one day.

    The day is given through the ``year``, ``month`` and ``day`` spider
    arguments, which select the dated archive URL of the site.
    """
    name = "noticias"

    def start_requests(self):
        """Request the archive listing of the requested day."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        self.baseURL = "http://despertardeoaxaca.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule parse_page() for the first listing page and every further page."""
        # The first page is already downloaded; re-request it (bypassing the
        # duplicate filter) so it goes through parse_page() like the others.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.xpath('//div[@class="post-pagination cat-"]/a/@href').extract()
        if len(pagination) > 0:
            # The last pagination link ends in the number of the last page.
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])

            for page in range(2, pages + 1):
                yield scrapy.Request(url=self.baseURL + '/page/' + str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Yield one article request per post found on a listing page."""
        for post in response.xpath('//div[@class="articles"]').css('div.cnt'):
            link = post.css('h3').xpath('./a/@href').extract_first()
            if link is None:
                # scrapy.Request rejects a None URL; skip this post instead of
                # aborting the generator and losing the rest of the page.
                continue

            item = NoticiasItem()
            topic = post.css('span.category').xpath('./a').extract_first()
            if topic is not None:
                item['topic'] = remove_tags(topic)

            request = scrapy.Request(url=link, callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        """Complete the item started in parse_page() with the article's data."""
        item = response.meta['item']

        # The date obtained already includes its format and time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        title = response.css('article.article-post').xpath('./header/h1').extract_first()
        if title is not None:
            # A missing <h1> leaves the field unset instead of raising.
            item['title'] = remove_tags(title).strip()

        paragraphs = response.xpath('//div[@class="article-post-content"]').css('p').extract()
        text = "\n".join(remove_tags(p) for p in paragraphs)

        # result = LOC_RE.search(text)
        # if result:
        #     m = result.group(0)
        #     location = G_RE.sub('', m).strip()
        #     if len(location) <= 35:
        #         item['location'] = location
        #         text = text[text.find(m)+len(m):]
        # text = EM_RE.sub('', text)
        # text = TW_RE.sub('', text)
        # text = TW2_RE.sub('', text)
        # text = TAG2_RE.sub("\n", text)
        # text = TAG3_RE.sub('', text)

        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = despertarOaxaca.settings
[deploy]
#url = http://localhost:6800/
project = despertarOaxaca
This source diff could not be displayed because it is too large. You can view the blob instead.
File mode changed from 100755 to 100644
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scrapy item describing one scraped news article."""
    # Each attribute below declares one field of the item; a field that a
    # spider never assigns is simply absent from the resulting item.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class EdomexdiaItem(scrapy.Item):
    """Scrapy item describing one article scraped by the edoMexDia project."""
    # Each attribute below declares one field of the item; a field that a
    # spider never assigns is simply absent from the resulting item.
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class EdomexdiaSpiderMiddleware(object):
class EdomexdiaSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
......@@ -20,30 +21,29 @@ class EdomexdiaSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider):
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(response, result, spider):
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(response, exception, spider):
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(start_requests, spider):
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
......@@ -54,3 +54,50 @@ class EdomexdiaSpiderMiddleware(object):
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class EdomexdiaDownloaderMiddleware:
    """Pass-through downloader middleware: no request or response is altered."""
    # Any hook may be omitted; Scrapy then behaves as if the middleware
    # did not touch the objects passed through it.

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to build the middleware instance; the instance is
        # subscribed to the spider_opened signal before being handed back.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Runs for every request on its way to the downloader.
        # Allowed outcomes:
        # - None: keep processing this request (what happens here)
        # - a Response object
        # - a Request object
        # - raise IgnoreRequest: process_exception() of the installed
        #   downloader middlewares is called
        return

    def process_response(self, request, response, spider):
        # Runs for every response coming back from the downloader.
        # Allowed outcomes:
        # - a Response object (here: the one received, untouched)
        # - a Request object
        # - raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Runs when a download handler or a process_request() of another
        # downloader middleware raises.
        # Allowed outcomes:
        # - None: keep processing this exception (what happens here)
        # - a Response object: stops the process_exception() chain
        # - a Request object: stops the process_exception() chain
        return None

    def spider_opened(self, spider):
        # Signal handler: log which spider has just been opened.
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
    # Path of the output file; it is only opened later, in open_spider().
    self.filename = filename
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
@classmethod
def from_crawler(cls, crawler):
    """Build the pipeline from the crawler's settings.

    The output path is read from the ``filename`` setting (the value given
    on the command line); it is ``None`` when that setting was not supplied.
    """
    output_path = crawler.settings.get('filename')
    return cls(output_path)
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
def open_spider(self, spider):
    """Reset the item counter, open the output file and start the JSON array."""
    self.counter = 0
    self.file = output = open(self.filename, 'w')
    output.write("[")
def close_spider(self, spider):
    """Write the closing bracket of the JSON array and close the output file."""
    output = self.file
    output.write("]")
    output.close()
class EdomexdiaPipeline:
def process_item(self, item, spider):
    """Append one scraped item to the output file as a JSON object.

    Fields are written in a fixed order (date, topic, title, author,
    location, text, url); a field the spider never populated is left out of
    the object. Objects after the first are preceded by ``",\\n"`` so the
    file forms a JSON array together with the brackets written when the
    spider opens and closes.

    Returns the item unchanged so later pipelines can still process it.
    """
    field_order = ("date", "topic", "title", "author", "location", "text", "url")

    row = []
    for key in field_order:
        try:
            row.append((key, item[key]))
        except KeyError:
            # Only "field not set" is expected here; any other error is a
            # real bug and must not be silently swallowed.
            continue

    line = OrderedDict(row)

    self.counter += 1
    if self.counter == 1:
        self.file.write(json.dumps(line))
    elif self.counter > 1:
        self.file.write(",\n" + json.dumps(line))

    return item
# -*- coding: utf-8 -*-
# Scrapy settings for edoMexDia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'edoMexDia'
SPIDER_MODULES = ['edoMexDia.spiders']
NEWSPIDER_MODULE = 'edoMexDia.spiders'
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'edoMexDia (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'edoMexDia.middlewares.EdomexdiaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'edoMexDia.middlewares.MyCustomDownloaderMiddleware': 543,
# 'edoMexDia.middlewares.EdomexdiaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'edoMexDia.pipelines.JsonWriterPipeline': 300,
}
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'edoMexDia.pipelines.EdomexdiaPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
File mode changed from 100755 to 100644
# -*- coding: utf-8 -*-
"""
Spider for edomexaldia.com
Author: Mario Chirinos Colunga
Usage:scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
"""
import scrapy, re
from edoMexDia.items import NoticiasItem
from edoMexDia.items import EdomexdiaItem
from datetime import datetime, timedelta, tzinfo
"""
MEDIO:
EDOMEX al Día, Estado de México
USO:
scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
"""
# Pattern for one markup tag: "<", one or more non-">" characters, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to the scraped news dates."""

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        # Time zone for Estado de México: UTC-6.
        return timedelta(hours=-6)

    def dst(self, dt):
        """Return a zero adjustment; this zone is a fixed offset.

        Without this method ``datetime.dst()`` and ``datetime.timetuple()``
        raise ``NotImplementedError`` for datetimes carrying this tzinfo.
        """
        return timedelta(0)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Prepare spider state and request the archive page of the chosen day.

    The ``year``, ``month`` and ``day`` spider arguments are stored on the
    instance (``None`` when absent) together with the time zone object and
    an English month-name -> month-number table.
    """
    self.tz = UTC()
    for arg_name in ("year", "month", "day"):
        setattr(self, arg_name, getattr(self, arg_name, None))

    month_names = ('january', 'february', 'march', 'april',
                   'may', 'june', 'july', 'august',
                   'september', 'october', 'november', 'december')
    self.date_parser = {label: number for number, label in enumerate(month_names, start=1)}

    # Built left to right: <site>/<year>/<MM>/<DD>.
    archive_url = "http://www.edomexaldia.com.mx/" + self.year
    archive_url = archive_url + "/" + self.month.zfill(2)
    archive_url = archive_url + "/" + self.day.zfill(2)
    self.baseURL = archive_url

    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Re-request the first listing page and fan out to the remaining pages.

    The number of pages is read from the last path segment of the "last
    page" pagination link, falling back to the final link inside the
    pagination block. A page without pagination links, or with a
    non-numeric last segment, yields only the first-page request instead of
    raising.
    """
    # dont_filter: this URL was already fetched once to get here.
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    pagination = '//div[@class="numbered-pagination"]'
    last_href = response.xpath(pagination + '/a[@class="pagi-last"]/@href').extract_first()
    if last_href is None:
        hrefs = response.xpath(pagination + '/a/@href').extract()
        # Guard: indexing [-1] on an empty list used to raise IndexError.
        last_href = hrefs[-1] if hrefs else None

    if not last_href:
        return

    page_segment = last_href.strip('/').rsplit('/', 1)[-1]
    if not page_segment.isdigit():
        self.logger.warning("Unexpected pagination link: %s", last_href)
        return

    # Page 1 was requested above; request pages 2..last.
    for page_number in range(2, int(page_segment) + 1):
        yield scrapy.Request(url=self.baseURL + "/page/" + str(page_number), callback=self.parse_page)
def parse_page(self, response):
    """Request every article linked from a listing page, then the next page.

    The "next page" link, when present and non-empty, is handed to
    ``parse``.
    """
    article_hrefs = response.xpath('//div[@id="main"]/div/h2[@class="entry_title"]/a/@href').extract()
    for href in article_hrefs:
        yield scrapy.Request(url=href, callback=self.parse_item)

    next_href = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-next"]/@href').extract_first()
    if next_href:
        yield scrapy.Request(url=next_href, callback=self.parse)
def parse_item(self, response):
    """Build one ``NoticiasItem`` from an article page.

    The publication date is parsed from the "post_author_create" span
    (day, English month name, year). When that span is missing or cannot
    be parsed, the date given on the command line is used. ``author`` is
    only set when the author span exists and is non-empty.
    """
    item = NoticiasItem()

    try:
        raw_date = remove_tags(response.xpath('//span[@class="post_author_create"]').extract_first())
        parts = raw_date.replace("el ", '').replace(",", '').replace(".", '').split()
        published = datetime(int(parts[2]), self.date_parser[parts[1].lower()], int(parts[0]), tzinfo=self.tz)
    except (TypeError, ValueError, KeyError, IndexError):
        # Only the failures the parsing above can produce (missing span,
        # too few tokens, unknown month name, non-numeric day/year);
        # KeyboardInterrupt and unrelated errors are no longer swallowed.
        published = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz)

    item['date'] = published.isoformat("T")
    item['title'] = remove_tags(response.xpath('//div[@id="main"]/div/h1').extract_first()).strip()
    item['topic'] = None

    author = response.xpath('//span[@class="post_author_author"]').extract_first()
    if author:
        author = remove_tags(author).strip()
        item['author'] = author.replace(" Publicado:", '')

    paragraphs = response.xpath('//div[@id="main"]/div/p').extract()
    item['text'] = "\n".join(remove_tags(p) for p in paragraphs).strip()
    item['url'] = response.url

    yield item
name = "noticias"
def start_requests(self):
    """Store the year/month/day spider arguments and request that day's listing."""
    for arg_name in ("year", "month", "day"):
        setattr(self, arg_name, getattr(self, arg_name, None))

    # Built left to right: <site>/<year>/<MM>/<DD>.
    day_url = "http://edomexaldia.com/" + self.year
    day_url = day_url + "/" + self.month.zfill(2)
    day_url = day_url + "/" + self.day.zfill(2)
    self.baseURL = day_url

    yield scrapy.Request(url=self.baseURL, callback=self.parse_page)
#-----------------------------------------------------------------------
def parse_page(self, response):
    """Request every article on a listing page and follow the "next" link.

    Links are resolved against the page URL so relative hrefs do not make
    ``scrapy.Request`` raise, and an empty "next" href is ignored.
    """
    print("parse page", response.url)

    article_hrefs = response.xpath('//main[@id="main"]/article/header/h2[@class="entry-title"]/a/@href').extract()
    for href in article_hrefs:
        yield scrapy.Request(url=response.urljoin(href), callback=self.parse_item)

    # Named next_page so the builtin next() is not shadowed.
    next_page = response.css('a.next::attr(href)').extract_first()
    if next_page:
        yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_page)
#-----------------------------------------------------------------------
def parse_item(self, response):
    """Build one ``EdomexdiaItem`` from an article page.

    Date and title come from the page's meta tags, the text from the
    non-empty paragraphs of the "entry-content" div (one per line), and the
    topic from the tag links joined with ", ". ``author`` and ``location``
    are always set to empty strings.
    """
    item = EdomexdiaItem()
    item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()

    # default="" keeps a page without an og:title tag from raising
    # AttributeError on None.replace(...).
    title = response.xpath("//meta[@property='og:title']/@content").extract_first(default="")
    item['title'] = title.replace(" - Edomex Al Día", "")

    cleaned = (
        remove_tags(chunk).replace("\n", "").replace("\r", "").strip()
        for chunk in response.xpath('//div[@class="entry-content"]/p/text()').extract()
    )
    item['text'] = "\n".join(piece for piece in cleaned if piece).strip()

    item['topic'] = ", ".join(response.xpath('//span[@class="tag-links"]/a/text()').extract())
    item['url'] = response.url
    item["author"] = ""
    item["location"] = ""

    print(item["url"])
    yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = edoMexDia.settings
......
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ElfinancieroItem(scrapy.Item):
    """Scrapy item that declares no fields in this definition."""
# Scrapy item describing one scraped news article.
# NOTE(review): title/text/date/location/author/topic/url are assigned twice
# below; the second group rebinds the same names and adds `media`.
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# define the fields for your item here like:
# name = scrapy.Field()
date = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
media = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ElfinancieroSpiderMiddleware(object):
class ElfinancieroSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
......@@ -31,7 +32,7 @@ class ElfinancieroSpiderMiddleware(object):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
......@@ -39,8 +40,7 @@ class ElfinancieroSpiderMiddleware(object):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
......@@ -56,7 +56,7 @@ class ElfinancieroSpiderMiddleware(object):
spider.logger.info('Spider opened: %s' % spider.name)
class ElfinancieroDownloaderMiddleware(object):
class ElfinancieroDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
......
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class ElfinancieroPipeline(object):
def __init__(self, filename):
    """Remember ``filename``, the path that ``close_spider`` writes to."""
    self.filename = filename
@classmethod
def from_crawler(cls, crawler):
    """Create the pipeline using the ``filename`` entry of the crawler settings."""
    # "filename" is whatever was passed on the command line (-s filename=...).
    output_path = crawler.settings.get('filename')
    return cls(output_path)
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
def open_spider(self, spider):
    """Start with an empty list in which scraped items are accumulated."""
    self.itemList = list()
def close_spider(self, spider):
    """Write all accumulated items to ``self.filename`` as one JSON array."""
    with open(self.filename, 'w') as output:
        json.dump(self.itemList, output)
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
def process_item(self, item, spider):
    """Store a plain-dict copy of ``item`` and pass the item on unchanged."""
    record = dict(item)
    self.itemList.append(record)
    return item
class ElfinancieroPipeline:
    """Pass-through item pipeline."""

    def process_item(self, item, spider):
        """Return ``item`` untouched."""
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for elFinanciero project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elFinanciero'
SPIDER_MODULES = ['elFinanciero.spiders']
NEWSPIDER_MODULE = 'elFinanciero.spiders'
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'
......@@ -25,7 +23,7 @@ ROBOTSTXT_OBEY = True
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
......@@ -45,31 +43,31 @@ ROBOTSTXT_OBEY = True
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elFinanciero.middlewares.ElfinancieroDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'elFinanciero.pipelines.ElfinancieroPipeline': 300,
}
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'elFinanciero.pipelines.ElfinancieroPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
# -*- coding: utf-8 -*-
"""
MEDIA:
El Financiero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elFinanciero/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
Spider for elfinanciero.com.mx
Author: Mario Chirinos Colunga
Usage:scrapy crawl noticias --nolog -O 2021-03-18.json -a year=2021 -a month=3 -a day=18
"""
import scrapy
import json
import re
from elFinanciero.items import ElfinancieroItem
import scrapy, re, json
from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
#------------------------------------------------------------------------------------------------
allSections = [{"name":"Economía","slug":"economia"},{"name":"Empresas","slug":"empresas"},{"name":"Mercados","slug":"mercados"},{"name":"Pyme","slug":"pyme"},{"name":"Franquicias","slug":"franquicias"},{"name":"Nacional","slug":"nacional"},{"name":"Tech","slug":"tech"},{"name":"Mundo","slug":"mundo"},{"name":"Deportes","slug":"deportes"},{"name":"Culturas","slug":"culturas"},{"name":"Buena Vida","slug":"buena-vida"},{"name":"Reflector","slug":"reflector"},{"name":"Ciencia","slug":"ciencia"},{"name":"Mis Finanzas","slug":"mis-finanzas"},{"name":"Opinión","slug":"opinion"},{"name":"Interactivos","slug":"interactivos"},{"name":"Blogs","slug":"blogs"},{"name":"Fotogalerías","slug":"fotogalerias"},{"name":"Financial Times","slug":"financial-times"},{"name":"Power Tools","slug":"power-tools"},{"name":"Bajío","slug":"bajio"},{"name":"Monterrey","slug":"monterrey"},{"name":"Universidades","slug":"universidades"},{"name":"Mundo empresa","slug":"mundo-empresa"},{"name":"Texas","slug":"texas"},{"name":"Suplementos","slug":"suplementos"},{"name":"Archivo","slug":"archivo"},{"name":"Pages","slug":"pages"},{"name":"Licitaciones","slug":"licitaciones"},{"name":"Bloomberg","slug":"bloomberg"},{"name":"Startup","slug":"startup"},{"name":"Mercados - Acciones","slug":"mercados/acciones"},{"name":"Mercados - IPC","slug":"mercados/ipc"},{"name":"Mercados - Divisas","slug":"mercados/divisas"},{"name":"Mercados - Dinero","slug":"mercados/dinero"},{"name":"Mercados - Commodities","slug":"mercados/commodities"},{"name":"TLCAN","slug":"tlcan"},{"name":"Blogs - Territorio Viral","slug":"blogs/territorio-viral"},{"name":"Blogs - Templo del Morbo","slug":"blogs/templo-del-morbo"},{"name":"Sponsor","slug":"sponsor"},{"name":"Bloomberg Businessweek","slug":"bloomberg-businessweek"},{"name":"Millonarios","slug":"millonarios"},{"name":"Management","slug":"management"},{"name":"Viajes","slug":"viajes"},{"name":"Cartones","slug":"cartones"},{"name":"EF Eventos","slug":"ef-eventos"},{"name":"Blogs - 
Efecto Jazz","slug":"blogs/efecto-jazz"},{"name":"Blogs - Visión CFA","slug":"blogs/vision-cfa"},{"name":"Pages - Eventos","slug":"pages/eventos"},{"name":"Pages - Interactivos","slug":"pages/interactivos"},{"name":"Pages - PDF","slug":"pages/pdf"},{"name":"Pages - Documentos","slug":"pages/documentos"},{"name":"Pages - Docs","slug":"pages/docs"},{"name":"TV","slug":"tv"},{"name":"Tv - Al sonar la campana","slug":"tv/al-sonar-la-campana"},{"name":"Tv - Espresso Doble","slug":"tv/espresso-doble"},{"name":"Tv - Ganadores & Perdedores","slug":"tv/ganadores-y-perdedores"},{"name":"Tv - Entre Mercados","slug":"tv/entre-mercados"},{"name":"Tv - Mesa Central","slug":"tv/mesa-central"},{"name":"Tv - Bitácora Política","slug":"tv/bitacora-politica"},{"name":"Tv - Sin Línea","slug":"tv/sin-linea"},{"name":"Tv - Al Cierre","slug":"tv/al-cierre"},{"name":"Tv - Tiempo de Toros","slug":"tv/tiempo-de-toros"},{"name":"Tv - Nación 321","slug":"tv/nacion321"},{"name":"Tv - El mundo según...","slug":"tv/el-mundo-segun"},{"name":"Tv - En EF y por Adela","slug":"tv/en-ef-y-por-adela"},{"name":"Tv - La Nota Dura","slug":"tv/la-nota-dura"},{"name":"Tv - La Silla Roja","slug":"tv/la-silla-roja"},{"name":"Tv - Personajes","slug":"tv/personajes"},{"name":"Tv - Tech","slug":"tv/tech"},{"name":"Tv - Mundo","slug":"tv/mundo"},{"name":"Tv - Finanzas Personales","slug":"tv/finanzas-personales"},{"name":"Tv - Estilo de Vida","slug":"tv/estilo-de-vida"},{"name":"Tv - Bloomberg","slug":"tv/bloomberg"},{"name":"Tv - Viral","slug":"tv/viral"},{"name":"Tv - Nacional","slug":"tv/nacional"},{"name":"Tv - Empresas","slug":"tv/empresas"},{"name":"Tv - Economía","slug":"tv/economia"},{"name":"Tv - Reflector","slug":"tv/reflector"},{"name":"Tv - Sponsor","slug":"tv/sponsor"},{"name":"Rankings","slug":"rankings"},{"name":"Trivias","slug":"trivias"},{"name":"Elecciones 2018","slug":"elecciones-2018"},{"name":"Pages - Businessweek 
México","slug":"pages/businessweek-mexico"},{"name":"Fibras","slug":"fibras"},{"name":"After Office","slug":"after-office"},{"name":"New York Times Syndicate","slug":"new-york-times-syndicate"},{"name":"México en Hannover","slug":"mexico-en-hannover"},{"name":"Tv - Opinión","slug":"tv/opinion"},{"name":"Pages - Central Política","slug":"pages/central-politica"},{"name":"Relojes","slug":"relojes"},{"name":"Autos","slug":"autos"},{"name":"Sibarita","slug":"sibarita"},{"name":"Letras Libres","slug":"letras-libres"},{"name":"Rusia 2018","slug":"rusia-2018"},{"name":"Tv - Especiales","slug":"tv/especiales"},{"name":"Tv - Bloomberg Businessweek","slug":"tv/bloomberg-businessweek"},{"name":"Tv - Gabinete de Seguridad","slug":"tv/gabinete-de-seguridad"},{"name":"Transición","slug":"transicion"},{"name":"Emprendedores","slug":"emprendedores"},{"name":"Blogs - Monoblock","slug":"blogs/monoblock"},{"name":"Península","slug":"peninsula"},{"name":"ESPN","slug":"espn"},{"name":"Tv - La Cuarta Transformación","slug":"tv/la-cuarta-transformacion"},{"name":"Primeros 100 días","slug":"primeros-100-dias"}]
#------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Pattern for one markup tag: "<", one or more non-">" characters, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
#------------------------------------------------------------------------------------------------
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
    """Request the search API once per entry of ``allSections`` for the chosen day.

    The day is taken from the ``year``, ``month`` and ``day`` spider
    arguments and used as both ``min_date`` and ``max_date`` of the query.
    """
    year = getattr(self, "year", None)
    month = getattr(self, "month", None)
    self.day = getattr(self, "day", None)

    # YYYY-MM-DD, assembled left to right.
    stamp = year + "-"
    stamp = stamp + month.zfill(2) + "-"
    stamp = stamp + self.day.zfill(2)
    self.this_date = stamp

    self.baseURL1 = "https://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
    self.baseURL2 = "%22,%22min_date%22:%22"+self.this_date+"%22,%22max_date%22:%22"+self.this_date+"%22}&type=page&page=1&size=10000"

    for section in allSections:
        section_url = self.baseURL1 + section["slug"] + self.baseURL2
        yield scrapy.Request(url=section_url, callback=self.parse)
# Turn one search-API JSON response into NoticiasItem objects.
# NOTE(review): no `yield`/`return` of the item is visible in this span; the
# tail of this method appears to have been displaced further down — confirm.
def parse(self, response):
# The article list is the second element of the top-level "data" array.
data = json.loads(response.text)["data"][1]
for d in data:
item = NoticiasItem()
item["title"] = d["_source"]["title"]
item["date"] = d["_source"]["createdAt"]
# The "html" field is stored with its markup tags removed.
item["text"]=remove_tags(d["_source"]["html"])
item["topic"]=d["_source"]["categoryId"]["slug"]
# Only the first entry of "author" is used: name + both surnames.
item["author"]=d["_source"]["author"][0]["name"]+" "+d["_source"]["author"][0]["aPaterno"]+" "+d["_source"]["author"][0]["aMaterno"]
item["url"]="https://elfinanciero.com.mx/"+d["_source"]["slug"]
return TAG_RE.sub('', text)
#-------------------------------------------------------------------------------
class NoticiasSpider(scrapy.Spider):
name = 'noticias'
allowed_domains = ['elfinanciero.com']
start_urls = ['http://elfinanciero.com/']
urllist=[]
def start_requests(self):
    """Request the site's search page, which ``parseSections`` mines for sections."""
    print("start_urls")
    search_page = self.start_urls[0] + "search/"
    yield scrapy.Request(url=search_page, callback=self.parseSections)
def parseSections(self, response):
    """Read the section list embedded in the search page and query each section.

    The page is expected to contain a JavaScript assignment
    ``var allSections = <JSON>;``. One search-API request is issued per
    section, restricted to the day given by the ``year``, ``month`` and
    ``day`` spider arguments. If the assignment is missing, an error is
    logged and nothing is requested (previously an IndexError was raised).
    """
    print(response.url)

    found = re.findall("var allSections = (.+?);\n", response.body.decode("utf-8"), re.S)
    if not found:
        self.logger.error("allSections not found in %s", response.url)
        return

    sections = json.loads(found[0])
    if not sections:
        return

    # The date does not depend on the section, so compute it once.
    year = getattr(self, "year", None)
    month = getattr(self, "month", None)
    day = getattr(self, "day", None)
    self.date = year + "-" + month.zfill(2) + "-" + day.zfill(2)

    for section in sections:
        # NOTE(review): "_format:json" differs from the "_format=json" used
        # by the previous spider version; left untouched — confirm with the API.
        url = 'https://api.elfinanciero.com/public/search/typed/?_format:json&json={"categoriesslug":"'+section["slug"]+'","min_date":"'+self.date+'","max_date":"'+self.date+'"}'
        yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
    """Yield one ``ElfinancieroItem`` per not-yet-seen article with text.

    Articles come from the second element of the response's ``data`` array.
    URLs already recorded in ``self.urllist`` and articles whose text is
    empty are skipped.
    """
    entries = json.loads(response.text)["data"][1]
    for entry in entries:
        item = ElfinancieroItem()
        item["date"] = entry["_source"]["createdAt"]
        item["title"] = entry["_source"]["title"]
        item["text"] = remove_tags(entry["_source"]["html"])
        item["topic"] = entry["_source"]["categoryId"]["slug"]

        # Only the first listed author is used: name + both surnames.
        first_author = entry["_source"]["author"][0]
        item["author"] = first_author["name"] + " " + first_author["aPaterno"] + " " + first_author["aMaterno"]

        item["url"] = "https://elfinanciero.com/" + entry["_source"]["slug"]
        item["media"] = "https://elfinanciero.com/" + entry["_source"]["mainImage"]

        if item["url"] in self.urllist or len(item["text"]) == 0:
            continue

        self.urllist.append(item['url'])
        print(item["title"])
        yield item
yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# Scrapy item describing one scraped news article; every attribute below is a
# plain scrapy.Field() with no extra metadata.
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
twitter = scrapy.Field()
email = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariocolatinoSpiderMiddleware(object):
    """Pass-through spider middleware.

    Not all methods need to be defined. If a method is not defined, scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response heading into the spider (returns None)."""
        return

    def process_spider_output(self, response, result, spider):
        """Re-emit, unchanged, everything the spider produced for ``response``."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Do not handle spider exceptions (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-emit the spider's start requests unchanged."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class DiariocolatinoDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Not all methods need to be defined. If a method is not defined, scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Let every request continue through the chain (returns None)."""
        return

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged."""
        unchanged = response
        return unchanged

    def process_exception(self, request, exception, spider):
        """Do not handle download exceptions (returns None)."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Item pipeline that streams scraped items into one JSON array file.

    The output path is the ``filename`` setting (``-s filename=...``). The
    file is opened when the spider opens, receives one JSON object per item
    (separated by ",\\n") and is closed with "]" when the spider closes.
    """

    # Keys are written in this order; fields an item lacks are skipped.
    FIELD_ORDER = ("date", "topic", "title", "author", "location",
                   "twitter", "email", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the crawler's ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and write the opening bracket."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it unchanged.

        Only the fields in ``FIELD_ORDER`` are written. A field missing
        from the item raises ``KeyError`` on lookup and is left out; any
        other error now propagates instead of being silently swallowed.
        """
        row = []
        for field in self.FIELD_ORDER:
            try:
                row.append((field, item[field]))
            except KeyError:
                continue

        line = OrderedDict(row)
        self.counter += 1
        # Every object after the first is preceded by a separator.
        separator = "" if self.counter == 1 else ",\n"
        self.file.write(separator + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioCoLatino project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioCoLatino'
SPIDER_MODULES = ['diarioCoLatino.spiders']
NEWSPIDER_MODULE = 'diarioCoLatino.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioCoLatino.middlewares.DiariocolatinoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioCoLatino.middlewares.DiariocolatinoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioCoLatino.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario Co Latino, El Salvador
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd diarioCoLatino/
$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
import scrapy, re
from diarioCoLatino.items import NoticiasItem
# Pattern for one markup tag: "<", one or more non-">" characters, then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    plain = TAG_RE.sub('', text)
    return plain
# Author byline: a newline, "Por", the rest of that line, and the closing newline.
AUTH_RE = re.compile(r'\nPor.+?\n')
# Twitter line: optional "| Twitter:" / "Twitter:" prefix, then an @handle,
# one more character and a newline (case-insensitive).
TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
# Line ending in "/" followed by one of PL, AFP, DPA or SIGNIS ALC (case-insensitive).
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
# Line holding an e-mail address, optionally prefixed by "Email:",
# "Correo electr...nico:" (1-3 arbitrary characters before "nico") or "Comentario(s):".
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
    """Request the archive page for the day given by the year/month/day arguments."""
    year = getattr(self, "year", None)
    month = getattr(self, "month", None)
    day = getattr(self, "day", None)

    # Built left to right: <site>/<year>/<MM>/<DD>.
    archive_url = "https://www.diariocolatino.com/" + year
    archive_url = archive_url + "/" + month.zfill(2)
    archive_url = archive_url + "/" + day.zfill(2)
    self.baseURL = archive_url

    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Re-request the first listing page, then request pages 2..N.

    N is the number at the end of the last link inside the "pagination"
    div; when that div has no links only the first page is requested.
    """
    # dont_filter: this URL was already fetched once to get here.
    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    hrefs = response.xpath('//div[@class="pagination"]/a/@href').extract()
    if len(hrefs) == 0:
        return

    last_href = hrefs[-1].strip('/')
    total_pages = int(last_href[last_href.rfind('/') + 1:])
    for page_number in range(2, total_pages + 1):
        page_url = self.baseURL + "/page/" + str(page_number)
        yield scrapy.Request(url=page_url, callback=self.parse_page)
def parse_page(self, response):
    """Request every article linked from one listing page."""
    listing = response.css('div.content').css('div.post-listing')
    article_hrefs = listing.xpath('./article/h2/a/@href').extract()
    for href in article_hrefs:
        yield scrapy.Request(url=href, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
# La fecha obtenida ya incluye formato y zona horaria
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
news_topic = None
for p in response.xpath('//div[@class="entry"]/p').extract():
text += remove_tags(p) + "\n"
if text == '':
for p in response.xpath('//div[@class="entry"]/div/span').extract():
text += remove_tags(p) + "\n"
text = "\n" + text
""" Obtiene autor """
news_author = None
res = AUTH_RE.match(text)
if res:
m = res.group(0)
news_author = m[m.find('Por')+len('Por'):].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina twitter """
news_twitter = None
res = TW_RE.search(text)
if res:
m = res.group(0)
news_twitter = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Obtiene lugar """
news_loc = None
res = LOC_RE.match(text)
if res:
m = res.group(0)
if m[m.find('/') + 1:].strip().lower() != 'dpa':
news_loc = m[:m.find('/')].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
else:
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina correo """
news_email = None
res = EM_RE.search(text)
if res:
m = res.group(0)
news_email = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
res = EM_RE.search(text)
if res:
m = res.group(0)
news_email = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
text = text.replace("\n@Diario Co Latino\n", '').strip()
text = "\n" + text
text = text.replace("\nDiario Co Latino\n", '').strip()
text = "\n" + text
text = text.replace("\nCo Latino\n", '').strip()
## News item info ##
item['date'] = news_date
item['title'] = news_title
item['topic'] = news_topic
item['author'] = news_author
item['twitter'] = news_twitter
item['location'] = news_loc
item['email'] = news_email
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioCoLatino.settings
[deploy]
#url = http://localhost:6800/
project = diarioCoLatino
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the values scraped from one news article."""
    # Each attribute below declares one key that may be assigned on the item;
    # the JSON pipeline copies these keys to the output when they are set.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class SanpedrosunSpiderMiddleware(object):
    """
    Pass-through spider middleware.

    Not every hook has to be defined: when one is missing, Scrapy acts as if
    the middleware does not modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware for Scrapy and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response entering the spider; always returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Re-emit, unchanged, everything the spider produced for ``response``."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by the spider or ``process_spider_input``; returns None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-emit the spider's start requests untouched."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
class SanpedrosunDownloaderMiddleware(object):
    """
    Pass-through downloader middleware.

    Not every hook has to be defined: when one is missing, Scrapy acts as if
    the middleware does not modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware for Scrapy and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Hook for each outgoing request; None means "continue processing this request"."""
        return None

    def process_response(self, request, response, spider):
        """Hook for each downloaded response; hands the response back unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Hook for download errors; None means "continue processing this exception"."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that streams every scraped item into a single JSON array file.

    The output path comes from the ``filename`` Scrapy setting
    (``scrapy crawl ... -s filename=out.json``).
    """

    # Keys copied to each output record, in the order they are written.
    _FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Remember the path of the file the items will be written to."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``filename`` setting of ``crawler``."""
        # The value is whatever was passed through the "filename" command line parameter.
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and write the opening bracket of the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket of the JSON array and close the file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """
        Append ``item`` to the output file as one JSON object and return it.

        Only the keys in ``_FIELDS`` are written; a key that is not set on
        the item is left out of the record.
        """
        row = []
        for key in self._FIELDS:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Only a missing field is expected here; any other error should surface.
                continue
        line = OrderedDict(row)

        self.counter += 1
        # Every record after the first is preceded by a separator.
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for sanPedroSun project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: bot name and the package that holds (and receives new) spiders.
BOT_NAME = 'sanPedroSun'
SPIDER_MODULES = ['sanPedroSun.spiders']
NEWSPIDER_MODULE = 'sanPedroSun.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sanPedroSun (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: the line below is commented out, so Scrapy's built-in default applies.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Active override of the default delay (unit as defined in the documentation linked above).
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'sanPedroSun.middlewares.SanpedrosunSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'sanPedroSun.middlewares.SanpedrosunDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline is the only pipeline enabled; 300 is its order value.
ITEM_PIPELINES = {
    'sanPedroSun.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from sanPedroSun.items import NoticiasItem
"""
MEDIA:
The San Pedro Sun, Belize
USAGE:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Delete every ``<...>`` tag from ``text`` and return what is left."""
    return re.sub(TAG_RE, '', text)
# Date header such as "Monday, March 5, 2018", optionally preceded by "-" / "- " and
# optionally followed by " -" or a newline; the trailing [A-Z] consumes the first
# capital letter of whatever follows (parse_item drops that last character again).
DATE_RE1 = re.compile(r'(-|- )?([A-Z][a-z]+, ?)?[A-Z][a-z]+( \d{1,2})?, \d{4}( -|\n)? ?[A-Z]')
# Shorter date fragment of the form ", <word> <1-2 digits> -", optionally followed by " -".
DATE_RE2 = re.compile(r', [a-zA-Z]+ \d{1,2} -( -)?')
# Byline: a whole line that begins with "By", optionally preceded by "- ".
AUTH_RE = re.compile(r'\n(- )?By.+\n')
class importantData(scrapy.Item):
    """Paging state carried in ``request.meta['item']`` between listing requests."""
    # Counter that the spider starts at 0 and increments for each further listing request.
    page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects the articles The San Pedro Sun lists for one date.

    The date is taken from the ``year``, ``month`` and ``day`` spider
    arguments (``-a year=... -a month=... -a day=...``). Listings are fetched
    from the site's ``GetResults.php`` endpoint, one batch per request.
    """
    name = "noticias"
    # Article URLs already scheduled; paging stops once a batch brings nothing new.
    globalSet = set()

    def start_requests(self):
        """Request the first batch of results for the requested date."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        self.baseURL = "https://www.sanpedrosun.com/wp-content/themes/sunbase/GetResults.php?year=" + year + "&monthnum=" + month + "&day=" + day + "&posts_per_page=8&n_view=standard&n_style=list"

        searchData = importantData()
        searchData['page'] = 0

        request = scrapy.Request(url=self.baseURL, callback=self.parse)
        request.meta['item'] = searchData
        yield request

    def parse(self, response):
        """Schedule the unseen articles of this batch and then ask for the next batch."""
        localSet = set(response.css('div.entry').xpath('./h2/a/@href').extract())
        resultSet = localSet - self.globalSet
        if not resultSet:
            # Nothing new in this batch: stop paging.
            return

        searchData = response.meta['item']
        for link in resultSet:
            self.globalSet.add(link)
            yield scrapy.Request(url=link, callback=self.parse_item)

        searchData['page'] += 1
        page = searchData['page']
        # Build the URL from baseURL rather than response.url: the response URL already
        # carries the previous "&n_more=" value, so appending to it would stack a new
        # "&n_more=" parameter onto the query string for every batch.
        request = scrapy.Request(url=self.baseURL + "&n_more=" + str(page), callback=self.parse)
        request.meta['item'] = searchData
        yield request

    def parse_page(self, response):
        """Follow the article links of a listing page (not used as a callback by this spider)."""
        for link in response.css('div.td-ss-main-content').css('div.item-details').xpath('./h3/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Build a NoticiasItem from an article page, stripping date and byline headers."""
        item = NoticiasItem()

        # The published date already carries its format and time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = remove_tags(response.xpath('//div[@class="post"]/h1').extract_first()).strip()

        # Topic: last breadcrumb entry, else the first category link, else None.
        crumbs = response.xpath('//div[@class="breadcrumb"]/p/a/text()').extract()
        if crumbs:
            topic = crumbs[-1]
        else:
            topic = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
        item['topic'] = topic

        text = ''
        for p in response.xpath('//div[@class="entry"]').css('p').extract():
            text += remove_tags(p) + "\n"
        text = text.strip()
        # The byline pattern is anchored on "\n", so the body always starts with one.
        text = "\n" + text
        text = text.replace(u'\u2013', "-")
        text = text.replace(u'\u00a0', '')  # drop no-break spaces

        for pattern in (DATE_RE1, DATE_RE2):
            res = pattern.search(text)
            if res:
                # The last matched character is left in the text.
                m = res.group(0)[:-1]
                text = "\n" + text[text.find(m) + len(m):].strip()

        res = AUTH_RE.match(text)
        if res:
            m = res.group(0)
            text = "\n" + text[text.find(m) + len(m):].strip()

        text = text.replace("Follow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", '')

        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = sanPedroSun.settings
[deploy]
#url = http://localhost:6800/
project = sanPedroSun
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tiempoDigitalHn.settings
[deploy]
#url = http://localhost:6800/
project = tiempoDigitalHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the values scraped from one news article."""
    # Each attribute below declares one key that may be assigned on the item;
    # the JSON pipeline copies these keys to the output when they are set.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TiempodigitalhnSpiderMiddleware(object):
    """
    Spider middleware that forwards everything unchanged.

    Hooks may be omitted; Scrapy then behaves as if the middleware does not
    modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Factory used by Scrapy; also connects ``spider_opened`` to its signal."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_spider_input(self, response, spider):
        """Called for each response going into the spider; always returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each element of ``result`` exactly as the spider returned it."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Called when the spider or ``process_spider_input`` raises; returns None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield each start request exactly as the spider produced it."""
        yield from start_requests

    def spider_opened(self, spider):
        """Write an info log line naming the opened spider."""
        spider.logger.info('Spider opened: %s' % spider.name)
class TiempodigitalhnDownloaderMiddleware(object):
    """
    Downloader middleware that forwards everything unchanged.

    Hooks may be omitted; Scrapy then behaves as if the middleware does not
    modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Factory used by Scrapy; also connects ``spider_opened`` to its signal."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_request(self, request, spider):
        """Called for each request; None lets processing of the request continue."""
        return None

    def process_response(self, request, response, spider):
        """Called for each downloaded response; returns it untouched."""
        return response

    def process_exception(self, request, exception, spider):
        """Called on download errors; None lets processing of the exception continue."""
        return None

    def spider_opened(self, spider):
        """Write an info log line naming the opened spider."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes all scraped items to one file as a JSON array.

    The output path comes from the ``filename`` Scrapy setting
    (``scrapy crawl ... -s filename=out.json``).
    """

    # Keys copied to each output record, in the order they are written.
    _FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Store the path of the output file."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline with the ``filename`` setting of ``crawler``."""
        # The value is whatever was passed through the "filename" command line parameter.
        settings = crawler.settings
        return cls(settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """
        Serialise ``item`` as one JSON object in the output file and return it.

        Only the keys in ``_FIELDS`` are written; keys not set on the item
        are skipped.
        """
        row = []
        for key in self._FIELDS:
            try:
                value = item[key]
            except KeyError:
                # A field that was never set is the only failure tolerated here.
                continue
            row.append((key, value))
        line = OrderedDict(row)

        self.counter += 1
        if self.counter > 1:
            # Separate this record from the previous one.
            self.file.write(",\n")
        self.file.write(json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tiempoDigitalHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: bot name and the package that holds (and receives new) spiders.
BOT_NAME = 'tiempoDigitalHn'
SPIDER_MODULES = ['tiempoDigitalHn.spiders']
NEWSPIDER_MODULE = 'tiempoDigitalHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tiempoDigitalHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: the line below is commented out, so Scrapy's built-in default applies.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Active override of the default delay (unit as defined in the documentation linked above).
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tiempoDigitalHn.middlewares.TiempodigitalhnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tiempoDigitalHn.middlewares.TiempodigitalhnDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline is the only pipeline enabled; 300 is its order value.
ITEM_PIPELINES = {
    'tiempoDigitalHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from tiempoDigitalHn.items import NoticiasItem
"""
MEDIA:
Tiempo Digital, Honduras
USAGE:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip all ``<...>`` tags from ``text``, keeping only the text between them."""
    return TAG_RE.sub(repl='', string=text)
# Dateline made of one or more upper-case words (each followed by a space),
# then an optional space and a "." or "-".
LOC_RE1 = re.compile(r'\n([A-Z]+ )+ ?[.-]')
# Looser dateline: the shortest opening text that ends in ". -", "." or "-" and is
# followed by an upper-case letter (parse_item drops that last character again).
LOC_RE2 = re.compile(r'\n.+?,? ?.+? ?(\. ?-|\.|-) ?[A-Z]')
# A "Fuente:" line; "$" is used without re.MULTILINE, so it anchors at the end of the text.
SOURCE_RE = re.compile(r'\n ?Fuente:.+$')
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects the articles Tiempo Digital lists for one date.

    The date is taken from the ``year``, ``month`` and ``day`` spider
    arguments (``-a year=... -a month=... -a day=...``).
    """
    name = "noticias"

    def start_requests(self):
        """Request the daily archive page built from the spider arguments."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        self.baseURL = "https://tiempo.hn/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first listing page plus every further paginated page."""
        # The first listing page is this very URL, so the duplicate filter is bypassed.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.page-nav').xpath('./a/@href').extract()
        if not pagination:
            return

        # Prefer the second-to-last link and fall back to the only one available
        # (presumably the last link is a "next" arrow -- TODO confirm against the site).
        last_link = pagination[-2] if len(pagination) > 1 else pagination[-1]
        last_link = last_link.strip('/')
        # The trailing path segment of that link is read as the page count.
        pages = int(last_link[last_link.rfind('/') + 1:])
        for page in range(2, pages + 1):
            yield scrapy.Request(url=self.baseURL + '/page/' + str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Follow every article link found on a listing page."""
        for link in response.css('div.td-ss-main-content').css('div.td_module_1').xpath('./h3/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Build a NoticiasItem from an article page, stripping dateline and source lines."""
        item = NoticiasItem()

        # The published date already carries its format and time zone.
        item['date'] = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
        item['title'] = remove_tags(response.xpath('//header[@class="td-post-title"]/h1').extract_first()).strip()

        # Topic: last category entry, or None when the page has no category list.
        topics = response.xpath('//ul[@class="td-category"]/li').extract()
        item['topic'] = remove_tags(topics[-1]) if topics else None

        # The author key is only set when the page names one.
        author = response.xpath('//div[@class="td-post-author-name"]/a').extract_first()
        if author is not None:
            item['author'] = remove_tags(author)

        text = ''
        for p in response.xpath('//div[@class="td-post-content"]').css('p').extract():
            text += remove_tags(p) + "\n"
        text = text.strip()
        # The dateline patterns are anchored on "\n", so the body always starts with one.
        text = "\n" + text
        text = text.replace(u'\u2013', "-")
        text = text.replace(u'\u00a0', '')  # drop no-break spaces

        res = LOC_RE1.match(text)
        if res:
            m = res.group(0)[:-1]
            location = m.replace("-", '').strip()
            # Longer matches are treated as ordinary text and left in place.
            if len(location) <= 25:
                item['location'] = location
                # Drop only the matched prefix, not every repetition of it in the body.
                text = "\n" + text[len(m):].strip()

        res = LOC_RE2.match(text)
        if res:
            m = res.group(0)[:-1]
            location = m.replace("-", '').replace(".", '').strip()
            if len(location) <= 25:
                item['location'] = location
                text = "\n" + text[len(m):].strip()

        res = SOURCE_RE.search(text)
        if res:
            # Cut the text where the matched source line starts.
            text = "\n" + text[:res.start()].strip()

        item['text'] = text.strip()
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for the values scraped from one news article."""
    # Each attribute below declares one key that may be assigned on the item;
    # the JSON pipeline copies these keys to the output when they are set.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunahnSpiderMiddleware(object):
    """
    No-op spider middleware.

    Any hook may be left out; Scrapy then acts as if the middleware does not
    modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware for Scrapy and hook ``spider_opened`` to its signal."""
        created = cls()
        crawler.signals.connect(created.spider_opened, signal=signals.spider_opened)
        return created

    def process_spider_input(self, response, spider):
        """Invoked for every response headed to the spider; always returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Pass along, one by one, whatever the spider returned for ``response``."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Invoked when the spider or ``process_spider_input`` raises; returns None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Pass along, one by one, the spider's start requests."""
        yield from start_requests

    def spider_opened(self, spider):
        """Emit an info log entry with the opened spider's name."""
        spider.logger.info('Spider opened: %s' % spider.name)
class TribunahnDownloaderMiddleware(object):
    """
    No-op downloader middleware.

    Any hook may be left out; Scrapy then acts as if the middleware does not
    modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware for Scrapy and hook ``spider_opened`` to its signal."""
        created = cls()
        crawler.signals.connect(created.spider_opened, signal=signals.spider_opened)
        return created

    def process_request(self, request, spider):
        """Invoked for every request; None keeps the request moving through the chain."""
        return None

    def process_response(self, request, response, spider):
        """Invoked for every downloaded response; gives it back as received."""
        return response

    def process_exception(self, request, exception, spider):
        """Invoked on download errors; None keeps the exception moving through the chain."""
        return None

    def spider_opened(self, spider):
        """Emit an info log entry with the opened spider's name."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that dumps the scraped items into one JSON array file.

    The output path comes from the ``filename`` Scrapy setting
    (``scrapy crawl ... -s filename=out.json``).
    """

    # Keys copied to each output record, in the order they are written.
    _FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        """Keep the path of the file that will receive the items."""
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline using the ``filename`` setting of ``crawler``."""
        # The value is whatever was passed through the "filename" command line parameter.
        filename = crawler.settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and emit the opening bracket of the array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Emit the closing bracket of the array and close the file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """
        Write ``item`` to the output file as one JSON object and return it.

        Only the keys in ``_FIELDS`` are written; a key missing from the
        item is omitted from the record.
        """
        row = []
        for key in self._FIELDS:
            try:
                row.append((key, item[key]))
            except KeyError:
                # Unset fields are skipped; any other exception is allowed to propagate.
                pass
        line = OrderedDict(row)

        self.counter += 1
        encoded = json.dumps(line)
        # The first record follows "[" directly; later ones are comma-separated.
        self.file.write(encoded if self.counter == 1 else ",\n" + encoded)
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: bot name and the package that holds (and receives new) spiders.
BOT_NAME = 'tribunaHn'
SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: the line below is commented out, so Scrapy's built-in default applies.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Active override of the default delay (unit as defined in the documentation linked above).
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
La Tribuna, Honduras
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd tribunaHn/
$ scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=28
"""
import scrapy, re
from tribunaHn.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """
    Return ``text`` with every substring matched by ``TAG_RE`` deleted.
    """
    without_markup = TAG_RE.sub('', text)
    return without_markup
class UTC(tzinfo):
    """
    Fixed-offset time zone six hours behind UTC (used for Honduras).

    All three ``tzinfo`` hooks are implemented.  ``dst()`` is required by
    ``datetime`` helpers such as ``timetuple()``; without it they raise
    ``NotImplementedError``.
    """

    def utcoffset(self, dt):
        ## Time zone for Honduras: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'

    def dst(self, dt):
        ## The offset is constant, so no daylight-saving adjustment applies ##
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects the news published by La Tribuna on a single date.

    The date is supplied through the spider arguments ``year``, ``month``
    and ``day`` (``-a year=... -a month=... -a day=...``).
    """
    name = "noticias"

    def start_requests(self):
        """
        Build the request for the archive page of the requested date.

        Also stores the date as an ISO-8601 string (offset -06:00) in
        ``self.news_date`` so that every item can be stamped with it.

        Raises ValueError when an argument is missing or when the three
        values do not form a valid calendar date.
        """
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        if year is None or month is None or day is None:
            # Fail with an actionable message instead of the TypeError that
            # int(None) would raise further down.
            raise ValueError(
                "spider 'noticias' requires the arguments "
                "-a year=YYYY -a month=M -a day=D"
            )
        # Arguments arrive as strings from the command line, but may be ints
        # when the spider is started programmatically; normalise to str so
        # the URL concatenation below works in both cases.
        year, month, day = str(year), str(month), str(day)
        self.news_date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
        baseURL = 'http://www.latribuna.hn/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        """
        Follow every article link of an archive page, then the next page.
        """
        for link in response.xpath('//div[@id="main"]').css('h3 > a::attr(href)').extract():
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)
        next_page = response.css('span.next > a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_item(self, response):
        """
        Extract one news item (date, title, topic, text, url) from an
        article page.

        ``title`` and ``topic`` are None when the corresponding element is
        not found; ``topic`` is taken from the first tag link only.
        """
        item = NoticiasItem()
        title = response.css('article.article-post').css('h1').extract_first()
        if title is not None:
            title = remove_tags(title)
        topic = None
        topic_list = response.css('aside.tags').css('li > a').extract()
        if len(topic_list) > 0:
            topic = remove_tags(topic_list[0])
        # One line per paragraph; surrounding whitespace is trimmed below.
        paragraphs = response.css('div.article-post-content').css('p').extract()
        text = '\n'.join(remove_tags(p) for p in paragraphs)
        ## News item info ##
        item['date'] = self.news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url
        yield item
This diff is collapsed.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """
    Scrapy item with the fields title, text, date, location, author, topic
    and url.  Every field is a bare ``scrapy.Field()`` with no metadata.
    """
    # NOTE(review): field meanings are inferred from their names only;
    # confirm against the spider that fills them.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class HeraldoagsItem(scrapy.Item):
    """
    Scrapy item with the fields date, title, text, location, author, topic
    and url.  Every field is a bare ``scrapy.Field()`` with no metadata.
    """
    # NOTE(review): field meanings are inferred from their names only;
    # confirm against the spider that fills them.
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class HeraldoagsSpiderMiddleware(object):
class HeraldoagsSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
......@@ -20,30 +21,29 @@ class HeraldoagsSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider):
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(response, result, spider):
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(response, exception, spider):
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(start_requests, spider):
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
......@@ -54,3 +54,50 @@ class HeraldoagsSpiderMiddleware(object):
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class HeraldoagsDownloaderMiddleware:
    """
    Pass-through downloader middleware.

    Requests, responses and exceptions are left untouched; the only visible
    effect is an info log line when a spider is opened.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        """
        Create the middleware and subscribe it to the spider_opened signal.
        """
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        return None

    def spider_opened(self, spider):
        """
        Log the name of the spider that has just been opened.
        """
        # Pass the name as a logging argument so the message is only
        # formatted when the record is actually emitted.
        spider.logger.info('Spider opened: %s', spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class HeraldoagsPipeline:
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for heraldoAgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'heraldoAgs'
SPIDER_MODULES = ['heraldoAgs.spiders']
NEWSPIDER_MODULE = 'heraldoAgs.spiders'
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'heraldoAgs.middlewares.HeraldoagsSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'heraldoAgs.middlewares.MyCustomDownloaderMiddleware': 543,
# 'heraldoAgs.middlewares.HeraldoagsDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'heraldoAgs.pipelines.JsonWriterPipeline': 300,
}
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'heraldoAgs.pipelines.HeraldoagsPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment