update

parent 9d89a5cc
# M³ - Descarga de Noticias e Texto # M³ - Descarga de Noticias e Texto
Rastreadores (crawlers) para medios escritos de información en línea basados en [Scrapy](http://scrapy.org/). Rastreadores (crawlers) para medios escritos de información en línea basados en [Scrapy](http://scrapy.org/).
Los rastreadores están divididos en tres clases. Los rastreadores están divididos en tres clases:
* spiders/daily: Sitios cuya versión impresa es publicada diariamente. * spiders/daily: Sitios cuya versión impresa es publicada diariamente.
* spiders/monthly: Sitios de publicación mensual. * spiders/monthly: Sitios de publicación mensual.
......
#!/usr/bin/python3
import sys
import datetime
import glob
import json
import os
#===============================================================================
#===============================================================================
def main(argv):
    """Launch the site crawler for every site directory under a root directory.

    Every immediate subdirectory of ``argv[1]`` is expected to hold a
    ``settings.json`` whose ``"crawler"`` entry names a spider directory
    relative to ``<current dir>/spiders``.  For each site the start is
    appended to ``<rootdir>/<YYYY-MM-DD>.log`` and ``siteCrawler.py`` is run
    from inside the spider directory with the site directory as argument.

    Args:
        argv: ``[prog, directory]`` or ``[prog, directory, endDate]``.  The
            optional third value is accepted but not used here.
    """
    import subprocess  # local import: only this entry point needs it

    if len(argv) not in (2, 3):
        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
        return

    # Spider directories are resolved against the directory the script was
    # started from; the process never changes directory, so this stays valid
    # for every site.
    cwd = os.getcwd()
    rootdir = argv[1]
    log_name = datetime.datetime.today().strftime('%Y-%m-%d') + ".log"

    with open(rootdir + "/" + log_name, "a") as logfile:
        for path in glob.glob(f'{rootdir}/*/'):
            try:
                with open(path + 'settings.json') as json_file:
                    crawler = json.load(json_file)["crawler"]
            except (OSError, ValueError, KeyError) as err:
                # One broken site directory must not abort the remaining ones.
                logfile.write("Skipping " + path + ": " + str(err) + "\n")
                continue

            logfile.write("Crawler " + crawler + " started at: "
                          + datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S") + "\n")

            spider_dir = cwd + "/spiders/" + crawler
            if not os.path.exists(spider_dir):
                logfile.write("\t spider not found.\n")
                continue

            # The child runs inside the spider directory, so a relative site
            # path would no longer resolve there: pass it as an absolute path,
            # keeping the trailing separator that glob produced.
            site_dir = os.path.join(os.path.abspath(path), "")
            # Argument list (no shell): directory names containing spaces or
            # shell metacharacters are passed through untouched.
            subprocess.run(
                ["python3", "../../../scripts/siteCrawler.py", site_dir],
                cwd=spider_dir,
            )
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv)
#!/usr/bin/python3 #!/usr/bin/python3
#Author: Mario Chirinos Colunga # File: siteCrawler.py
# Author: Mario Chirinos Colunga
# Daily News Site Crawler
#=============================================================================== #===============================================================================
import sys import sys
import os import os
......
...@@ -16,8 +16,9 @@ def remove_tags(text): ...@@ -16,8 +16,9 @@ def remove_tags(text):
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
class NoticiasSpider(scrapy.Spider): class NoticiasSpider(scrapy.Spider):
name = 'noticias' name = 'noticias'
allowed_domains = ['elfinanciero.com'] allowed_domains = ['elfinanciero.com.mx']
start_urls = ['http://elfinanciero.com/'] start_urls = ['https://elfinanciero.com.mx/']
urllist=[] urllist=[]
def start_requests(self): def start_requests(self):
print("start_urls") print("start_urls")
......
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DesdeelbalconSpiderMiddleware(object):
    """Pass-through spider middleware for the desdeElBalcon project.

    Every hook forwards its input unchanged.  The hooks are instance methods,
    so each takes ``self`` first; without it any call on an instance raised
    ``TypeError``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the spider_opened signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of *result* unchanged."""
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Take no action on spider exceptions (returns None)."""
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged."""
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Scrapy settings for desdeElBalcon project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# --- Project identity ---------------------------------------------------------
BOT_NAME = "desdeElBalcon"
NEWSPIDER_MODULE = "desdeElBalcon.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Optional user-agent identifying the crawler and its owner (not set).
#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'

# robots.txt handling is not configured: the line below stays commented out.
# ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16); not overridden.
#CONCURRENT_REQUESTS = 32

# --- Politeness ---------------------------------------------------------------
# Delay, in seconds, between requests to the same website (default: 0).
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings and docs.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are turned off for this project (Scrapy enables them by default).
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'desdeElBalcon.middlewares.DesdeelbalconSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'desdeElBalcon.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Single active pipeline: the project's JSON writer, registered with order 300.
ITEM_PIPELINES = {'desdeElBalcon.pipelines.JsonWriterPipeline': 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# -*- coding: utf-8 -*-
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
"""
MEDIO:
Desde el Balcon, Yucatan
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# One markup tag: "<", at least one character that is not ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every match of TAG_RE deleted."""
    without_tags = TAG_RE.sub('', text)
    return without_tags
class UTC(tzinfo):
    """Fixed-offset time zone six hours behind UTC.

    Despite its name this class does not represent UTC itself: it is used to
    stamp article dates with the UTC-6 offset.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC (minus six hours)."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the zone label, 'UTC-6'."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero adjustment: the offset is fixed.

        Without this method ``datetime.dst()`` and ``datetime.timetuple()``
        raise NotImplementedError for datetimes carrying this zone.
        """
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """Spider that collects the Desde el Balcon posts archived under one date.

    The date is supplied through the spider arguments ``year``, ``month`` and
    ``day`` (for example ``-a year=2017 -a month=3 -a day=22``).
    """
    name = "noticias"

    def start_requests(self):
        """Build the date-archive URL from the spider arguments and request it."""
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.baseURL = 'http://www.desdeelbalcon.com/' + self.year + '/' + self.month + '/' + self.day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule the first archive page and every further numbered page."""
        print(response.url)
        # The archive URL was already downloaded for this callback, so the
        # duplicate filter has to be bypassed to hand it to parse_page.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            # The last pagination link ends with the highest page number.
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        """Request every post listed on an archive page.

        A fresh item is created per post.  Sharing one item between all the
        requests of a page let concurrent ``parse_item`` callbacks overwrite
        each other's fields.
        """
        date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat('T')
        for post in response.xpath('//ul[@class="archivepost"]/li'):
            link = post.xpath('./h2/a/@href').extract_first()
            if not link:
                # Entry without a link: there is nothing to request.
                continue
            item = NoticiasItem()
            item['date'] = date
            item['topic'] = post.xpath('./p/a/text()').extract()
            request = scrapy.Request(url=link, callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        """Complete the item carried in ``response.meta`` with title, text and URL."""
        item = response.meta['item']
        item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
        paragraphs = response.xpath('//div[@itemprop="text"]/p').extract()
        # One line per paragraph, each terminated by a newline.
        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
        item['url'] = response.url
        print (item['title'])
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = desdeElBalcon.settings
[deploy]
#url = http://localhost:6800/
project = desdeElBalcon
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DiariocolatinoItem(scrapy.Item):
    """Item holding the fields scraped for one diarioCoLatino page."""

    # Declaration order is kept as in the original definition.
    date = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class DiariocolatinoSpiderMiddleware:
    """Spider middleware whose hooks forward everything unchanged."""

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and hook it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let the response through to the spider; always returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Re-emit each element produced by the spider, in order."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do nothing about the exception; implicitly returns None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-emit each start request, in order."""
        yield from start_requests

    def spider_opened(self, spider):
        """Write an info log line naming the opened spider."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class DiariocolatinoDownloaderMiddleware:
    """Downloader middleware whose hooks leave requests and responses untouched."""

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and hook it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Return None for every request, i.e. take no action on it."""
        return None

    def process_response(self, request, response, spider):
        """Hand back the very response object that was received."""
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing about the exception; returns None."""
        return None

    def spider_opened(self, spider):
        """Write an info log line naming the opened spider."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class DiariocolatinoPipeline:
    """Item pipeline that performs no processing."""

    def process_item(self, item, spider):
        """Return the received item object as is."""
        unchanged = item
        return unchanged
# Scrapy settings for diarioCoLatino project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# --- Project identity ---------------------------------------------------------
BOT_NAME = "diarioCoLatino"
NEWSPIDER_MODULE = "diarioCoLatino.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Encoding used for exported feeds.
FEED_EXPORT_ENCODING = "utf-8"

# Optional user-agent identifying the crawler and its owner (not set).
#USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'

# robots.txt rules are obeyed.
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioCoLatino.middlewares.DiariocolatinoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioCoLatino.middlewares.DiariocolatinoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'diarioCoLatino.pipelines.DiariocolatinoPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import scrapy
from diarioCoLatino.items import DiariocolatinoItem
import re
#-------------------------------------------------------------------------------
# Pattern for a single tag: an opening "<", one or more non-">" characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Delete every TAG_RE match from *text* and return the remainder."""
    return re.sub(TAG_RE, '', text)
#-----------------------------------
#-------------------------------------------------------------------------------
class NoticiasSpider(scrapy.Spider):
    """Spider that collects the Diario Co Latino articles archived under one date.

    The date is supplied through the spider arguments ``year``, ``month`` and
    ``day``; month and day are zero-padded to two digits in the archive URL.
    """
    name = 'noticias'
    allowed_domains = ['diariocolatino.com']
    start_urls = ['http://diariocolatino.com/']

    def start_requests(self):
        """Build the date-archive URL from the spider arguments and request it."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
        yield scrapy.Request(url=self.baseURL, callback=self.parseDate)

    #-----------------------------------------------------------------------
    def parseDate(self, response):
        """Follow the "next page" links and request every article listed."""
        print(response.url)
        for page in response.css('span#tie-next-page').css("a::attr(href)").extract():
            yield scrapy.Request(url=page, callback=self.parseDate)

        for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parseItem)

    #-----------------------------------------------------------------------
    def parseItem(self, response):
        """Build one item from an article page.

        Title and topic are stored as None when the page lacks them, instead
        of failing on a method call on None and losing the whole article.
        """
        print(response.url)
        item = DiariocolatinoItem()
        item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()

        title = response.xpath('//meta[@property="og:title"]/@content').extract_first()
        # Drop the site suffix appended to the page title.
        item["title"] = title.replace("- Diario Co Latino", "") if title is not None else None

        topic = response.css("span.post-cats").css("a::text").extract_first()
        item["topic"] = topic.lower() if topic is not None else None

        paragraphs = response.xpath('//div[@class="entry"]/p').extract()
        # One line per paragraph, each terminated by a newline.
        item["text"] = "".join(remove_tags(p) + "\n" for p in paragraphs)
        item["url"] = response.url
        print(item)
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioCoLatino.settings
[deploy]
#url = http://localhost:6800/
project = diarioCoLatino
# -*- coding: utf-8 -*-
import scrapy, re
from edoMexDia.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
MEDIO:
EDOMEX al Día, Estado de México
USO:
scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
"""
# A tag is "<" followed by one or more characters other than ">" and a ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Strip every TAG_RE match out of *text*."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class UTC(tzinfo):
    """Fixed-offset time zone six hours behind UTC.

    Despite its name this class does not represent UTC itself; it models a
    constant UTC-6 offset.
    """

    #-----------------------------------------------------------------------
    def utcoffset(self, dt):
        """Return the constant offset from UTC (minus six hours)."""
        return timedelta(hours=-6)

    #-----------------------------------------------------------------------
    def tzname(self, dt):
        """Return the zone label, 'UTC-6'."""
        return 'UTC-6'

    #-----------------------------------------------------------------------
    def dst(self, dt):
        """Return a zero adjustment: the offset is fixed.

        Without this method ``datetime.dst()`` and ``datetime.timetuple()``
        raise NotImplementedError for datetimes carrying this zone.
        """
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """Spider that collects the EDOMEX al Día articles archived under one date.

    The date is supplied through the spider arguments ``year``, ``month`` and
    ``day``; month and day are zero-padded to two digits in the archive URL.
    """
    name = "noticias"

    def start_requests(self):
        """Build the date-archive URL and request it with parse_page as callback."""
        self.tz = UTC()
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        self.baseURL = "http://edomexaldia.com/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2)
        yield scrapy.Request(url=self.baseURL, callback=self.parse_page)

    #-----------------------------------------------------------------------
    def parse(self, response):
        """Re-request the response URL for parse_page, bypassing the duplicate filter.

        start_requests sends its request straight to parse_page, so this
        callback only runs for requests created without an explicit callback.
        """
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        # TODO(review): the former "last page" pagination walk over
        # //div[@class="numbered-pagination"] is disabled; only the first
        # archive page is crawled.

    #-----------------------------------------------------------------------
    def parse_page(self, response):
        """Request every article linked from an archive page."""
        print("parse page", response.url)
        for link in response.xpath('//main[@id="main"]/article/header/h2[@class="entry-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)
        # TODO(review): following the a[@class="pagi-next"] link is disabled.

    #-----------------------------------------------------------------------
    def parse_item(self, response):
        """Build and yield one item from an article page.

        The item is yielded so that it reaches the item pipelines; with the
        yield commented out the spider printed each item but produced no
        output.  The topic is always stored as None.
        """
        print("print item", response.url)
        item = NoticiasItem()
        item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()

        title = response.xpath("//meta[@property='og:title']/@content").extract_first()
        # Drop the site suffix; keep None when the page has no og:title.
        item['title'] = title.replace(" - Edomex Al Día", "") if title is not None else None

        paragraphs = response.xpath('//div[@class="entry-content"]/p/text()').extract()
        item['text'] = "\n".join(remove_tags(p) for p in paragraphs)
        item['topic'] = None
        # TODO(review): author extraction from span.post_author_author is disabled.
        item['url'] = response.url
        print(item)
        yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Item holding the fields scraped for one news page."""

    # Declaration order is kept as in the original definition.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElcomentarioSpiderMiddleware(object):
    """Spider middleware for elComentario that forwards everything unchanged."""

    @classmethod
    def from_crawler(cls, crawler):
        """Build an instance and register its spider_opened handler."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_spider_input(self, response, spider):
        """Let the response reach the spider; the result is always None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Pass along each element the spider produced, in order."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Ignore the exception; the result is None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Pass along each start request, in order."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log, at info level, which spider was opened."""
        opened_message = 'Spider opened: %s' % spider.name
        spider.logger.info(opened_message)
class ElcomentarioDownloaderMiddleware(object):
    """Downloader middleware for elComentario that changes nothing."""

    @classmethod
    def from_crawler(cls, crawler):
        """Build an instance and register its spider_opened handler."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
        return instance

    def process_request(self, request, spider):
        """Take no action on the request; the result is always None."""
        return None

    def process_response(self, request, response, spider):
        """Give back the received response object itself."""
        return response

    def process_exception(self, request, exception, spider):
        """Ignore the exception; the result is None."""
        return None

    def spider_opened(self, spider):
        """Log, at info level, which spider was opened."""
        opened_message = 'Spider opened: %s' % spider.name
        spider.logger.info(opened_message)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Pipeline that writes every item into one JSON array file.

    The target path is read from the ``filename`` crawler setting.  Items are
    written one per line, comma separated, between ``[`` and ``]``.
    """

    # Order in which the item fields appear in every JSON object.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline with the value of the ``filename`` setting."""
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        """Open the output file and start the JSON array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """Append *item* to the output file and return it unchanged.

        Fields that are not set on the item are left out of the JSON object.
        Only the "field missing" error (KeyError) is tolerated; any other
        failure propagates instead of being silently swallowed.
        """
        row = []
        for field in self._FIELD_ORDER:
            try:
                row.append((field, item[field]))
            except KeyError:
                # Field not populated for this item: skip it.
                continue
        line = OrderedDict(row)

        self.counter += 1
        if self.counter > 1:
            # Every object after the first goes on its own line.
            self.file.write(",\n")
        self.file.write(json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for elComentario project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# --- Project identity ---------------------------------------------------------
BOT_NAME = "elComentario"
NEWSPIDER_MODULE = "elComentario.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Optional user-agent identifying the crawler and its owner (not set).
#USER_AGENT = 'elComentario (+http://www.yourdomain.com)'

# robots.txt handling is not configured: the line below stays commented out.
# ROBOTSTXT_OBEY = True

# Maximum concurrent requests performed by Scrapy (default: 16); not overridden.
#CONCURRENT_REQUESTS = 32

# --- Politeness ---------------------------------------------------------------
# Delay, in seconds, between requests to the same website (default: 0).
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings and docs.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are turned off for this project (Scrapy enables them by default).
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elComentario.middlewares.ElcomentarioSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elComentario.middlewares.ElcomentarioDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Single active pipeline: the project's JSON writer, registered with order 300.
ITEM_PIPELINES = {'elComentario.pipelines.JsonWriterPipeline': 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# -*- coding: utf-8 -*-
"""
MEDIA:
El Comentario, Colima
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elComentario/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elComentario.items import NoticiasItem
# Matches "<", then one or more characters that are not ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return a copy of *text* in which every TAG_RE match is removed."""
    result = TAG_RE.sub('', text)
    return result
class QuotesSpider(scrapy.Spider):
    """Spider that collects the El Comentario articles archived under one date.

    The date is supplied through the spider arguments ``year``, ``month`` and
    ``day``; month and day are zero-padded to two digits in the archive URL.
    """
    name = "noticias"

    def start_requests(self):
        """Build the date-archive URL from the spider arguments and request it."""
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))

        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        """Request every numbered archive page of the day.

        The page count is read from the "last page" pagination link.  When
        that link is absent the response itself is processed as the only
        page, instead of failing on None and producing no items.
        """
        print("parse", response.url)
        last_page = response.css("li.mkd-pagination-last-page").css("a::attr(href)").extract_first()
        found = re.search(r'page/(\d+)', last_page or '')
        if found is None:
            yield from self.parse_page(response)
            return

        # The archive URL ends with "/": strip it so the page URLs do not
        # contain a double slash.
        archive = response.url.rstrip("/")
        for number in range(1, int(found.group(1)) + 1):
            yield scrapy.Request(url=archive + "/page/" + str(number), callback=self.parse_page)

    def parse_page(self, response):
        """Request every article linked from an ``h5`` heading of the page."""
        print("parse_page", response.url)
        for link in response.css("h5").css("a::attr(href)").extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Build and yield one item from an article page."""
        print("parse_item", response.url)
        item = NoticiasItem()
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
        item["topic"] = response.css("div.mkd-post-info-category").css("a::text").extract_first()

        paragraphs = response.css("div.pf-content").css("p").extract()
        # One line per paragraph; surrounding whitespace is trimmed at the end.
        item['text'] = "".join(remove_tags(p) + "\n" for p in paragraphs).strip()
        item['url'] = response.url
        print(item)
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elComentario.settings
[deploy]
#url = http://localhost:6800/
project = elComentario
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scrapy item holding the fields collected for one news article."""

    # Headline and body text of the article.
    title = scrapy.Field()
    text = scrapy.Field()

    # Publication metadata.
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()

    # Address the article was downloaded from.
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class LajornadaagsSpiderMiddleware(object):
    """
    Pass-through spider middleware generated from the Scrapy project template.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the spider middleware does not modify the passed objects.

    The ``process_*`` hooks are instance methods and therefore take ``self``;
    without it, calling them on an instance raised ``TypeError``.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response going into the spider; returns None to continue."""
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of ``result`` unchanged."""
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when the spider or ``process_spider_input`` raises; returns None."""
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged."""
        # Works like process_spider_output(), except that there is no
        # response associated. Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes all scraped items to one file as a JSON array.

    The output path comes from the ``filename`` crawler setting (the usage
    notes in this repository pass it as ``-s filename=...``). Items are written
    one JSON object per line, separated by ``",\\n"`` and wrapped in ``[ ]``.
    """

    # Keys copied from each item, in the order they appear in the output.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``filename`` value in the crawler settings."""
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and write the opening bracket of the array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket of the array and close the file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """
        Append ``item`` to the output file and return it unchanged.

        Only the keys in ``_FIELD_ORDER`` are written; keys the item does not
        carry are skipped. Only ``KeyError`` is treated as "missing" - any
        other error is allowed to propagate instead of being hidden.
        """
        line = OrderedDict()
        for key in self._FIELD_ORDER:
            try:
                line[key] = item[key]
            except KeyError:
                continue

        # Serialize first so a failure leaves both the file and the counter
        # untouched, keeping the separators consistent for later items.
        encoded = json.dumps(line)
        separator = ",\n" if self.counter else ""
        self.file.write(separator + encoded)
        self.counter += 1

        return item
# -*- coding: utf-8 -*-
# Scrapy settings for laJornadaAgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# Project identity: bot name and the package where spiders are looked up /
# where new spiders are generated.
BOT_NAME = 'laJornadaAgs'
SPIDER_MODULES = ['laJornadaAgs.spiders']
NEWSPIDER_MODULE = 'laJornadaAgs.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornadaAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: the line below is commented out, so this file does not set
# ROBOTSTXT_OBEY and whatever default Scrapy ships with applies.
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Half a second between requests to the same site.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornadaAgs.middlewares.LajornadaagsSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornadaAgs.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# The only active pipeline is the project's JSON writer, registered with
# order value 300.
ITEM_PIPELINES = {
'laJornadaAgs.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaAgs.items import NoticiasItem
"""
MEDIO:
La Jornada Aguascalientes, Ags.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
# Pattern for a single markup tag: "<", one or more non-">" characters, ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Delete every match of ``TAG_RE`` from ``text`` and return the result."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class QuotesSpider(scrapy.Spider):
    """
    Spider for the daily archive of La Jornada Aguascalientes.

    Reads the spider arguments ``year``, ``month`` and ``day`` (passed with
    ``-a``), walks the archive listing pages for that date and yields one
    ``NoticiasItem`` per article.
    """
    name = "noticias"

    # Captures the number in pagination links shaped like ".../page/4/".
    _PAGE_NUMBER_RE = re.compile(r'/page/(\d+)')

    def start_requests(self):
        """Yield the request for the first archive listing page of the date."""
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        # Kept on the instance because parse() builds the other page URLs from it.
        self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """
        Collect articles from the first listing page and request the others.

        The first page is parsed from the response already in hand instead of
        being downloaded a second time. The number of pages is the largest
        page number found in the pagination links, so a pagination block with
        a single link no longer raises ``IndexError``.
        """
        yield from self.parse_page(response)

        pagination = response.css('div.vw-page-navigation-pagination').css('a::attr(href)').extract()
        page_numbers = [
            int(number)
            for href in pagination
            for number in self._PAGE_NUMBER_RE.findall(href)
        ]
        if not page_numbers:
            return

        # Page 1 was handled above; request pages 2..last.
        for page in range(2, max(page_numbers) + 1):
            yield scrapy.Request(url=self.baseURL+"/page/"+str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Yield one request per article link found in a listing page."""
        for link in response.css('div.vw-post-loop-inner').css('div.vw-post-box-inner').xpath('./h3/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract date, title, topic, author, text and URL from an article page."""
        item = NoticiasItem()

        # The date is taken verbatim from the <time> element's datetime attribute
        # (the original note says it already includes the time zone).
        item['date'] = response.xpath('//time[@itemprop="datePublished"]/@datetime').extract_first()

        # title, topic and author are each stored as None when their element
        # is missing; title used to crash inside remove_tags() in that case.
        title = response.xpath('//div[@class="vw-page-content"]/article/h1[@class="entry-title"]').extract_first()
        item['title'] = remove_tags(title) if title is not None else title

        topic = response.xpath('//div[@class="vw-page-content"]/article/div[@class="vw-post-categories"]/a').extract_first()
        item['topic'] = remove_tags(topic) if topic is not None else topic

        author = response.xpath('//span[@itemprop="author"]/a[@class="author-name"]').extract_first()
        item['author'] = remove_tags(author) if author is not None else author

        # Each paragraph is followed by a newline, including the last one.
        paragraphs = response.xpath('//div[@itemprop="articleBody"]/p').extract()
        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
        item['url'] = response.url

        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornadaAgs.settings
[deploy]
#url = http://localhost:6800/
project = laJornadaAgs
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Scrapy item holding the fields collected for one news article."""

    # Headline and body text of the article.
    title = scrapy.Field()
    text = scrapy.Field()

    # Publication metadata.
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()

    # Address the article was downloaded from.
    url = scrapy.Field()
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
from scrapy import signals from scrapy import signals
class EdomexdiaSpiderMiddleware(object): class LectormxSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
......
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """
    Item pipeline that writes all scraped items to one file as a JSON array.

    The output path comes from the ``filename`` crawler setting (the usage
    notes in this repository pass it as ``-s filename=...``). Items are written
    one JSON object per line, separated by ``",\\n"`` and wrapped in ``[ ]``.
    """

    # Keys copied from each item, in the order they appear in the output.
    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``filename`` value in the crawler settings."""
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        """Open the output file and write the opening bracket of the array."""
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        """Write the closing bracket of the array and close the file."""
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        """
        Append ``item`` to the output file and return it unchanged.

        Only the keys in ``_FIELD_ORDER`` are written; keys the item does not
        carry are skipped. Only ``KeyError`` is treated as "missing" - any
        other error is allowed to propagate instead of being hidden.
        """
        line = OrderedDict()
        for key in self._FIELD_ORDER:
            try:
                line[key] = item[key]
            except KeyError:
                continue

        # Serialize first so a failure leaves both the file and the counter
        # untouched, keeping the separators consistent for later items.
        encoded = json.dumps(line)
        separator = ",\n" if self.counter else ""
        self.file.write(separator + encoded)
        self.counter += 1

        return item
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Scrapy settings for edoMexDia project # Scrapy settings for lectorMX project
# #
# For simplicity, this file contains only settings considered important or # For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation: # commonly used. You can find more settings consulting the documentation:
...@@ -9,14 +9,14 @@ ...@@ -9,14 +9,14 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'edoMexDia' BOT_NAME = 'lectorMX'
SPIDER_MODULES = ['edoMexDia.spiders'] SPIDER_MODULES = ['lectorMX.spiders']
NEWSPIDER_MODULE = 'edoMexDia.spiders' NEWSPIDER_MODULE = 'lectorMX.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'edoMexDia (+http://www.yourdomain.com)' #USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
# ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
...@@ -47,13 +47,13 @@ COOKIES_ENABLED = False ...@@ -47,13 +47,13 @@ COOKIES_ENABLED = False
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = { #SPIDER_MIDDLEWARES = {
# 'edoMexDia.middlewares.EdomexdiaSpiderMiddleware': 543, # 'lectorMX.middlewares.LectormxSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = { #DOWNLOADER_MIDDLEWARES = {
# 'edoMexDia.middlewares.MyCustomDownloaderMiddleware': 543, # 'lectorMX.middlewares.MyCustomDownloaderMiddleware': 543,
#} #}
# Enable or disable extensions # Enable or disable extensions
...@@ -65,7 +65,7 @@ COOKIES_ENABLED = False ...@@ -65,7 +65,7 @@ COOKIES_ENABLED = False
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = { ITEM_PIPELINES = {
'edoMexDia.pipelines.JsonWriterPipeline': 300, 'lectorMX.pipelines.JsonWriterPipeline': 300,
} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
......
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Lector MX, Yucatán
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd lectorMX/
$ scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
"""
import scrapy, re
from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
# A markup tag is "<", followed by one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return a copy of ``text`` in which every ``TAG_RE`` match is removed."""
    plain = TAG_RE.sub('', text)
    return plain
class UTC(tzinfo):
    """
    Fixed-offset time zone used to stamp the scraped news date.

    Despite its name, the offset is six hours behind UTC (the value the
    original author chose for Yucatán). No daylight-saving rule is modelled.
    """

    def utcoffset(self, dt):
        """Return the constant offset from UTC: minus six hours."""
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the label of this time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """
        Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which broke
        ``datetime.dst()`` and ``datetime.timetuple()`` on aware datetimes
        using this class.
        """
        return timedelta(0)
class QuotesSpider(scrapy.Spider):
    """
    Spider that collects the articles Lector MX lists for one calendar date.

    The date comes from the spider arguments ``year``, ``month`` and ``day``.
    """
    name = "noticias"

    def start_requests(self):
        """Store the ISO date for the items and request the date archive page."""
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        # Every item of this run is stamped with this same midnight timestamp.
        requested_day = datetime(int(year), int(month), int(day), tzinfo=tz)
        self.news_date = requested_day.isoformat('T')

        archive_url = 'http://lectormx.com/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=archive_url, callback=self.parse)

    def parse(self, response):
        """Request every article on a listing page, then the next listing page."""
        article_urls = response.css('div.paginated_content').css('h2.entry-title > a::attr(href)').extract()
        for article_url in article_urls:
            yield scrapy.Request(url=article_url, callback=self.parse_item)

        following = response.css('div.archive-pagination').css('a.next::attr(href)').extract_first()
        if following is None:
            # No "next" link: this was the last listing page.
            return
        yield scrapy.Request(url=following, callback=self.parse)

    def parse_item(self, response):
        """Build the item for one article page."""
        raw_title = response.css('h1.entry-title').extract_first()
        title = None if raw_title is None else remove_tags(raw_title)

        raw_topic = response.xpath('//a[@rel="tag"]').extract_first()
        topic = None if raw_topic is None else remove_tags(raw_topic)

        # One line per paragraph with markup removed.
        paragraphs = response.css('div.entry-content > p').extract()
        body = ''.join(remove_tags(p) + '\n' for p in paragraphs)

        item = NoticiasItem()
        item['date'] = self.news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = body.strip()
        item['url'] = response.url

        yield item
...@@ -4,8 +4,8 @@ ...@@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.org/en/latest/deploy.html # https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings] [settings]
default = edoMexDia.settings default = lectorMX.settings
[deploy] [deploy]
#url = http://localhost:6800/ #url = http://localhost:6800/
project = edoMexDia project = lectorMX
# -*- coding: utf-8 -*-
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
class NoticiasItem(scrapy.Item): class DesdeelbalconItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
date = scrapy.Field()
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
......
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware # Define here the models for your spider middleware
# #
# See documentation in: # See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class DesdeelbalconSpiderMiddleware(object): class DesdeelbalconSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
...@@ -20,30 +21,29 @@ class DesdeelbalconSpiderMiddleware(object): ...@@ -20,30 +21,29 @@ class DesdeelbalconSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s return s
def process_spider_input(response, spider): def process_spider_input(self, response, spider):
# Called for each response that goes through the spider # Called for each response that goes through the spider
# middleware and into the spider. # middleware and into the spider.
# Should return None or raise an exception. # Should return None or raise an exception.
return None return None
def process_spider_output(response, result, spider): def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after # Called with the results returned from the Spider, after
# it has processed the response. # it has processed the response.
# Must return an iterable of Request, dict or Item objects. # Must return an iterable of Request, or item objects.
for i in result: for i in result:
yield i yield i
def process_spider_exception(response, exception, spider): def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method # Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict # Should return either None or an iterable of Request or item objects.
# or Item objects.
pass pass
def process_start_requests(start_requests, spider): def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works # Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except # similarly to the process_spider_output() method, except
# that it doesn’t have a response associated. # that it doesn’t have a response associated.
...@@ -54,3 +54,50 @@ class DesdeelbalconSpiderMiddleware(object): ...@@ -54,3 +54,50 @@ class DesdeelbalconSpiderMiddleware(object):
def spider_opened(self, spider): def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name) spider.logger.info('Spider opened: %s' % spider.name)
class DesdeelbalconDownloaderMiddleware:
    """
    Pass-through downloader middleware generated from the Scrapy template.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the middleware and hook it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """
        Called for each request that goes through the downloader middleware.

        Must either return None (continue processing this request), return a
        Response object, return a Request object, or raise IgnoreRequest (the
        process_exception() methods of installed downloader middleware will
        then be called). This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """
        Called with the response returned from the downloader.

        Must either return a Response object, return a Request object, or
        raise IgnoreRequest. This implementation returns ``response`` as is.
        """
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when a download handler or a process_request() (from other
        downloader middleware) raises an exception.

        Must either return None (continue processing this exception), or a
        Response / Request object (either one stops the process_exception()
        chain). This implementation returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name # useful for handling different item types with a single interface
return cls(filename) from itemadapter import ItemAdapter
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class DesdeelbalconPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
# -*- coding: utf-8 -*-
# Scrapy settings for desdeElBalcon project # Scrapy settings for desdeElBalcon project
# #
# For simplicity, this file contains only settings considered important or # For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation: # commonly used. You can find more settings consulting the documentation:
# #
# http://doc.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'desdeElBalcon' BOT_NAME = 'desdeElBalcon'
SPIDER_MODULES = ['desdeElBalcon.spiders'] SPIDER_MODULES = ['desdeElBalcon.spiders']
NEWSPIDER_MODULE = 'desdeElBalcon.spiders' NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)' #USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
# ROBOTSTXT_OBEY = True ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5 #DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
COOKIES_ENABLED = False #COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False ...@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
#} #}
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = { #SPIDER_MIDDLEWARES = {
# 'desdeElBalcon.middlewares.DesdeelbalconSpiderMiddleware': 543, # 'desdeElBalcon.middlewares.DesdeelbalconSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = { #DOWNLOADER_MIDDLEWARES = {
# 'desdeElBalcon.middlewares.MyCustomDownloaderMiddleware': 543, # 'desdeElBalcon.middlewares.DesdeelbalconDownloaderMiddleware': 543,
#} #}
# Enable or disable extensions # Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html # See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = { #EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = { #ITEM_PIPELINES = {
'desdeElBalcon.pipelines.JsonWriterPipeline': 300, # 'desdeElBalcon.pipelines.DesdeelbalconPipeline': 300,
} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True #AUTOTHROTTLE_ENABLED = True
# The initial download delay # The initial download delay
#AUTOTHROTTLE_START_DELAY = 5 #AUTOTHROTTLE_START_DELAY = 5
...@@ -82,7 +80,7 @@ ITEM_PIPELINES = { ...@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False #AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True #HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_DIR = 'httpcache'
......
# -*- coding: utf-8 -*- import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
"""
MEDIO:
Desde el Balcon, Yucatan
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
from desdeElBalcon.items import DesdeelbalconItem
import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasSpider(scrapy.Spider):
name = 'noticias'
class UTC(tzinfo): allowed_domains = ['desdeelbalcon.com']
"""clase para el 'time zone' (zona horaria)""" start_urls = ['http://desdeelbalcon.com/']
def utcoffset(self, dt):
# zona horaria para yucatan (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC() year = getattr(self, "year", None)
self.year = getattr(self, 'year', None) month = getattr(self, "month", None)
self.month = getattr(self, 'month', None) day = getattr(self, "day", None)
self.day = getattr(self, 'day', None)
self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day
self.date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
self.baseURL = "http://desdeelbalcon.com/" + year + "/" + month.zfill(2) + "/" + self.day.zfill(2) + "/"
yield scrapy.Request(url=self.baseURL, callback=self.parse) yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response): def parse(self, response):
print(response.url) for link in response.xpath('//article//h3/a/@href').extract():
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=link, callback=self.parse_item)
nextPage = response.xpath('//div[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract() if nextPage is not None:
if len(pagination) > 0: yield scrapy.Request(url=nextPage, callback=self.parsePage)
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
item = NoticiasItem()
for post in response.xpath('//ul[@class="archivepost"]/li'):
# item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = datetime(int(self.year),int(self.month),int(self.day),tzinfo=self.tz).isoformat('T')
item['topic'] = post.xpath('./p/a/text()').extract()
request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response): def parse_item(self, response):
text = '' item = DesdeelbalconItem()
item = response.meta['item'] item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//meta[@property="article:section"]/@content').extract_first()
item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first() paragraphs = response.xpath('//div[contains(@class,"entry-content")]/p/text()').extract()
text=""
for p in paragraphs:
p = p.replace("<br>", "\n")
text += remove_tags(p) + "\n"
for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
print(item["title"])
print (item['title']) yield(item)
yield item
# Automatically created by: scrapy startproject # Automatically created by: scrapy startproject
# #
# For more information about the [deploy] section see: # For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html # https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings] [settings]
default = desdeElBalcon.settings default = desdeElBalcon.settings
......
...@@ -11,8 +11,8 @@ BOT_NAME = 'diarioCoLatino' ...@@ -11,8 +11,8 @@ BOT_NAME = 'diarioCoLatino'
SPIDER_MODULES = ['diarioCoLatino.spiders'] SPIDER_MODULES = ['diarioCoLatino.spiders']
NEWSPIDER_MODULE = 'diarioCoLatino.spiders' NEWSPIDER_MODULE = 'diarioCoLatino.spiders'
FEED_EXPORT_ENCODING="utf-8"
FEED_EXPORT_ENCODING="utf-8"
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)' #USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'
......
...@@ -2,11 +2,10 @@ import scrapy ...@@ -2,11 +2,10 @@ import scrapy
from diarioCoLatino.items import DiariocolatinoItem from diarioCoLatino.items import DiariocolatinoItem
import re import re
#-------------------------------------------------------------------------------
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
#-----------------------------------
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
class NoticiasSpider(scrapy.Spider): class NoticiasSpider(scrapy.Spider):
name = 'noticias' name = 'noticias'
...@@ -20,28 +19,27 @@ class NoticiasSpider(scrapy.Spider): ...@@ -20,28 +19,27 @@ class NoticiasSpider(scrapy.Spider):
self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2) self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parseDate) yield scrapy.Request(url=self.baseURL, callback=self.parsePage)
#----------------------------------------------------------------------- #-----------------------------------------------------------------------
def parseDate(self, response): def parsePage(self, response):
print(response.url) print(response.url)
for page in response.css('span#tie-next-page').css("a::attr(href)").extract(): for link in response.xpath('//article/h2/a/@href').extract():
yield scrapy.Request(url=page, callback=self.parseDate)
for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parseItem) yield scrapy.Request(url=link, callback=self.parseItem)
nextPage = response.xpath('//span[@id="tie-next-page"]/a/@href').extract_first()
if nextPage is not None:
yield scrapy.Request(url=nextPage, callback=self.parsePage)
#----------------------------------------------------------------------- #-----------------------------------------------------------------------
def parseItem(self, response): def parseItem(self, response):
print(response.url)
item = DiariocolatinoItem() item = DiariocolatinoItem()
item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first() item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
item["title"] = response.xpath('//meta[@property="og:title"]/@content').extract_first().replace("- Diario Co Latino","") item["title"] = response.xpath('//meta[@property="og:title"]/@content').extract_first().replace("- Diario Co Latino","").strip()
item["topic"] = response.css("span.post-cats").css("a::text").extract_first().lower() item["topic"] = response.css("span.post-cats").css("a::text").extract_first().lower()
text="" text=""
for p in response.xpath('//div[@class="entry"]/p').extract(): for p in response.xpath('//div[@class="entry"]/p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n"
item["text"]=text item["text"]=text
item["url"]=response.url item["url"]=response.url
print(item['title'])
print(item) yield(item)
yield item
This diff is collapsed.
This diff is collapsed.
# -*- coding: utf-8 -*-
# Define here the models for your scraped items # Define here the models for your scraped items
# #
# See documentation in: # See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html # https://docs.scrapy.org/en/latest/topics/items.html
import scrapy import scrapy
class NoticiasItem(scrapy.Item): class DiariopuntualItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
date = scrapy.Field()
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
......
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware # Define here the models for your spider middleware
# #
# See documentation in: # See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class DiariopuntualSpiderMiddleware(object): class DiariopuntualSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the # scrapy acts as if the spider middleware does not modify the
# passed objects. # passed objects.
...@@ -31,7 +32,7 @@ class DiariopuntualSpiderMiddleware(object): ...@@ -31,7 +32,7 @@ class DiariopuntualSpiderMiddleware(object):
# Called with the results returned from the Spider, after # Called with the results returned from the Spider, after
# it has processed the response. # it has processed the response.
# Must return an iterable of Request, dict or Item objects. # Must return an iterable of Request, or item objects.
for i in result: for i in result:
yield i yield i
...@@ -39,8 +40,7 @@ class DiariopuntualSpiderMiddleware(object): ...@@ -39,8 +40,7 @@ class DiariopuntualSpiderMiddleware(object):
# Called when a spider or process_spider_input() method # Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception. # (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict # Should return either None or an iterable of Request or item objects.
# or Item objects.
pass pass
def process_start_requests(self, start_requests, spider): def process_start_requests(self, start_requests, spider):
...@@ -56,7 +56,7 @@ class DiariopuntualSpiderMiddleware(object): ...@@ -56,7 +56,7 @@ class DiariopuntualSpiderMiddleware(object):
spider.logger.info('Spider opened: %s' % spider.name) spider.logger.info('Spider opened: %s' % spider.name)
class DiariopuntualDownloaderMiddleware(object): class DiariopuntualDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined, # Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the # scrapy acts as if the downloader middleware does not modify the
# passed objects. # passed objects.
......
# -*- coding: utf-8 -*-
# Define your item pipelines here # Define your item pipelines here
# #
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name # useful for handling different item types with a single interface
return cls(filename) from itemadapter import ItemAdapter
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class DiariopuntualPipeline:
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment