Commit 95002f31 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 168936ef
[{"nombre": "El Financiero", "crawler": "descarga_por_rss/elFinanciero", "url": "http://www.elfinanciero.com.mx/"},
{"nombre": "El Universal", "crawler": "descarga_por_rss/elUniversal", "url": "http://www.eluniversal.com.mx/"},
{"nombre": "El Sol de Mexico", "crawler": "descarga_por_rss/solDeMex", "url": "https://www.elsoldemexico.com.mx"}]
\ No newline at end of file
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Batch driver (Python 2) that walks a JSON list of news sites, runs the
# matching scrapy crawler once per pending calendar day, and merges each
# day's fresh download into the per-site/per-year JSON archive under baseDir.
#
# Usage: python <this_script> sites.json
#   where sites.json is a list of objects with "nombre", "crawler" and "url".
#
# NOTE(review): indentation was reconstructed from the surrounding logic;
# verify nesting against the original file before relying on this layout.
import sys
import json
import os
import datetime
from collections import OrderedDict

# Crawling reference date: every site starts from "today".
today = datetime.datetime.now()
# Root of the per-site news archive (one folder per media, then per year).
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# Root of the checked-out scrapy projects, one sub-folder per crawler.
scrapyDir = "/home/geoint/crawlersNoticias/"
row = {}
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
    # desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
    desde = today  # start date; the per-site 'desde' field is currently ignored
    print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
    # Media folder name = crawler path without the "descarga_por_rss/" prefix.
    media = s['crawler'][s['crawler'].find("/") + 1:]
    try:
        os.makedirs(media)
    except:
        # Directory already exists (or could not be created) -- keep going.
        print "ok"
    os.chdir(media)
    # Locate the most recent year folder, deleting any stray *.json files
    # left at the year level until a real year directory is found.
    CORRECT_YEAR = False
    while not CORRECT_YEAR:
        lstYears = os.listdir(".")
        lstYears.sort()
        year = desde.year
        if len(lstYears) > 0:
            element = lstYears[len(lstYears) - 1]
            if element[-4:] == 'json':
                # Leftover file from a previous run; remove it and re-scan.
                os.system('rm ' + element)
            else:
                CORRECT_YEAR = True
                year = int(element)
        # NOTE(review): if the media directory is empty, CORRECT_YEAR is never
        # set and this loop appears to spin forever -- confirm intended input.
    for y in range(year, today.year + 1):
        print y
        try:
            os.makedirs(str(y))
        except:
            print "ok"
        os.chdir(str(y))
        # print os.getcwd()
        lstDays = os.listdir(".")
        lstDays = [l for l in lstDays if not l.startswith('.')]
        lstDays.sort()
        print lstDays
        # Default: resume from the day-of-year of the start date.
        day = desde.timetuple().tm_yday
        print day
        currentDate = desde
        if len(lstDays) > 0:
            # Resume from the newest YYYY-MM-DD.json already on disk.
            strDate = lstDays[len(lstDays) - 1]
            strDate = strDate[:strDate.find(".")]
            currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
            day = currentDate.timetuple().tm_yday
        elif y != desde.year:
            # Fresh folder for a non-start year: begin at January 1st.
            currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
            day = 1
        # Iterate day-of-year: up to Dec 31 for past years, up to today for
        # the current year.
        for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
            YESTERDAY = False
            filename = currentDate.strftime('%Y-%m-%d') + ".json"
            scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename
            mydir = os.getcwd()
            print mydir
            # Run the crawler from inside its scrapy project folder.
            os.chdir(scrapyDir + s['crawler'])
            print media
            print scrapycommand
            os.system(scrapycommand)
            fileSize = os.stat(filename).st_size
            if fileSize <= 3:
                # Output is "[]" or empty: nothing scraped for this date.
                os.system('rm ' + filename)
            else:
                f1 = mydir + '/' + filename  # archived copy (master)
                f2 = filename  # fresh download in the crawler dir (slave)
                f3 = baseDir + media + '/' + filename  # merged output
                try:
                    with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
                        master = json.load(infile1)
                        slave = json.load(infile2)
                        # URLs already archived, used to skip duplicates.
                        urlSet = set([line['url'] for line in master])
                        counter = 0
                        infile3.write('[')
                        # Pass 1: copy archived articles dated on currentDate.
                        for line in master:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
                            if lineDate == currentDate:
                                counter += 1
                                if media == 'elFinanciero' or media == 'solDeMex':
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                elif media == 'elUniversal':
                                    # elUniversal articles also carry a location field.
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('location', line['location']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                if counter == 1:
                                    infile3.write(json.dumps(row))
                                elif counter > 1:
                                    infile3.write(',\n' + json.dumps(row))
                        # Pass 2: append freshly scraped articles not yet archived.
                        for line in slave:
                            if not line['url'] in urlSet:
                                lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
                                if lineDate == currentDate:
                                    if media == 'elFinanciero' or media == 'solDeMex':
                                        row = OrderedDict([
                                            ('date', line['date']),
                                            ('topic', line['topic']),
                                            ('title', line['title']),
                                            ('author', line['author']),
                                            ('url', line['url']),
                                            ('text', line['text'])
                                        ])
                                    elif media == 'elUniversal':
                                        row = OrderedDict([
                                            ('date', line['date']),
                                            ('topic', line['topic']),
                                            ('title', line['title']),
                                            ('author', line['author']),
                                            ('location', line['location']),
                                            ('url', line['url']),
                                            ('text', line['text'])
                                        ])
                                    infile3.write(',\n' + json.dumps(row))
                                elif (currentDate - lineDate).days == 1:
                                    # The crawl also returned yesterday's news;
                                    # trigger the merge-back pass below.
                                    YESTERDAY = True
                        infile3.write(']')
                    os.system("mv " + f3 + " " + mydir)
                    # os.system("rm " + f2)
                except:
                    # No archived file yet (or merge failed): keep the raw download.
                    os.system("cp " + f2 + " " + mydir)
                if YESTERDAY:
                    # Re-merge yesterday's archive with the fresh download,
                    # since the crawl returned articles dated one day back.
                    currentDate -= datetime.timedelta(days=1)
                    filenameYesterday = currentDate.strftime('%Y-%m-%d') + ".json"
                    f1 = mydir + '/' + filenameYesterday
                    f2 = filename
                    f3 = baseDir + media + '/' + filenameYesterday
                    with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
                        master = json.load(infile1)
                        slave = json.load(infile2)
                        urlSet = set([line['url'] for line in master])
                        counter = 0
                        infile3.write('[')
                        # Copy every archived article for yesterday (no date filter here).
                        for line in master:
                            counter += 1
                            if media == 'elFinanciero' or media == 'solDeMex':
                                row = OrderedDict([
                                    ('date', line['date']),
                                    ('topic', line['topic']),
                                    ('title', line['title']),
                                    ('author', line['author']),
                                    ('url', line['url']),
                                    ('text', line['text'])
                                ])
                            elif media == 'elUniversal':
                                row = OrderedDict([
                                    ('date', line['date']),
                                    ('topic', line['topic']),
                                    ('title', line['title']),
                                    ('author', line['author']),
                                    ('location', line['location']),
                                    ('url', line['url']),
                                    ('text', line['text'])
                                ])
                            if counter == 1:
                                infile3.write(json.dumps(row))
                            elif counter > 1:
                                infile3.write(',\n' + json.dumps(row))
                        # Append new articles from the download dated yesterday.
                        for line in slave:
                            lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
                            if not line['url'] in urlSet and lineDate == currentDate:
                                if media == 'elFinanciero' or media == 'solDeMex':
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                elif media == 'elUniversal':
                                    row = OrderedDict([
                                        ('date', line['date']),
                                        ('topic', line['topic']),
                                        ('title', line['title']),
                                        ('author', line['author']),
                                        ('location', line['location']),
                                        ('url', line['url']),
                                        ('text', line['text'])
                                    ])
                                infile3.write(',\n' + json.dumps(row))
                        infile3.write(']')
                    os.system("mv " + f3 + " " + mydir)
                os.system("rm " + f2)
            os.chdir(mydir)
            # Advance the date; +2 compensates the -1 taken by the merge-back.
            if YESTERDAY:
                currentDate += datetime.timedelta(days=2)
            else:
                currentDate += datetime.timedelta(days=1)
        os.chdir("..")
    os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # ejecucion del crawler correspondiente segun el sitio
\ No newline at end of file
...@@ -18,22 +18,25 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders' ...@@ -18,22 +18,25 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent # Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan (+http://www.yourdomain.com)' #USER_AGENT = 'diarioYucatan (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS=32 #CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
# DOWNLOAD_DELAY=3 #DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP=16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
COOKIES_ENABLED=False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False #TELNETCONSOLE_ENABLED = False
# Override the default request headers: # Override the default request headers:
#DEFAULT_REQUEST_HEADERS = { #DEFAULT_REQUEST_HEADERS = {
...@@ -44,7 +47,7 @@ COOKIES_ENABLED=False ...@@ -44,7 +47,7 @@ COOKIES_ENABLED=False
# Enable or disable spider middlewares # Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = { #SPIDER_MIDDLEWARES = {
# 'diarioYucatan.middlewares.MyCustomSpiderMiddleware': 543, # 'diarioYucatan.middlewares.DiarioyucatanSpiderMiddleware': 543,
#} #}
# Enable or disable downloader middlewares # Enable or disable downloader middlewares
...@@ -56,30 +59,32 @@ COOKIES_ENABLED=False ...@@ -56,30 +59,32 @@ COOKIES_ENABLED=False
# Enable or disable extensions # Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = { #EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None, # 'scrapy.extensions.telnet.TelnetConsole': None,
#} #}
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { #ITEM_PIPELINES = {
# 'diarioYucatan.pipelines.SomePipeline': 300, # 'diarioYucatan.pipelines.DiarioyucatanPipeline': 300,
#} #}
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay #AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_ENABLED=True
# The initial download delay # The initial download delay
#AUTOTHROTTLE_START_DELAY=5 #AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies # The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY=60 #AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received: # Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG=False #AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default) # Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED=True #HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS=0 #HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR='httpcache' #HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES=[] #HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import scrapy, re import scrapy, re
from datetime import datetime, date, timedelta
from scrapy.spidermiddlewares.httperror import HttpError # from datetime import datetime, date, timedelta
# from scrapy.spidermiddlewares.httperror import HttpError
""" """
Esta version descarga ingresando una fecha. Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
de una fecha especifica.
USO: USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=09 -a day=13 scrapy crawl noticias -t json --nolog -o noticias.json
No es recomendable para fechas de mas de un mes de antiguas.
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
...@@ -30,81 +32,58 @@ class QuotesSpider(scrapy.Spider): ...@@ -30,81 +32,58 @@ class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes', section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes', 'mexico/quintana-roo',
'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud'] 'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud', 'editorial', 'multimedia']
# section_list = ['yucatan', 'salud', 'editorial', 'imagen',
year = getattr(self, 'year', None) # 'merida', 'merida/clima', 'merida/gobierno', 'merida/policia', 'merida/politica',
month = getattr(self, 'month', None) # 'mexico', 'mexico/quintana-roo', 'mexico/cdmx', 'mexico/economia', 'mexico/campeche',
day = getattr(self, 'day', None) # 'internacional', 'internacional/asia', 'internacional/europa', 'internacional/africa',
self.baseURL='http://yucatan.com.mx/seccion/' # 'internacional/america', 'internacional/oceania',
self.date = date(int(year), int(month), int(day)) # 'deportes', 'deportes/futbol', 'deportes/nfl',
self.parsing_month = { 'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7, # 'espectaculos', 'espectaculos/cine', 'espectaculos/farandula', 'espectaculos/musica',
'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12 } # 'espectaculos/tv-y-series',
# 'tecnologia', 'tecnologia/redes-sociales', 'tecnologia/innovaciones',
self.pages = 100 # 'multimedia', 'multimedia/fotos', 'multimedia/videos']
for s in section_list:
yield scrapy.Request(url=self.baseURL+s, callback=self.parse) self.globalLinkSet = set()
self.baseURL = 'http://www.yucatan.com.mx/seccion/'
self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
for s in section_list:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response): def parse(self, response):
if ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) > 0 ): pathList = ['//*[@class="g1-column"]/div/div/article',
for link in response.xpath('//a[@class="show-more-link"]/@href').extract(): '//*[@class="g1-collection g1-collection-columns-2"]/div/ul/li/article']
yield scrapy.Request(url=link, callback=self.parse_pagination)
elif ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)
for path in pathList:
for entry in response.xpath(path):
link = entry.css('h3').css('a::attr(href)').extract_first()
def parse_pagination(self, response): if not link in self.globalLinkSet:
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract() self.globalLinkSet.add(link)
if ( len(pagination) > 0 ): item = NoticiasItem()
p = 1
while p <= self.pages:
if ( p == 1 ):
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
elif ( p > 1 ):
yield scrapy.Request(url=response.url+'/page/'+str(p+1), callback=self.parse_link)
p += 1
else:
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
def parse_link(self, response):
for entry in response.xpath('//*[@class="bp-entry"]'):
entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]
news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:entry_date.find(' ')])) item['topic'] = remove_tags(response.xpath('//h2[@class="g1-delta g1-delta-2nd resaltartitulo"]').extract_first())
link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first() request = scrapy.Request(url=link, callback=self.parse_item)
if news_date == self.date and link is not None: request.meta['item'] = item
yield scrapy.Request(url=link, callback=self.parse_item) yield request
def parse_item(self, response): def parse_item(self, response):
item = response.meta['item']
text = '' text = ''
item = NoticiasItem()
item['title'] = response.css('h1.entry-title::text').extract_first()
d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first() item['date'] = response.xpath('//time[@class="entry-date"]/@datetime').extract_first() + "-06:00"
if d is None: item['title'] = remove_tags(response.xpath('//h1[@class="g1-mega g1-mega-1st entry-title"]').extract_first())
d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
if d is None:
d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico) for p in response.xpath('//*[@itemprop="articleBody"]/p').extract():
if d[-6:] != '-06:00': text += remove_tags(p) + "\n"
d = d[:-6] + '-06:00'
item['date'] = d
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
......
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Diarioyucatan2Item(scrapy.Item):
    """Scrapy item template for the diarioYucatan2 project.

    Empty placeholder generated by ``scrapy startproject``; the spider in
    this project declares its own ``NoticiasItem`` instead of using this.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class Diarioyucatan2SpiderMiddleware(object):
    """Default spider-middleware template for the diarioYucatan2 project.

    Scrapy invokes every hook below as an *instance* method, so each hook
    must take ``self`` as its first parameter.  The generated template was
    missing ``self`` on the four ``process_*`` hooks, which would have made
    the instance bind to the first real argument (e.g. the response) and
    shifted all the remaining arguments.
    """
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class Diarioyucatan2Pipeline(object):
    """Pass-through item pipeline (template default; not enabled in settings)."""

    def process_item(self, item, spider):
        # No transformation or filtering: hand the item straight back so it
        # continues through the (empty) pipeline chain.
        return item
# -*- coding: utf-8 -*-

# Scrapy settings for diarioYucatan2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'diarioYucatan2'

SPIDER_MODULES = ['diarioYucatan2.spiders']
NEWSPIDER_MODULE = 'diarioYucatan2.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan2 (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# Cookies are off because the crawler only fetches public article pages.
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'diarioYucatan2.middlewares.Diarioyucatan2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'diarioYucatan2.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'diarioYucatan2.pipelines.Diarioyucatan2Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, re
# from datetime import datetime, date, timedelta
# from scrapy.spidermiddlewares.httperror import HttpError
"""
Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
de una fecha especifica.
USO:
scrapy crawl noticias -t json --nolog -o noticias.json
Genera un archivo JSON con todas las noticias disponibles. El archivo 'parse_date_file.py'
puede servir para clasificar dichas noticias en sus respectivas fechas.
"""
# Matches one HTML/XML tag, e.g. "<p>" or "</div>".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag removed, keeping inner text."""
    return re.sub(TAG_RE, '', text)
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    title = scrapy.Field()     # headline text
    text = scrapy.Field()      # article body with tags stripped
    date = scrapy.Field()      # publication date string
    location = scrapy.Field()  # dateline/place (not populated by this spider's parse_item)
    author = scrapy.Field()    # byline (not populated by this spider's parse_item)
    topic = scrapy.Field()     # section taken from the page breadcrumb
    url = scrapy.Field()       # source URL of the article
class QuotesSpider(scrapy.Spider):
    """Crawls yucatan.com.mx section listings and yields NoticiasItem objects.

    Flow: start_requests -> parse (find "show more" listing pages) ->
    parse_pagination (enumerate numbered pages) -> parse_link (collect
    article links) -> parse_item (extract one article).

    Fix: parse_pagination previously requested ``/page/'+str(p+1)`` inside a
    loop starting at p == 2, which skipped page 2 entirely and requested one
    page past the last -- it now requests ``/page/'+str(p)``.
    """
    name = "noticias"

    def start_requests(self):
        # Sections of the site to crawl; each one is a listing page.
        section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes',
                        'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud']
        self.baseURL = 'http://yucatan.com.mx/seccion/'
        # Spanish month name -> month number, used when parsing listing dates.
        self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
                              'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        for s in section_list:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

    def parse(self, response):
        """Follow every "show more" link, or paginate the section page itself."""
        more_links = response.xpath('//a[@class="show-more-link"]/@href').extract()
        if len(more_links) > 0:
            for link in more_links:
                yield scrapy.Request(url=link, callback=self.parse_pagination)
        else:
            # No sub-listings: paginate this very page.
            yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)

    def parse_pagination(self, response):
        """Request every numbered page of a listing (page 1 is response.url)."""
        pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
        if len(pagination) > 0:
            # Last pagination link ends in the total page count, e.g. ".../page/7".
            pagination = pagination[-1]
            pages = int(pagination[pagination.rfind('/') + 1:])
            for p in range(1, pages + 1):
                if p == 1:
                    yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
                else:
                    # Was str(p + 1): skipped page 2 and overshot the last page.
                    yield scrapy.Request(url=response.url + '/page/' + str(p), callback=self.parse_link)
        else:
            # Single-page listing.
            yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)

    def parse_link(self, response):
        """Yield a request per article entry found on a listing page."""
        for entry in response.xpath('//*[@class="bp-entry"]'):
            link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
            if link is not None:
                yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract one article page into a NoticiasItem."""
        text = ''
        item = NoticiasItem()
        item['title'] = response.css('h1.entry-title::text').extract_first()
        # Publication date: try the visible entry date, then metadata fallbacks.
        d = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
        if d is None:
            d = response.xpath('//meta[@itemprop="datePublished"]/@content').extract_first()
        if d is None:
            d = response.xpath('//time[@class="updated"]/@datetime').extract_first()
        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
        # NOTE(review): this assumes d always ends in a 6-char UTC offset; a
        # naive timestamp would lose its last 6 characters -- confirm inputs.
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        for paragraph in response.css('div.entry-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        # Second breadcrumb element is the section/topic.
        item['topic'] = response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]
        item['url'] = response.url
        # print item['title']
        yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = diarioYucatan2.settings
[deploy]
#url = http://localhost:6800/
project = diarioYucatan2
...@@ -639,7 +639,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -639,7 +639,8 @@ class QuotesSpider(scrapy.Spider):
# item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T') item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T')
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first()) title = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['title'] = " ".join(title.split())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first() item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
author = response.xpath('//*[@class="credito-autor"]/text()').extract_first() author = response.xpath('//*[@class="credito-autor"]/text()').extract_first()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment