Commit e51b25d1 authored by Renán Sosa Guillen

deprecated

parent bddd5804
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # article headline
    text = scrapy.Field()      # cleaned article body
    date = scrapy.Field()      # publication date (spider stores an ISO-8601 string)
    location = scrapy.Field()  # dateline / place, when present
    author = scrapy.Field()    # byline, when present
    topic = scrapy.Field()     # site section the article belongs to
    url = scrapy.Field()       # source URL of the article
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class LajornadaSpiderMiddleware(object):
    """Spider middleware for the laJornada project (stock Scrapy template).

    Not all methods need to be defined. If a method is not defined,
    Scrapy acts as if the spider middleware does not modify the
    passed objects.

    Fix: the processing hooks were declared without ``self``; Scrapy calls
    them as bound methods, which would raise ``TypeError`` at crawl time.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write scraped items into a single file containing one JSON array.

    The output filename comes from the ``filename`` setting passed on the
    command line, e.g.::

        scrapy crawl noticias --nolog -s filename=2017-03-22.json ...

    Fix: the original had seven copy-pasted bare ``try/except: pass`` blocks
    (one per field) that silently swallowed *every* exception; replaced with
    an explicit membership test over a fixed field order, which preserves
    the original behavior (absent fields are skipped) without masking
    unrelated errors.
    """

    # Output key order for each serialized item.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # "filename" is whatever value was passed through the command line.
        settings = crawler.settings
        # Instantiate the pipeline with the file name.
        return cls(settings.get('filename'))

    def open_spider(self, spider):
        # Open the output file and start the JSON array.
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        # Close the JSON array and the file.
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Keep only the fields present on this item, in FIELD_ORDER order.
        line = OrderedDict(
            (field, item[field]) for field in self.FIELD_ORDER if field in item
        )
        self.counter += 1
        # Separate records with ",\n" after the first one so the file stays
        # a valid JSON array once close_spider() appends the final "]".
        if self.counter == 1:
            self.file.write(json.dumps(line))
        else:
            self.file.write(",\n" + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for laJornada project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornada'
SPIDER_MODULES = ['laJornada.spiders']
NEWSPIDER_MODULE = 'laJornada.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Wait half a second between requests to the same site to crawl politely.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'laJornada.middlewares.LajornadaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'laJornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Route every scraped item through the JSON-array writer pipeline.
ITEM_PIPELINES = {
    'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
=======================================================================
THIS VERSION OF La Jornada IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
NO LONGER USES THIS URL: http://www.jornada.unam.mx/.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia FOLDER.
=======================================================================
"""
"""
MEDIO:
La Jornada, CDMX
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
# Matches a single HTML/XML tag, e.g. "<p>" or "</div>".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every markup tag stripped out."""
    return TAG_RE.sub('', text)
# re: r'(Twitter:\s+)?(@[\w.%+-]+.)?'
TW_RE = re.compile(r"""
(Twitter: # inicio de bloque, contiene la cadena 'Twitter:' (case insensitive)
\s+ # cualquier espacio (\t\n\r\f), una o mas ocurrencias
)? # fin de bloque, ninguna o una ocurrencia del bloque
(@ # inicio de bloque, contiene caracter '@'
[\w.%+-]+ # cualquier caracter alfanumerico mas los signos (.%+-), una o mas ocurrencias
. # cualquier caracter, excepto '\n'
)? # fin de bloque, ninguna o una ocurrencia del bloque
""", re.X|re.I) # banderas: verbose|case insensitive
# re: r'(Facebook|Vk):\s+[\w.%+-]+.'
FB_RE = re.compile(r"""
(Facebook|Vk) # bloque, contiene la cadena 'Facebook' o 'Vk' (case insensitive)
: # contiene el caracter ':'
\s+ # cualquier espacio (\t\n\r\f), una o mas ocurrencias
[\w.%+-]+ # cualquier caracter alfanumerico mas los signos (.%+-), una o mas ocurrencias
. # cualquier caracter, excepto '\n'
""", re.X|re.I) # banderas: verbose|case insensitive
# re: r'\(?(Foro:\s+)?(https?:\/\/)?([w{3}.])?[\w%+-]+(\.[a-zA-Z]{2,6}){1,2}[/\w.#$%&+-]*\)?.'
URL_RE = re.compile(r"""
\(? # contiene o no caracter '(', ninguna o una vez
(Foro: # inicio de bloque, contiene la cadena 'Foro:' (case insensitive)
\s+ # cualquier espacio (\t\n\r\f), una o mas ocurrencias
)? # fin de bloque, ninguna o una ocurrencia del bloque
(http # inicio de bloque, contiene cadena 'http'
s? # contiene o no caracter 's'
:\/\/ # contiene cadena '://'
)? # fin de bloque, ninguna o una ocurrencia del bloque
([w{3}.])? # el caracter 'w' tres veces y/o punto (www.), ninguna o una vez
[\w%+-]+ # cualquier caracter alfanumerico mas los signos (%+-), una o mas ocurrencias
(\. # inicio de bloque, contiene caracter '.'
[a-zA-Z]{2,6} # 2 a 6 letras, minusculas o mayusculas
){1,2} # fin de bloque, bloque se repite de 1 a 2 veces
[/\w.#$%&+-]* # seguido de '/', cualquier caracter alfanumerico mas los signos (.#$%&+-), cero o mas ocurrencias
\)? # contiene o no caracter ')', ninguna o una vez
. # cualquier caracter, excepto '\n'
""", re.X|re.I) # banderas: verbose|case insensitive
# re: r'[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?'
EMAIL_RE = re.compile(r"""
[\w.-]+ # cualquier caracter alfanumerico mas los signos (.-), una o mas repeticiones
@ # seguido de '@'
[\w-]+ # cualquier caracter alfanumerico mas el signo '-', una o mas repeticiones
(\. # inicio de bloque, contiene '.'
[a-zA-Z]{2,6} # 2 a 6 letras, minusculas o mayusculas
){1,2} # fin de bloque, bloque se repite de 1 a 2 veces
\s? # cualquier espacio (\t\n\r\f), ninguna o una coincidencia
""", re.X|re.I) # banderas: verbose|case insensitive
DIVP_RE = re.compile(r'(<div class="(credito-(autor|titulo)|hemero)">.*?<\/div>|<p class="s-s">.{,35}<\/p>|<span class="loc">.*?<\/span>)', re.S)
TRANSLATION_RE = re.compile(r'Traducci.n.*', re.I|re.S)
def clean_raw(rawText):
    """Strip boilerplate from raw article HTML.

    Removes "* * *" separators, credit/dateline/location markup (DIVP_RE)
    and trailing translation credits (TRANSLATION_RE).
    """
    cleaned = rawText.replace("* * *", '')
    for pattern in (DIVP_RE, TRANSLATION_RE):
        cleaned = pattern.sub('', cleaned)
    return cleaned
def text_cleaning(text):
    """Clean a news body for output.

    Collapses double/triple/wider runs of whitespace inside every line
    (line breaks themselves are preserved) and strips Twitter/Facebook
    handles, URLs, e-mail addresses and translation credits.
    """
    text = text.replace(u'\u0164', '')
    text = text.replace("Afp", '')
    # Split each line into words (ignoring any whitespace run) and rejoin
    # with single spaces, then restore the original line breaks.
    newText = "\n".join(" ".join(line.split()) for line in text.split("\n"))
    # Remove social-media, e-mail, URL and translation fragments, in the
    # same order as the original implementation.
    for pattern in (TW_RE, FB_RE, EMAIL_RE, URL_RE, TRANSLATION_RE):
        newText = pattern.sub('', newText)
    return newText
class UTC(tzinfo):
    """Fixed-offset time zone for central Mexico (UTC-6, no DST).

    Fix: ``tzinfo`` subclasses must implement ``dst()`` for
    ``datetime.dst()`` / ``astimezone()`` to work; the base-class method
    raises ``NotImplementedError``. Added ``dst()`` returning a zero
    offset, which is backward-compatible with all existing uses
    (``utcoffset`` / ``tzname`` / ``isoformat`` are unchanged).
    """

    def utcoffset(self, dt):
        # Time zone for central Mexico: UTC-6.
        return timedelta(hours=-6)

    def dst(self, dt):
        # Fixed offset: no daylight-saving adjustment is ever applied.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the time zone.
        return 'UTC-6'
class QuotesSpider(scrapy.Spider):
    """Spider for the old La Jornada site (http://www.jornada.unam.mx/).

    Crawls the news published on the date given by the ``year``/``month``/
    ``day`` command-line arguments. The site changed its page layout several
    times over the years, so start_requests() routes each section request to
    the parse callback matching the layout era of the requested date.

    NOTE(review): this module is Python 2 (see the ``print`` statements in
    parse_4); do not run under Python 3 without porting.
    """
    name = "noticias"

    def start_requests(self):
        # Build the date-based base URL and yield one request per section,
        # choosing callbacks by the layout era the requested date falls in.
        self.tz = UTC()
        self.counter = 0
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL='http://www.jornada.unam.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)+'/'
        # Layout-change boundary dates; the comparisons below select the
        # scraping strategy for the corresponding era of the site.
        self.comparison_date_1 = date(2001, 12, 7)
        self.comparison_date_2 = date(2002, 1, 8)
        self.comparison_date_3 = date(2003, 4, 25)
        self.comparison_date_4 = date(2004, 11, 16)
        self.comparison_date_5 = date(2004, 12, 12)
        self.comparison_date_6 = date(2005, 1, 31)
        self.comparison_date_7 = date(2009, 2, 15)
        self.date = date(int(year), int(month), int(day))
        # Spanish month name -> month number, used to parse on-page dates.
        self.parse_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                            'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                            'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        # self.section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
        #                      'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
        # for section in section_list:
        # for 2009/02/15 or earlier the page has a given structure;
        # for dates after that the structure changes
        #     if ( requested_date <= comparison_date_1 ):
        #         yield scrapy.Request(url=self.baseURL+section, callback=self.parse)
        #     else:
        #         yield scrapy.Request(url=self.baseURL+section, callback=self.parse_2)
        if self.date <= self.comparison_date_2:
            # Oldest era: static .html section pages.
            section_list = ['index.html', 'edito.html', 'opinion.html', 'politica.html',
                            'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
                            'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html']
            parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
                       'politica.html': 'Politica', 'economia.html': 'Economia',
                       'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
                       'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
                       'deportes.html': 'Deportes'}
            for s in section_list:
                item = NoticiasItem()
                # item['date'] = self.date
                item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
                item['topic'] = parse_s[s]
                # The editorial page is itself an article; others are indexes.
                if s == 'edito.html':
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
                else:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse)
                request.meta['item'] = item
                yield request
        elif self.date > self.comparison_date_2 and self.date <= self.comparison_date_3:
            # Transitional era: both .html and .php section pages exist.
            section_list = ['index.html', 'edito.html', 'opinion.html', 'politica.html',
                            'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
                            'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html',
                            'index.php', 'edito.php', 'opinion.php', 'politica.php',
                            'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
                            'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php']
            parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
                       'politica.html': 'Politica', 'economia.html': 'Economia',
                       'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
                       'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
                       'deportes.html': 'Deportes',
                       'index.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
                       'politica.php': 'Politica', 'economia.php': 'Economia',
                       'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
                       'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
                       'deportes.php': 'Deportes'}
            for s in section_list:
                item = NoticiasItem()
                # item['date'] = self.date
                item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
                item['topic'] = parse_s[s]
                if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
                else:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_2)
                request.meta['item'] = item
                yield request
        elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
            # PHP era: section pages are .php; the article-page callback
            # depends on a further sub-era boundary (comparison_date_5).
            section_list = ['indexfla.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
                            'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
                            'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php', 'index.php']
            parse_s = {'indexfla.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
                       'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
                       'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
                       'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
                       'deportes.php': 'Deportes','index.php': 'Portada'}
            for s in section_list:
                item = NoticiasItem()
                # item['date'] = self.date
                item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
                item['topic'] = parse_s[s]
                if s == 'edito.php' or s == 'correo.php':
                    if self.date > self.comparison_date_3 and self.date <= self.comparison_date_5:
                        request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
                    elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
                        request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item_2)
                else:
                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_3)
                request.meta['item'] = item
                yield request
        elif self.date > self.comparison_date_6:
            # print 'first filter'
            # Modern era: extension-less section names.
            section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
                            'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
            for s in section_list:
                # for dates earlier than 2009/02/15 and later than 2005/01/31 the page has a
                # given structure; for dates after that the structure changes
                if self.date <= self.comparison_date_7:
                    yield scrapy.Request(url=self.baseURL+s, callback=self.parse_5)
                elif self.date > self.comparison_date_7:
                    # print 'second filter in ' + self.baseURL + s
                    yield scrapy.Request(url=self.baseURL+s, callback=self.parse_6)

    def parse(self, response):
        """Section-index parser for dates <= 2002-01-08 (oldest layouts).

        Follows every .html link found under the table cell/XPath that
        matches the section's layout, passing the pre-filled item along.
        """
        item = response.meta['item']
        if self.date <= self.comparison_date_1:
            if item['topic'] == 'Portada':
                path = '//td[@rowspan="3"]'
            else:
                # Some pages use lowercase "center", some uppercase.
                if len(response.xpath('//td[@align="center"]').css('a::attr(href)').extract()) > 0:
                    path = '//td[@align="center"]'
                else:
                    path = '//td[@align="CENTER"]'
        elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_2:
            if item['topic'] == 'Portada':
                path = '//empieza'
            else:
                path = '//table[@bordercolor="#CCCCCC"]'
        for r in response.xpath(path).css('a::attr(href)').extract():
            if r[-5:] == '.html':
                request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
                request.meta['item'] = item
                yield request

    def parse_2(self, response):
        """Section-index parser for dates in (2002-01-08, 2003-04-25]."""
        item = response.meta['item']
        for r in response.xpath('//table[@bordercolor="#CCCCCC"]').css('a::attr(href)').extract():
            if r[-5:] == '.html':
                request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
                request.meta['item'] = item
                yield request

    def parse_3(self, response):
        """Section-index parser for dates in (2003-04-25, 2005-01-31].

        Collects candidate article links from several table layouts, then
        picks the article callback by the sub-era the date falls into.
        """
        item = response.meta['item']
        link_list = []
        link_list.extend(response.xpath('//td[@width="100%"]').css('a::attr(href)').extract())
        link_list.extend(response.xpath('//td[@width="52%"]').css('a::attr(href)').extract())
        link_list.extend(response.xpath('//td[@width="24%"]').css('a::attr(href)').extract())
        link_list.extend(response.xpath('//td[@width="646"]').css('a::attr(href)').extract())
        link_list.extend(response.xpath('//table[@width="100%"]').css('a::attr(href)').extract())
        for r in link_list:
            if r[-11:] == '.html&fly=1' or r[-9:] == '.php&fly=' or r[-4:] == '.php':
                if self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
                    if self.date <= self.comparison_date_4:
                        request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
                        request.meta['item'] = item
                        yield request
                    elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_6:
                        # In this sub-era the links are already absolute URLs.
                        if r[:4] == 'http' and r[-4:] == '.php':
                            this_url = r.replace('\n','')
                            if self.date <= self.comparison_date_5:
                                request = scrapy.Request(url=this_url, callback=self.parse_item)
                            elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
                                request = scrapy.Request(url=this_url, callback=self.parse_item_2)
                            request.meta['item'] = item
                            yield request
        # elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
        #     request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item_2)
        #     request.meta['item'] = item
        #     yield request

    def parse_4(self, response):
        """Debugging helper (not wired to any request): prints candidate
        article links from the 646-px table layout. Python 2 ``print``."""
        print response.url
        for r in response.xpath('//td[@width="646"]').css('a::attr(href)').extract():
            if r[-4:] == '.php':
                print r.replace('\n','')
                # request = scrapy.Request(url=r.replace('\n',''), callback=self.parse_item)
                # request.meta['item'] = item
                # yield request

    def parse_5(self, response):
        """Section-index parser for dates in (2005-01-31, 2009-02-15]."""
        if response.url[:response.url.rfind('/')+1] == self.baseURL: # verify the same base URL is kept
            section = response.url[response.url.rfind('/')+1:]
            if section == 'opinion': # the 'opinion' section has a different structure from the others
                path_list = ['//*[@id="columnas"]/p/a/@href',
                             '//*[@id="opinion"]/p/a/@href']
            else:
                path_list = ['//*[@id="article_list"]/h2/a/@href',
                             '//*[@id="article_list"]/h3/a/@href']
            for path in path_list:
                for link in response.xpath(path).extract():
                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_3)

    def parse_6(self, response):
        """Section-index parser for dates > 2009-02-15; de-duplicates links."""
        if response.url[:response.url.rfind('/')+1] == self.baseURL:
            # linkSet = set()
            # path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
            #              '//*[@class="item"]/div/a/@href']
            #
            # for path in path_list:
            #     for link in response.xpath(path).extract():
            #         if link not in linkSet:
            #             linkSet.add(link)
            #             yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
            linkSet = set()
            linkLst = []
            linkLst.extend(response.xpath('//*[@class="itemfirst"]/div/a/@href').extract())
            linkLst.extend(response.xpath('//*[@class="item start"]/div/a/@href').extract())
            linkLst.extend(response.xpath('//*[@class="item"]/div/a/@href').extract())
            for l in linkLst:
                link = self.baseURL + l
                if not link in linkSet:
                    linkSet.add(link)
                    yield scrapy.Request(url=link, callback=self.parse_item_4)

    def parse_item(self, response):
        """Article parser for DATES <= 2004-12-12.

        Locates the headline by probing successively smaller font sizes,
        then extracts the body with era-specific selectors.
        """
        item = response.meta['item']
        flag = True
        text = ''
        # Probe for the headline: try each font size/XPath in turn, falling
        # through to the next candidate on any failure (missing node).
        try:
            title = remove_tags(response.xpath('//font[@size="5"]').extract_first())
            item['title'] = title
        except:
            try:
                title = remove_tags(response.xpath('//p/font[@size="5"]').extract_first())
                item['title'] = title
            except:
                try:
                    title = remove_tags(response.xpath('//p/font[@size="5"]').extract()[1])
                    item['title'] = title
                except:
                    try:
                        title = remove_tags(response.xpath('//font[@size="4"]').extract_first())
                        item['title'] = title
                    except:
                        try:
                            title = remove_tags(response.xpath('//p/font[@size="4"]').extract_first())
                            item['title'] = title
                        except:
                            try:
                                title = remove_tags(response.xpath('//p/font[@size="4"][1]').extract()[1])
                                item['title'] = title
                            except:
                                try:
                                    title = remove_tags(response.xpath('//font[@size="3"]').extract_first())
                                    item['title'] = title
                                except:
                                    try:
                                        title = remove_tags(response.xpath('//p/font[@size="3"]').extract_first())
                                        item['title'] = title
                                    except:
                                        try:
                                            title = remove_tags(response.xpath('//p/font[@size="3"][1]').extract()[1])
                                            item['title'] = title
                                        except:
                                            try:
                                                title = remove_tags(response.xpath('//font[@size="+1"]').extract_first())
                                                item['title'] = title
                                            except:
                                                try:
                                                    title = remove_tags(response.xpath('//font[@size="+0"]').extract_first())
                                                    item['title'] = title
                                                except:
                                                    if self.date <= date(1999, 10, 3): # on this date there is a change from the others regarding the page's html
                                                        try:
                                                            title = remove_tags(response.xpath('//center').extract_first())
                                                            item['title'] = title
                                                            flag = False
                                                        except:
                                                            pass
                                                    else:
                                                        pass
        if flag:
            if self.date <= self.comparison_date_1:
                """
                FECHAS > 1999-10-03 Y FECHAS <= 2001-12-07
                """
                for p in response.css('p').extract():
                    # text += remove_tags(p).replace('\r','') ## no toma en cuenta los primeros indices donde esta el titulo
                    # text = text.replace('\t','')
                    p = clean_raw(p)
                    newsText = remove_tags(p)
                    text += text_cleaning(newsText)
                # Drop everything up to (and including) the headline match.
                # NOTE(review): title is used as a regex pattern un-escaped;
                # metacharacters in a headline could break the search.
                m = re.search(title, text)
                if title[-1] == "?": text = text[m.end()+1:]
                else: text = text[m.end():]
                text = text.lstrip("\n")
                text = text.rstrip("\n")
            elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_3:
                """
                FECHAS > 2001-12-07 Y FECHAS <= 2003-04-25
                """
                for p in response.xpath('//table[@bordercolor="#CCCCCC"]').css('p').extract():
                    # text += remove_tags(p).replace('\r','')
                    # text = text.replace('\t','')
                    p = clean_raw(p)
                    newsText = remove_tags(p)
                    text += text_cleaning(newsText)
                m = re.search(title, text)
                if title[-1] == "?": text = text[m.end()+1:]
                else: text = text[m.end():]
                text = text.lstrip("\n")
                text = text.rstrip("\n")
            elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_4:
                """
                FECHAS > 2003-04-25 Y FECHAS <= 2004-11-16
                """
                p = response.css('p').extract()
                for i in range(0, len(p)):
                    # text += remove_tags(p[i]).replace('\r','')
                    # text = text.replace('\t','')
                    aux = clean_raw(p[i])
                    newsText = remove_tags(aux).lstrip("\n")
                    newsText = newsText.rstrip("\n")
                    text += text_cleaning(newsText)
            elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_5:
                """
                FECHAS > 2004-11-16 Y FECHAS <= 2004-12-12
                """
                # The first three paragraphs are boilerplate in this era;
                # fall back to all paragraphs if that leaves nothing.
                p = response.css('p').extract()
                for i in range(3, len(p)):
                    # text += remove_tags(p[i]).replace('\r','')
                    # text = text.replace('\t','')
                    aux = clean_raw(p[i])
                    newsText = remove_tags(aux).lstrip("\n")
                    newsText = newsText.rstrip("\n")
                    text += text_cleaning(newsText)
                if text == '':
                    for i in range(0, len(p)):
                        # text += remove_tags(p[i]).replace('\r','')
                        # text = text.replace('\t','')
                        aux = clean_raw(p[i])
                        newsText = remove_tags(aux).lstrip("\n")
                        newsText = newsText.rstrip("\n")
                        text += text_cleaning(newsText)
        else:
            """
            FECHAS <= 1999-10-03
            """
            # text = remove_tags(response.body)
            # text = text[len(title):]
            m = re.search(title, response.body)
            body = response.body[m.end():]
            body = clean_raw(body)
            newsText = remove_tags(body).lstrip("\n")
            newsText = newsText.rstrip("\n")
            text += text_cleaning(newsText)
        item['text'] = text
        item['url'] = response.url
        yield item

    def parse_item_2(self, response):
        """Article parser for DATES > 2004-12-12 AND <= 2005-01-31."""
        item = response.meta['item']
        text = ''
        # titleLst = []
        # titleLst.extend(response.xpath('//*[@id="contenido"]/h1/text()').extract())
        # titleLst.extend(response.xpath('//h1/text()').extract())
        titleSet = set()
        titleSet.add(response.xpath('//*[@id="contenido"]/h1').extract_first())
        titleSet.add(response.xpath('//h1').extract_first())
        for t in titleSet:
            if t is not None and t != '':
                title = remove_tags(t).replace('\r','')
                title = title.replace('\t','')
                item['title'] = title
        # Paragraphs 0-3 are boilerplate; fall back to all paragraphs if
        # skipping them leaves an empty body.
        p = response.css('p').extract()
        for i in range(4, len(p)):
            # text += remove_tags(p[i]).replace('\r','')
            # text = text.replace('\t','')
            newsText = remove_tags(p[i]).lstrip("\n")
            newsText = newsText.rstrip("\n")
            text += text_cleaning(newsText)
        if text == '':
            for i in range(0, len(p)):
                # text += remove_tags(p[i]).replace('\r','')
                # text = text.replace('\t','')
                newsText = remove_tags(p[i]).lstrip("\n")
                newsText = newsText.rstrip("\n")
                text += text_cleaning(newsText)
        item['text'] = text
        item['url'] = response.url
        yield item

    def parse_item_3(self, response):
        """Article parser for DATES > 2005-01-31 AND <= 2009-02-15."""
        item = NoticiasItem()
        text = ''
        titleSet = set()
        # item['date'] = self.date
        item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
        # title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
        # if len(title) > 0:
        #     item['title'] = title[0]
        # else:
        #     item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
        titleSet.add(response.xpath('//*[@class="documentContent"]/h1[@class="title"]').extract_first())
        titleSet.add(response.xpath('//*[@class="documentContent"]/h1').extract_first())
        for t in titleSet:
            if t is not None and t != '':
                title = remove_tags(t).replace('\r','')
                title = title.replace('\t','')
                item['title'] = title
        item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract_first()
        for p in response.xpath('//*[@class="documentContent"]/p').extract():
            # text += remove_tags(p).replace('\r','')
            # text = text.replace('\t','')
            newsText = remove_tags(p).lstrip("\n")
            newsText = newsText.rstrip("\n")
            text += text_cleaning(newsText)
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item

    def parse_item_4(self, response):
        """Article parser for DATES > 2009-02-15.

        Reads the publication date off the page and only emits the item
        when it matches the requested crawl date.
        """
        d = response.xpath('//*[@class="main-fecha"]/text()').extract_first()
        d = d.replace('de', '').replace(' ', ' ').split(' ')
        newsDate = date(int(d[3]), self.parse_month[d[2].lower()], int(d[1]))
        if newsDate == self.date:
            item = NoticiasItem()
            text = ''
            # path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
            # path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
            textLst = []
            textLst.extend(response.xpath('//*[@class="col"]').extract())
            textLst.extend(response.xpath('//*[@class="col col1"]').extract())
            textLst.extend(response.xpath('//*[@class="col col2"]').extract())
            # item['date'] = self.date
            item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T')
            title = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
            item['title'] = " ".join(title.split())
            item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
            author = response.xpath('//*[@class="credito-autor"]/text()').extract_first()
            if author is None or author == '':
                author = response.xpath('//*[@class="credito-articulo"]/text()').extract_first()
            if author is not None and author != '':
                item['author'] = author
            # The short <p class="s-s"> paragraph carries the dateline.
            location = remove_tags(response.xpath('//p[@class="s-s"]').extract_first())
            if location is not None and location != '' and len(location) <= 35:
                item['location'] = location
            for p in textLst:
                # text += remove_tags(p).replace('\r', '')
                # text = text.replace('\t', '')
                p = clean_raw(p)
                # newsText = remove_tags(p).lstrip("\n")
                # newsText = newsText.rstrip("\n")
                # text += text_cleaning(newsText)
                text += remove_tags(p)
            text = text.lstrip("\n")
            text = text.rstrip("\n")
            text = text_cleaning(text)
            item['text'] = text
            item['url'] = response.url
            # print item['title']
            # print 'title: ' + item['title'] + '\nurl: ' + item['url'] + '\n'
            yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornada.settings
[deploy]
#url = http://localhost:6800/
project = laJornada
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article (tribunaHn project)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()     # article headline
    text = scrapy.Field()      # article body
    date = scrapy.Field()      # publication date
    location = scrapy.Field()  # dateline / place, when present
    author = scrapy.Field()    # byline, when present
    topic = scrapy.Field()     # site section the article belongs to
    url = scrapy.Field()       # source URL of the article
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunahnSpiderMiddleware(object):
    """Spider middleware for the tribunaHn project (stock Scrapy template).

    Not all methods need to be defined. If a method is not defined,
    Scrapy acts as if the spider middleware does not modify the
    passed objects.

    Fix: the processing hooks were declared without ``self``; Scrapy calls
    them as bound methods, which would raise ``TypeError`` at crawl time.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    """Write scraped items into a single JSON array file.

    The output filename comes from the ``filename`` Scrapy setting
    (supplied on the command line with ``-s filename=...``).

    IMPROVEMENT: the original ``process_item`` used seven copy-pasted
    bare ``try/except: pass`` blocks, which silently swallowed *every*
    exception.  A membership check over a fixed field order keeps the
    exact same output while making failures visible.
    """

    # Fields are serialized in this fixed order when present on the item.
    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename"
        # command line parameter; instantiate the pipeline with it.
        settings = crawler.settings
        filename = settings.get('filename')
        return cls(filename)

    def open_spider(self, spider):
        # counter tracks how many items were written, to place commas.
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Collect only the fields actually present, in FIELD_ORDER.
        row = [(key, item[key]) for key in self.FIELD_ORDER if key in item]
        line = OrderedDict(row)
        self.counter += 1
        if self.counter > 1:
            # Every item after the first is preceded by a comma separator.
            self.file.write(",\n")
        self.file.write(json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaHn'
SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): robots.txt compliance is left disabled (setting commented out).
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Half a second between requests to the same site.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# JsonWriterPipeline serializes every scraped item into the output JSON file.
ITEM_PIPELINES = {
    'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
===============================================================================
THIS VERSION OF La Tribuna Honduras IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
CHANGED ITS ACCESS TO ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia/foraneos FOLDER.
===============================================================================
"""
import scrapy, re, json
from datetime import date
from tribunaHn.items import NoticiasItem
"""
MEDIO:
La Tribuna, Honduras
USO:
// Si se quiere obtener todas las noticias desde las más actuales hasta las más antiguas. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// Si se quiere obtener todas las noticias desde las más actuales hasta una fecha específica. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=29
-------------------------------------------------------------------------------------------------
Después será necesario hacer uso del archivo parse_date_files.py para que las noticias contenidas
en noticias.json sean separadas en archivos por fecha.
"""
# Matches any HTML/XML tag so markup can be stripped from extracted fragments.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return *text* with every HTML/XML tag removed."""
    stripped = TAG_RE.sub('', text)
    return stripped

# Matches a yyyy/mm/dd fragment inside an article URL (used to date articles).
DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
class ImportantData(scrapy.Item):
    """Auxiliary item carrying a section name and page number.

    NOTE(review): not referenced elsewhere in this module's visible code;
    presumably intended for passing crawl state between requests — confirm.
    """
    section = scrapy.Field()  # site section name
    page = scrapy.Field()     # page number
class QuotesSpider(scrapy.Spider):
    """Spider for La Tribuna, Honduras (deprecated site layout).

    Crawls a fixed list of site sections and follows "next page" links.
    When ``year``, ``month`` and ``day`` arguments are supplied
    (``-a year=2018 -a month=2 -a day=28``), crawling of a section stops
    once an article older than that date is reached.

    BUG FIXES:
    * ``dat = map(int, ...)`` followed by ``dat[0]`` crashes on Python 3
      (``map`` returns an iterator); replaced with a list comprehension.
    * ``remove_tags(extract_first())`` raised ``TypeError`` when the
      headline xpath matched nothing; now guarded.
    * The bare ``except`` around the topic lookup is replaced with an
      explicit emptiness check.
    """
    name = "noticias"

    def start_requests(self):
        # Optional stop date passed on the command line with -a arguments.
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        if self.year is not None and self.month is not None and self.day is not None:
            self.stopDate = date(int(self.year), int(self.month), int(self.day))
        else:
            self.stopDate = None

        baseURL = "http://www.latribuna.hn/"
        sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
                       "ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
                       "pecadillos-idiomaticos", "pildoritas", "columnistas", "editorial", "tribuna-del-pueblo",
                       "anales-historicos", "cine", "dejando-huellas", "dia-7", "dominicales", "done-un-aula",
                       "especiales-lt", "la-cobra-pregunta", "la-tribuna-agropecuaria", "la-tribuna-cultural",
                       "nuestro-orgullo", "turismo"]
        for s in sectionList:
            yield scrapy.Request(url=baseURL + s, callback=self.parse)

    def parse(self, response):
        """Collect article links from a section page and follow pagination."""
        CONTINUE_SEARCHING = True
        linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
        linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())

        if self.stopDate is None:
            for link in linkList:
                yield scrapy.Request(url=link, callback=self.parse_item)
        else:
            for link in linkList:
                res = DAT_RE.search(link)
                if res:
                    # List comprehension instead of bare map(): on Python 3
                    # a map object cannot be indexed.
                    dat = [int(d) for d in res.group(0).split("/")]
                    newsDate = date(dat[0], dat[1], dat[2])
                    if newsDate >= self.stopDate:
                        yield scrapy.Request(url=link, callback=self.parse_item)
                    else:
                        # Sections list newest articles first (see module
                        # usage notes), so we can stop this section here.
                        CONTINUE_SEARCHING = False
                        break

        if CONTINUE_SEARCHING:
            nextPage = response.xpath('//span[@class="next"]/a/@href').extract_first()
            if nextPage is not None:
                yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_item(self, response):
        """Extract one article page into a NoticiasItem."""
        item = NoticiasItem()

        # The extracted date string already carries format and timezone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        # extract_first() returns None when nothing matches, and
        # remove_tags(None) would raise — guard the headline lookup.
        title = response.xpath('//header/h1[@itemprop="name"]').extract_first()
        item['title'] = remove_tags(title) if title is not None else None

        # First tag (if any) becomes the topic.
        topics = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()
        item['topic'] = topics[0] if topics else None

        text = ''
        for p in response.css('div.article-post-content').css('p').extract():
            text += remove_tags(p) + "\n"
        item['text'] = text.strip()
        item['url'] = response.url
        yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment