Commit 967de617 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

merge with dev

parents d8b390de 9fd0391f
......@@ -8,7 +8,7 @@
from scrapy import signals
class Lajornadabc2SpiderMiddleware(object):
class LajornadaSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
......
# -*- coding: utf-8 -*-
# Scrapy settings for tiempoDigital project
# Scrapy settings for laJornada project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
......@@ -9,14 +9,14 @@
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tiempoDigital'
BOT_NAME = 'laJornada'
SPIDER_MODULES = ['tiempoDigital.spiders']
NEWSPIDER_MODULE = 'tiempoDigital.spiders'
SPIDER_MODULES = ['laJornada.spiders']
NEWSPIDER_MODULE = 'laJornada.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tiempoDigital (+http://www.yourdomain.com)'
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
......@@ -27,7 +27,7 @@ NEWSPIDER_MODULE = 'tiempoDigital.spiders'
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.5
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -47,13 +47,13 @@ COOKIES_ENABLED = False
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tiempoDigital.middlewares.TiempodigitalSpiderMiddleware': 543,
# 'laJornada.middlewares.LajornadaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tiempoDigital.middlewares.MyCustomDownloaderMiddleware': 543,
# 'laJornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
......@@ -65,7 +65,7 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tiempoDigital.pipelines.JsonWriterPipeline': 300,
'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
......
# -*- coding: utf-8 -*-
"""
=======================================================================
THIS VERSION OF La Jornada IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
NO LONGER USES THIS URL: http://www.jornada.unam.mx/.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia FOLDER.
=======================================================================
"""
"""
MEDIA:
La Jornada, CDMX
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
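# For example, remove_tags('<p>Some <b>sample</b> text</p>') returns 'Some sample text'.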
# re: r'(Twitter:\s+)?(@[\w.%+-]+.)?'
TW_RE = re.compile(r"""
(Twitter: # start of group, contains the string 'Twitter:' (case insensitive)
\s+ # any whitespace character (\t\n\r\f), one or more occurrences
)? # end of group, zero or one occurrence of the group
(@ # start of group, contains the character '@'
[\w.%+-]+ # any alphanumeric character plus the signs (.%+-), one or more occurrences
. # any character except '\n'
)? # end of group, zero or one occurrence of the group
""", re.X|re.I) # flags: verbose|case insensitive
# re: r'(Facebook|Vk):\s+[\w.%+-]+.'
FB_RE = re.compile(r"""
(Facebook|Vk) # group, contains the string 'Facebook' or 'Vk' (case insensitive)
: # contains the character ':'
\s+ # any whitespace character (\t\n\r\f), one or more occurrences
[\w.%+-]+ # any alphanumeric character plus the signs (.%+-), one or more occurrences
. # any character except '\n'
""", re.X|re.I) # flags: verbose|case insensitive
# re: r'\(?(Foro:\s+)?(https?:\/\/)?([w{3}.])?[\w%+-]+(\.[a-zA-Z]{2,6}){1,2}[/\w.#$%&+-]*\)?.'
URL_RE = re.compile(r"""
\(? # the character '(', zero or one occurrence
(Foro: # start of group, contains the string 'Foro:' (case insensitive)
\s+ # any whitespace character (\t\n\r\f), one or more occurrences
)? # end of group, zero or one occurrence of the group
(http # start of group, contains the string 'http'
s? # the character 's', zero or one occurrence
:\/\/ # contains the string '://'
)? # end of group, zero or one occurrence of the group
([w{3}.])? # the character 'w' three times and/or a dot (www.), zero or one occurrence
[\w%+-]+ # any alphanumeric character plus the signs (%+-), one or more occurrences
(\. # start of group, contains the character '.'
[a-zA-Z]{2,6} # 2 to 6 letters, lowercase or uppercase
){1,2} # end of group, the group repeats 1 to 2 times
[/\w.#$%&+-]* # followed by '/', any alphanumeric character plus the signs (.#$%&+-), zero or more occurrences
\)? # the character ')', zero or one occurrence
. # any character except '\n'
""", re.X|re.I) # flags: verbose|case insensitive
# re: r'[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?'
EMAIL_RE = re.compile(r"""
[\w.-]+ # any alphanumeric character plus the signs (.-), one or more repetitions
@ # followed by '@'
[\w-]+ # any alphanumeric character plus the sign '-', one or more repetitions
(\. # start of group, contains '.'
[a-zA-Z]{2,6} # 2 to 6 letters, lowercase or uppercase
){1,2} # end of group, the group repeats 1 to 2 times
\s? # any whitespace character (\t\n\r\f), zero or one occurrence
""", re.X|re.I) # flags: verbose|case insensitive
DIVP_RE = re.compile(r'(<div class="(credito-(autor|titulo)|hemero)">.*?<\/div>|<p class="s-s">.{,35}<\/p>|<span class="loc">.*?<\/span>)', re.S)
TRANSLATION_RE = re.compile(r'Traducci.n.*', re.I|re.S)
def clean_raw(rawText):
text = rawText.replace("* * *", '')
text = DIVP_RE.sub('', text)
text = TRANSLATION_RE.sub('', text)
return text
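# Illustrative sketch of what clean_raw strips (the input below is invented): credit/byline
# divs, short location paragraphs, "* * *" separators and anything from 'Traduccion...' on
# are removed, e.g.
#   clean_raw('<p>Body of the note.</p><div class="credito-autor">Juan Perez</div>')
#   -> '<p>Body of the note.</p>'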
def text_cleaning(text):
"""
Function for cleaning news text
"""
"""
Removes unnecessary double, triple or longer runs of spaces inside the text. The text is first
split on line breaks, then each segment is split into words regardless of spacing. The words are
then re-joined into new segments with a single space between them, and the required line breaks
are added back.
"""
newText = ''
counter = 0
text = text.replace(u'\u0164', '')
text = text.replace("Afp", '')
for segment in text.split("\n"):
counter += 1
if counter == 1:
newText += " ".join(segment.split())
elif counter > 1:
newText += "\n" + " ".join(segment.split())
"""---------------------------------------------------------------------------------------------------"""
"""
Removes the Facebook, Twitter, forum and e-mail info from the text.
"""
newText = TW_RE.sub('', newText)
newText = FB_RE.sub('', newText)
newText = EMAIL_RE.sub('', newText)
newText = URL_RE.sub('', newText)
newText = TRANSLATION_RE.sub('', newText)
"""---------------------------------------------------------------------------------------------------"""
return newText
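# Minimal usage sketch (invented input): repeated spaces are collapsed per line and
# social-media / e-mail / URL leftovers are stripped, e.g.
#   text_cleaning(u"The   story  ends here.\nTwitter: @reporter")
# returns roughly u"The story ends here.\n".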
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para centro de mexico: utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
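# Illustrative use of this tzinfo, matching how item['date'] is built below:
#   datetime.combine(date(2017, 3, 22), time()).replace(tzinfo=UTC()).isoformat('T')
#   -> '2017-03-22T00:00:00-06:00'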
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.counter = 0
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.jornada.unam.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)+'/'
self.comparison_date_1 = date(2001, 12, 7)
self.comparison_date_2 = date(2002, 1, 8)
self.comparison_date_3 = date(2003, 4, 25)
self.comparison_date_4 = date(2004, 11, 16)
self.comparison_date_5 = date(2004, 12, 12)
self.comparison_date_6 = date(2005, 1, 31)
self.comparison_date_7 = date(2009, 2, 15)
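# The comparison dates above mark the points where jornada.unam.mx changed its page layout;
# they are used below to choose the section list and the parsing callback for the requested date.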
self.date = date(int(year), int(month), int(day))
self.parse_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
# self.section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
# 'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
# for section in section_list:
# for dates 2009/02/15 and earlier, the page has one specific structure
# for later dates the structure changes
# if ( requested_date <= comparison_date_1 ):
# yield scrapy.Request(url=self.baseURL+section, callback=self.parse)
# else:
# yield scrapy.Request(url=self.baseURL+section, callback=self.parse_2)
if self.date <= self.comparison_date_2:
section_list = ['index.html', 'edito.html', 'opinion.html', 'politica.html',
'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html']
parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
'politica.html': 'Politica', 'economia.html': 'Economia',
'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
'deportes.html': 'Deportes'}
for s in section_list:
item = NoticiasItem()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html':
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
else:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_2 and self.date <= self.comparison_date_3:
section_list = ['index.html', 'edito.html', 'opinion.html', 'politica.html',
'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html',
'index.php', 'edito.php', 'opinion.php', 'politica.php',
'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php']
parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
'politica.html': 'Politica', 'economia.html': 'Economia',
'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
'deportes.html': 'Deportes',
'index.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
'politica.php': 'Politica', 'economia.php': 'Economia',
'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
'deportes.php': 'Deportes'}
for s in section_list:
item = NoticiasItem()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
else:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_2)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
section_list = ['indexfla.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php', 'index.php']
parse_s = {'indexfla.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
'deportes.php': 'Deportes','index.php': 'Portada'}
for s in section_list:
item = NoticiasItem()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.php' or s == 'correo.php':
if self.date > self.comparison_date_3 and self.date <= self.comparison_date_5:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item_2)
else:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_3)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_6:
# print 'first filter'
section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
for s in section_list:
# for dates after 2005/01/31 and up to 2009/02/15, the page has one specific structure
# for later dates the structure changes
if self.date <= self.comparison_date_7:
yield scrapy.Request(url=self.baseURL+s, callback=self.parse_5)
elif self.date > self.comparison_date_7:
# print 'second filter in ' + self.baseURL + s
yield scrapy.Request(url=self.baseURL+s, callback=self.parse_6)
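# For the sample run in the docstring (year=2017, month=3, day=22), the requests above are
# built from baseURL = 'http://www.jornada.unam.mx/2017/03/22/' plus each section name.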
def parse(self, response):
item = response.meta['item']
if self.date <= self.comparison_date_1:
if item['topic'] == 'Portada':
path = '//td[@rowspan="3"]'
else:
if len(response.xpath('//td[@align="center"]').css('a::attr(href)').extract()) > 0:
path = '//td[@align="center"]'
else:
path = '//td[@align="CENTER"]'
elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_2:
if item['topic'] == 'Portada':
path = '//empieza'
else:
path = '//table[@bordercolor="#CCCCCC"]'
for r in response.xpath(path).css('a::attr(href)').extract():
if r[-5:] == '.html':
request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_2(self, response):
item = response.meta['item']
for r in response.xpath('//table[@bordercolor="#CCCCCC"]').css('a::attr(href)').extract():
if r[-5:] == '.html':
request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_3(self, response):
item = response.meta['item']
link_list = []
link_list.extend(response.xpath('//td[@width="100%"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//td[@width="52%"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//td[@width="24%"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//td[@width="646"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//table[@width="100%"]').css('a::attr(href)').extract())
for r in link_list:
if r[-11:] == '.html&fly=1' or r[-9:] == '.php&fly=' or r[-4:] == '.php':
if self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
if self.date <= self.comparison_date_4:
request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_6:
if r[:4] == 'http' and r[-4:] == '.php':
this_url = r.replace('\n','')
if self.date <= self.comparison_date_5:
request = scrapy.Request(url=this_url, callback=self.parse_item)
elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
request = scrapy.Request(url=this_url, callback=self.parse_item_2)
request.meta['item'] = item
yield request
# elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
# request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item_2)
# request.meta['item'] = item
# yield request
def parse_4(self, response):
print response.url
for r in response.xpath('//td[@width="646"]').css('a::attr(href)').extract():
if r[-4:] == '.php':
print r.replace('\n','')
# request = scrapy.Request(url=r.replace('\n',''), callback=self.parse_item)
# request.meta['item'] = item
# yield request
def parse_5(self, response):
if response.url[:response.url.rfind('/')+1] == self.baseURL: # checks that the same base URL is kept
section = response.url[response.url.rfind('/')+1:]
if section == 'opinion': # the 'opinion' section has a different structure from the other ones
path_list = ['//*[@id="columnas"]/p/a/@href',
'//*[@id="opinion"]/p/a/@href']
else:
path_list = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href']
for path in path_list:
for link in response.xpath(path).extract():
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_3)
def parse_6(self, response):
if response.url[:response.url.rfind('/')+1] == self.baseURL:
# linkSet = set()
# path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
# '//*[@class="item"]/div/a/@href']
#
# for path in path_list:
# for link in response.xpath(path).extract():
# if link not in linkSet:
# linkSet.add(link)
# yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
linkSet = set()
linkLst = []
linkLst.extend(response.xpath('//*[@class="itemfirst"]/div/a/@href').extract())
linkLst.extend(response.xpath('//*[@class="item start"]/div/a/@href').extract())
linkLst.extend(response.xpath('//*[@class="item"]/div/a/@href').extract())
for l in linkLst:
link = self.baseURL + l
if not link in linkSet:
linkSet.add(link)
yield scrapy.Request(url=link, callback=self.parse_item_4)
def parse_item(self, response):
"""
FECHAS <= 2004-12-12
"""
item = response.meta['item']
flag = True
text = ''
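# The title markup varies a lot across these old pages, so the extraction below falls back
# through several candidate nodes (font size "5", "4", "3", "+1", "+0" and, for the very
# oldest layout, //center) until one of them matches.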
try:
title = remove_tags(response.xpath('//font[@size="5"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="5"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="5"]').extract()[1])
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="4"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="4"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="4"][1]').extract()[1])
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="3"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="3"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="3"][1]').extract()[1])
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="+1"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="+0"]').extract_first())
item['title'] = title
except:
if self.date <= date(1999, 10, 3): # on this date the page HTML changes with respect to the later ones
try:
title = remove_tags(response.xpath('//center').extract_first())
item['title'] = title
flag = False
except:
pass
else:
pass
if flag:
if self.date <= self.comparison_date_1:
"""
FECHAS > 1999-10-03 Y FECHAS <= 2001-12-07
"""
for p in response.css('p').extract():
# text += remove_tags(p).replace('\r','') ## does not take into account the first indices, where the title is
# text = text.replace('\t','')
p = clean_raw(p)
newsText = remove_tags(p)
text += text_cleaning(newsText)
m = re.search(title, text)
if title[-1] == "?": text = text[m.end()+1:]
else: text = text[m.end():]
text = text.lstrip("\n")
text = text.rstrip("\n")
elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_3:
"""
FECHAS > 2001-12-07 Y FECHAS <= 2003-04-25
"""
for p in response.xpath('//table[@bordercolor="#CCCCCC"]').css('p').extract():
# text += remove_tags(p).replace('\r','')
# text = text.replace('\t','')
p = clean_raw(p)
newsText = remove_tags(p)
text += text_cleaning(newsText)
m = re.search(title, text)
if title[-1] == "?": text = text[m.end()+1:]
else: text = text[m.end():]
text = text.lstrip("\n")
text = text.rstrip("\n")
elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_4:
"""
FECHAS > 2003-04-25 Y FECHAS <= 2004-11-16
"""
p = response.css('p').extract()
for i in range(0, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
aux = clean_raw(p[i])
newsText = remove_tags(aux).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_5:
"""
FECHAS > 2004-11-16 Y FECHAS <= 2004-12-12
"""
p = response.css('p').extract()
for i in range(3, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
aux = clean_raw(p[i])
newsText = remove_tags(aux).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
if text == '':
for i in range(0, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
aux = clean_raw(p[i])
newsText = remove_tags(aux).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
else:
"""
FECHAS <= 1999-10-03
"""
# text = remove_tags(response.body)
# text = text[len(title):]
m = re.search(title, response.body)
body = response.body[m.end():]
body = clean_raw(body)
newsText = remove_tags(body).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
item['text'] = text
item['url'] = response.url
yield item
def parse_item_2(self, response):
"""
FECHAS > 2004-12-12 Y FECHAS <= 2005-01-31
"""
item = response.meta['item']
text = ''
# titleLst = []
# titleLst.extend(response.xpath('//*[@id="contenido"]/h1/text()').extract())
# titleLst.extend(response.xpath('//h1/text()').extract())
titleSet = set()
titleSet.add(response.xpath('//*[@id="contenido"]/h1').extract_first())
titleSet.add(response.xpath('//h1').extract_first())
for t in titleSet:
if t is not None and t != '':
title = remove_tags(t).replace('\r','')
title = title.replace('\t','')
item['title'] = title
p = response.css('p').extract()
for i in range(4, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
newsText = remove_tags(p[i]).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
if text == '':
for i in range(0, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
newsText = remove_tags(p[i]).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
item['text'] = text
item['url'] = response.url
yield item
def parse_item_3(self, response):
"""
FECHAS > 2005-01-31 Y FECHAS <= 2009-02-15
"""
item = NoticiasItem()
text = ''
titleSet = set()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
# title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
# if len(title) > 0:
# item['title'] = title[0]
# else:
# item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
titleSet.add(response.xpath('//*[@class="documentContent"]/h1[@class="title"]').extract_first())
titleSet.add(response.xpath('//*[@class="documentContent"]/h1').extract_first())
for t in titleSet:
if t is not None and t != '':
title = remove_tags(t).replace('\r','')
title = title.replace('\t','')
item['title'] = title
item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract_first()
for p in response.xpath('//*[@class="documentContent"]/p').extract():
# text += remove_tags(p).replace('\r','')
# text = text.replace('\t','')
newsText = remove_tags(p).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
def parse_item_4(self, response):
"""
FECHAS > 2009-02-15
"""
d = response.xpath('//*[@class="main-fecha"]/text()').extract_first()
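# The date string apparently has the form 'Domingo 15 de febrero de 2009' (weekday, day,
# month name, year); after dropping 'de' and splitting, index 1 is the day, index 2 the
# month name and index 3 the year.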
d = d.replace('de', '').replace('  ', ' ').split(' ')
newsDate = date(int(d[3]), self.parse_month[d[2].lower()], int(d[1]))
if newsDate == self.date:
item = NoticiasItem()
text = ''
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
# path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
textLst = []
textLst.extend(response.xpath('//*[@class="col"]').extract())
textLst.extend(response.xpath('//*[@class="col col1"]').extract())
textLst.extend(response.xpath('//*[@class="col col2"]').extract())
# item['date'] = self.date
item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T')
title = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['title'] = " ".join(title.split())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
author = response.xpath('//*[@class="credito-autor"]/text()').extract_first()
if author is None or author == '':
author = response.xpath('//*[@class="credito-articulo"]/text()').extract_first()
if author is not None and author != '':
item['author'] = author
location = remove_tags(response.xpath('//p[@class="s-s"]').extract_first())
if location is not None and location != '' and len(location) <= 35:
item['location'] = location
for p in textLst:
# text += remove_tags(p).replace('\r', '')
# text = text.replace('\t', '')
p = clean_raw(p)
# newsText = remove_tags(p).lstrip("\n")
# newsText = newsText.rstrip("\n")
# text += text_cleaning(newsText)
text += remove_tags(p)
text = text.lstrip("\n")
text = text.rstrip("\n")
text = text_cleaning(text)
item['text'] = text
item['url'] = response.url
# print item['title']
# print 'title: ' + item['title'] + '\nurl: ' + item['url'] + '\n'
yield item
......@@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornadaBC2.settings
default = laJornada.settings
[deploy]
#url = http://localhost:6800/
project = laJornadaBC2
project = laJornada
......@@ -4,7 +4,7 @@
===============================================================================
THIS VERSION OF La Tribuna Honduras IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
CHANGE ITS ACCESS TO ACCESS BY DAY.
CHANGED ITS ACCESS TO ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia/foraneos FOLDER.
===============================================================================
......
......@@ -5,13 +5,15 @@ MEDIA:
Diario de Chiapas, Chiapas
USAGE
$ cd diarioDeChiapas
$ cd diarioDeChiapas/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. The parse_date_files.py file must then be used
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
......
# -*- coding: utf-8 -*-
import scrapy, re, json, ast
from scrapy.selector import Selector
from datetime import datetime, date
from elSalvador.items import NoticiasItem
"""
MEDIA:
El Salvador
USAGE:
// To get all the news from the most recent to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// To get all the news from the most recent back to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=11
-------------------------------------------------------------------------------------------------
Afterwards, the parse_date_files.py file must be used so that the news contained in noticias.json
is split into files by date.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.') # dateline/location pattern, used in parse_item below
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
LAST_LINK = scrapy.Field()
page = scrapy.Field()
section_url = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
baseURL = "http://www.elsalvador.com/category/noticias/"
# sectionList = []
sectionList = ["internacional"]
if self.stopDate is None:
for s in sectionList:
info = ImportantData()
info['page'] = 1
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = info
yield request
else:
for s in sectionList:
info = ImportantData()
info['page'] = 0
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = info
yield request
# for s in sectionList:
# yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
def parse(self, response):
# print response.body
# print "----------------------------------------------------\n\n"
# print response.url
# searchData = response.meta['item']
# CONTINUE_SEARCHING = True
# linkSet = set()
# if searchData['page'] == 1:
# searchData['section_url'] = response.url
# linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
# linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
# linkSet = linkSet.union(set(response.xpath('//section[@id="principal"]/article/a/@href').extract()))
# linkSet = linkSet.union(set(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract()))
# linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
# linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
# linkSet.remove(searchData['section_url'])
#
# else:
# linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
# linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
# try:
# linkSet.remove(searchData['section_url'])
# except KeyError:
# pass
# if len(linkSet) <= 0:
# CONTINUE_SEARCHING = False
#
# for link in linkSet:
# yield scrapy.Request(url=link, callback=self.parse_item)
#
# if CONTINUE_SEARCHING:
# searchData['page'] += 1
# page = searchData['page']
# url = searchData['section_url']
# request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse)
# request.meta['item'] = searchData
# yield request
linkList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
linkList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
for link in linkList:
print link
# url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
# url_peticion = "/category/noticias/internacional/"
# frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': "4526", 'category_name': "Internacional", 'url_peticion': url_peticion}
#
# yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
# def after_post(self, response):
# searchData = response.meta['item']
# # from scrapy.shell import inspect_response
# # print "This is response: "
# unescaped = ast.literal_eval(response.body.strip())
# body = Selector(text=unescaped)
# # inspect_response(response, self)
# newsList = []
# linksObtained = body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract()
# for link in linksObtained:
# link = link.replace('\\', '')
# if not link in newsList:
# newsList.append(link)
#
# # print len(newsList) check the length of newsList to determine when to stop
# if len(newsList) > 0:
# for link in newsList:
# info = ImportantData()
# info['url'] = searchData['url']
# info['page'] = searchData['page']
# info['section_url'] = searchData['section_url']
# if link == linkList[-1]: info['LAST_LINK'] = True
# else: info['LAST_LINK'] = False
# reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
# reqst.meta['item'] = info
# yield reqst
def parse_with_stop_date(self, response):
searchData = response.meta['item']
CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
if not CONTINUE_SEARCHING:
if searchData['page'] == 0:
searchData['section_url'] = response.url
newsList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
# newsList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
else:
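# The wp-json "LoadMore" endpoint apparently returns the HTML fragment as an escaped,
# quoted string, so it is unquoted with ast.literal_eval and re-parsed with a Selector
# before the article links are extracted.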
unescaped = ast.literal_eval(response.body.strip())
body = Selector(text=unescaped)
newsList = []
for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
link = link.replace('\\', '')
if not link in newsList:
newsList.append(link)
if len(newsList) > 0:
for link in newsList:
info = ImportantData()
# info['url'] = response.url
info['page'] = searchData['page']
info['section_url'] = searchData['section_url']
if link == newsList[-1]: info['LAST_LINK'] = True
else: info['LAST_LINK'] = False
reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
reqst.meta['item'] = info
yield reqst
# if searchData['page'] == 1:
# searchData['section_url'] = response.url
# linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
# linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
# linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
# linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
# linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
# linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
# linkList.remove(searchData['section_url'])
#
# else:
# linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
# linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
# try:
# linkList.remove(searchData['section_url'])
# except KeyError:
# pass
#
# newsList = []
# for link in linkList:
# if not link in newsList:
# newsList.append(link)
#
# for link in newsList:
# info = ImportantData()
# info['url'] = response.url
# info['page'] = searchData['page']
# info['section_url'] = searchData['section_url']
# if link == linkList[-1]: info['LAST_LINK'] = True
# else: info['LAST_LINK'] = False
# reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
# reqst.meta['item'] = info
# yield reqst
else:
searchData['CONTINUE_SEARCHING'] = False
searchData['page'] += 1
page = str(searchData['page'])
url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
url_peticion = "/category/noticias/internacional/"
frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': page,
'category_name': "Internacional", 'url_peticion': url_peticion}
request = scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.parse_with_stop_date)
request.meta['item'] = searchData
yield request
# searchData['CONTINUE_SEARCHING'] = False
# searchData['page'] += 1
# page = searchData['page']
# url = searchData['section_url']
# request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
# request.meta['item'] = searchData
# yield request
def parse_item(self, response):
item = NoticiasItem()
d = response.xpath('//time/text()').extract_first()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
topic = newsData['articleSection']
except:
topic = None
item['topic'] = topic
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0') + 1:]
item['text'] = text
item['url'] = response.url
yield item
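# Illustrative shape of the ld+json payload consumed above (the field names come from the
# code; the example values are invented):
#   {"headline": "Some headline",
#    "articleSection": "Internacional",
#    "articleBody": u"SAN SALVADOR.\u00a0Body of the article..."}
# The text before the first non-breaking space (\u00a0) is treated as the dateline/location.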
def parse_item_with_stop_date(self, response):
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
if dt >= self.stopDate:
info = response.meta['item']
item = NoticiasItem()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
topic = newsData['articleSection']
except:
topic = None
item['topic'] = topic
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0')+1:]
item['text'] = text
item['url'] = response.url
yield item
if info['LAST_LINK']:
info['CONTINUE_SEARCHING'] = True
request = scrapy.Request(url=info['section_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = info
yield request
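# Crawl-flow note: when the last link of a listing page still falls within the stop date,
# LAST_LINK triggers a new request to the section URL with CONTINUE_SEARCHING set to True,
# which makes parse_with_stop_date ask the LoadMore endpoint for the next page.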
......@@ -3,7 +3,7 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
......
......@@ -3,7 +3,7 @@
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
......@@ -20,14 +20,14 @@ class PrensahnSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider):
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(response, result, spider):
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
......@@ -35,7 +35,7 @@ class PrensahnSpiderMiddleware(object):
for i in result:
yield i
def process_spider_exception(response, exception, spider):
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
......@@ -43,7 +43,7 @@ class PrensahnSpiderMiddleware(object):
# or Item objects.
pass
def process_start_requests(start_requests, spider):
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
......@@ -54,3 +54,50 @@ class PrensahnSpiderMiddleware(object):
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class PrensahnDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
......@@ -3,7 +3,7 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
......
......@@ -5,9 +5,9 @@
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'prensaHn'
......@@ -25,7 +25,7 @@ NEWSPIDER_MODULE = 'prensaHn.spiders'
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
......@@ -45,31 +45,31 @@ COOKIES_ENABLED = False
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'prensaHn.middlewares.PrensahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'prensaHn.middlewares.MyCustomDownloaderMiddleware': 543,
# 'prensaHn.middlewares.PrensahnDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'prensaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +82,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
# -*- coding: utf-8 -*-
"""
MEDIA:
La Prensa, Honduras
USAGE:
$ cd prensaHn/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. The parse_date_files.py file must then be used
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=7
"""
import scrapy, re, json
from datetime import datetime, date
from prensaHn.items import NoticiasItem
"""
MEDIA:
La Prensa, Honduras
USAGE:
// To get all the news from the most recent to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// To get all the news from the most recent back to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=7
-------------------------------------------------------------------------------------------------
Afterwards, the parse_date_files.py file must be used so that the news contained in noticias.json
is split into files by date.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
LAST_LINK = scrapy.Field()
page = scrapy.Field()
section_url = scrapy.Field()
url = scrapy.Field()
class ImportantFlowData(scrapy.Item):
"""
Helper data used to control the crawl flow:
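to_next_page -- whether parse_with_stop_date should move on to the next listing page
is_last_link -- True when the current link is the last one taken from its listing page
news_section -- slug of the section being crawled (e.g. 'economia')
section_url -- base URL of that section
return_url -- listing-page URL to return to once the last link has been processed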
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
news_section = scrapy.Field()
section_url = scrapy.Field()
return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
base_url = "http://www.laprensa.hn/"
section_list = ["economia", "mundo", "tecnologia", "cine", "cultura", "turismo",
"honduras", "sucesos", "espectaculos", "deportes"]
baseURL = "http://www.laprensa.hn/"
if year is not None and month is not None and day is not None:
self.stop_date = date(int(year), int(month), int(day))
for s in section_list:
flow_info = ImportantFlowData()
flow_info['to_next_page'] = False
flow_info['news_section'] = s
flow_info['section_url'] = base_url + s + "/"
request = scrapy.Request(url=base_url + s, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
sectionList = ["economia", "mundo", "tecnologia", "cine", "cultura", "turismo",
"honduras", "sucesos", "espectaculos", "deportes"]
# sectionList = ["economia"]
elif year is None and month is None and day is None:
for s in section_list:
flow_info = ImportantFlowData()
flow_info['news_section'] = s
flow_info['section_url'] = base_url + s + "/"
if self.stopDate is None:
for s in sectionList:
info = ImportantData()
info['page'] = 1
request = scrapy.Request(url=baseURL + s, callback=self.parse)
request.meta['item'] = info
request = scrapy.Request(url=base_url + s, callback=self.parse)
request.meta['item'] = flow_info
yield request
else:
for s in sectionList:
info = ImportantData()
info['page'] = 1
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
request.meta['item'] = info
yield request
print "Unable to execute this crawler with current given parameters."
print "Enter all parameters: year, month and day, or none of them."
def parse(self, response):
searchData = response.meta['item']
CONTINUE_SEARCHING = True
if searchData['page'] == 1:
searchData['section_url'] = response.url + "/"
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
def parse(self, response):
flow_info = response.meta['item']
news_section = flow_info['news_section']
section_url = flow_info['section_url']
link_list = []
if news_section == "deportes":
section = news_section.capitalize()
elif news_section == "espectaculos":
section = u'Expect\xc3\xa1culos'
else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
try:
linkSet.remove(searchData['section_url'])
except KeyError:
CONTINUE_SEARCHING = False
section = response.xpath('//div[@id="section_title"]/h1/a').extract_first()
if section is not None : section = remove_tags(section)
for link in linkSet:
for entry in response.css('article.grid').css('div.content'):
if section in entry.css('a.category').extract_first():
link_list.append(entry.xpath('./a[2]/@href').extract_first())
for link in link_list:
yield scrapy.Request(url=link, callback=self.parse_item)
if CONTINUE_SEARCHING:
searchData['page'] += 1
page = searchData['page']
url = searchData['section_url']
request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse)
request.meta['item'] = searchData
next_page = response.xpath('//ul[@id="paginacion"]').css('span.next > a::attr(href)').extract_first()
if next_page is not None:
flow_info = ImportantFlowData()
flow_info['news_section'] = news_section
flow_info['section_url'] = section_url
request = scrapy.Request(url=section_url + next_page, callback=self.parse)
request.meta['item'] = flow_info
yield request
def parse_with_stop_date(self, response):
searchData = response.meta['item']
CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
searchData['section_url'] = response.url + "/"
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
linkList = list(linkSet)
flow_info = response.meta['item']
news_section = flow_info['news_section']
section_url = flow_info['section_url']
if not flow_info['to_next_page']:
link_list = []
if news_section == "deportes":
section = news_section.capitalize()
elif news_section == "espectaculos":
section = u'Expect\xc3\xa1culos'
else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
try:
linkSet.remove(searchData['section_url'])
linkList = list(linkSet)
except KeyError:
linkList = []
for link in linkList:
info = ImportantData()
info['url'] = response.url
info['page'] = searchData['page']
info['section_url'] = searchData['section_url']
if link == linkList[-1]: info['LAST_LINK'] = True
else: info['LAST_LINK'] = False
reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
reqst.meta['item'] = info
yield reqst
section = response.xpath('//div[@id="section_title"]/h1/a').extract_first()
if section is not None : section = remove_tags(section)
for entry in response.css('article.grid').css('div.content'):
if section in entry.css('a.category').extract_first():
link_list.append(entry.xpath('./a[2]/@href').extract_first())
for link in link_list:
flow_info = ImportantFlowData()
flow_info['news_section'] = news_section
flow_info['section_url'] = section_url
flow_info['return_url'] = response.url
if link_list.index(link) == link_list.index(link_list[-1]):
flow_info['is_last_link'] = True
else:
flow_info['is_last_link'] = False
request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
searchData['CONTINUE_SEARCHING'] = False
searchData['page'] += 1
page = searchData['page']
url = searchData['section_url']
request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
request.meta['item'] = searchData
yield request
next_page = response.xpath('//ul[@id="paginacion"]').css('span.next > a::attr(href)').extract_first()
if next_page is not None:
flow_info = ImportantFlowData()
flow_info['to_next_page'] = False
flow_info['news_section'] = news_section
flow_info['section_url'] = section_url
request = scrapy.Request(url=section_url + next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
item = NoticiasItem()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['title'] = newsData['headline']
resp = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
news_data = json.loads(resp)
try:
topic = newsData['articleSection']
topic = news_data['articleSection']
except:
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
item['url'] = response.url
## News item info ##
item['date'] = news_data['datePublished'][:-1]
item['title'] = news_data['headline']
item['topic'] = topic
item['text'] = news_data['articleBody']
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
d = newsData['datePublished']
d = d[:d.find("T")]
dt = datetime.strptime(d, '%Y-%m-%d').date()
if dt >= self.stopDate:
info = response.meta['item']
resp = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
news_data = json.loads(resp)
news_date = news_data['datePublished']
news_date = news_date[:news_date.find("T")]
news_date = datetime.strptime(news_date, '%Y-%m-%d').date()
if news_date >= self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
item['date'] = newsData['datePublished'][:-1]
item['title'] = newsData['headline']
try:
topic = newsData['articleSection']
topic = news_data['articleSection']
except:
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
item['url'] = response.url
## News item info ##
item['date'] = news_data['datePublished'][:-1]
item['title'] = news_data['headline']
item['topic'] = topic
item['text'] = news_data['articleBody']
item['url'] = response.url
yield item
if info['LAST_LINK']:
info['CONTINUE_SEARCHING'] = True
request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = info
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = prensaHn.settings
......
......@@ -5,13 +5,15 @@ MEDIA:
El Heraldo de Chihuahua, Chihuahua
USAGE
$ cd heraldoChihuahua
$ cd heraldoChihuahua/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. The parse_date_files.py file must then be used
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
......
......@@ -3,12 +3,18 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LajornadabcItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -3,7 +3,7 @@
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
......@@ -20,14 +20,14 @@ class LajornadabcSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider):
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(response, result, spider):
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
......@@ -35,7 +35,7 @@ class LajornadabcSpiderMiddleware(object):
for i in result:
yield i
def process_spider_exception(response, exception, spider):
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
......@@ -43,7 +43,7 @@ class LajornadabcSpiderMiddleware(object):
# or Item objects.
pass
def process_start_requests(start_requests, spider):
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
......@@ -54,3 +54,50 @@ class LajornadabcSpiderMiddleware(object):
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class LajornadabcDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
......@@ -3,9 +3,73 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadabcPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
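# NOTE (illustrative usage, not part of this diff): the JsonWriterPipeline above is wired
# in through ITEM_PIPELINES and takes the output file name from the "filename" setting
# passed on the command line, as in the spiders' docstrings:
#
#   ITEM_PIPELINES = {'laJornadaBC.pipelines.JsonWriterPipeline': 300}
#   $ scrapy crawl noticias --nolog -s filename=noticias.json
#
# open_spider() writes "[", process_item() prepends ",\n" to every item after the first,
# and close_spider() writes the closing "]", so the output file is a single JSON array.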
......@@ -5,9 +5,9 @@
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornadaBC'
......@@ -19,21 +19,21 @@ NEWSPIDER_MODULE = 'laJornadaBC.spiders'
#USER_AGENT = 'laJornadaBC (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -45,31 +45,31 @@ ROBOTSTXT_OBEY = True
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornadaBC.middlewares.LajornadabcSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornadaBC.middlewares.MyCustomDownloaderMiddleware': 543,
# 'laJornadaBC.middlewares.LajornadabcDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaBC.pipelines.LajornadabcPipeline': 300,
#}
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'laJornadaBC.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +82,7 @@ ROBOTSTXT_OBEY = True
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
import scrapy, re
from datetime import datetime, date, timedelta
"""
This version downloads the news for a given date.
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=2 -a day=21
Not recommended for dates more than one month old.
"""
# -*- coding: utf-8 -*-
"""
MEDIA:
La Jornada Baja California, Baja California
USAGE:
$ cd laJornadaBC/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. It's necessary to use the parse_date_files.py file
for the news contained in noticias.json being split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2017-02-21.json -a year=2017 -a month=2 -a day=21
"""
import scrapy, re, json
from datetime import datetime, date
from laJornadaBC.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class ImportantFlowData(scrapy.Item):
"""
Useful data for the flow of the implementation
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
return_url = scrapy.Field()
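# NOTE (illustrative, not part of this diff): ImportantFlowData is the spider's way of
# carrying crawl-flow state between callbacks through request.meta. The pattern, with
# placeholder names (some_url / some_callback are hypothetical):
#
#   flow_info = ImportantFlowData()
#   flow_info['to_next_page'] = False
#   request = scrapy.Request(url=some_url, callback=some_callback)
#   request.meta['item'] = flow_info      # attach the state to the outgoing request
#   # ...and inside the callback:
#   flow_info = response.meta['item']     # recover it from the response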
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo',
'cultura', 'espectaculos', 'deportes']
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = 'http://jornadabc.mx/'
self.date = date(int(year), int(month), int(day))
self.pages = 100
for s in section_list:
yield scrapy.Request(url=self.baseURL+'seccion/'+s, callback=self.parse_pagination)
def parse_pagination(self, response):
pagination = response.xpath('//li[@class="pager-last odd last"]/a/@href').extract()
if len(pagination) > 0:
p = 1
while p <= self.pages:
if p == 1:
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
elif p > 1:
yield scrapy.Request(url=response.url+'?page='+str(p+1), callback=self.parse_link)
p += 1
else:
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
def parse_link(self, response):
section = response.url[response.url.rfind('/')+1:]
if section == 'espectaculos' or section == 'deportes':
path = '//*[@class="region region-soft-first"]'
else:
path = '//*[@class="region region-hard-first"]'
link_list = response.xpath(path).css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
if len(link_list) > 0:
for link in link_list:
news_date = datetime.strptime(link[:link.rfind('/')][link[:link.rfind('/')].rfind('/')+1:], '%d-%m-%Y').date()
if news_date == self.date:
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
## the news date already includes the time zone
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract()
if len(title) > 0:
item['title'] = title[0]
else:
item['title'] = response.xpath('//*[@class="block-content"]/h1/text()').extract_first()
item['topic'] = response.xpath('//span[@class="section"]/text()').extract_first()
for paragraph in response.xpath('//*[@class="field-item even"]/p/text()').extract():
text += paragraph
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.base_url = "http://jornadabc.mx"
section_list = ["baja-california", "chihuahua", "mexico", "mundo", "cultura", "espectaculos", "deportes"]
if year is not None and month is not None and day is not None:
self.stop_date = date(int(year), int(month), int(day))
for s in section_list:
flow_info = ImportantFlowData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=self.base_url + "/seccion/" + s, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
elif year is None and month is None and day is None:
for s in section_list:
yield scrapy.Request(url=self.base_url + "/seccion/" + s, callback=self.parse)
else:
print "Unable to execute this crawler with current given parameters."
print "Enter all parameters: year, month and day, or none of them."
def parse(self, response):
link_list = response.css('div.region-hard-first').css('div.views-field-title > span > a::attr(href)').extract()
for link in link_list:
yield scrapy.Request(url=self.base_url + link, callback=self.parse_item)
next_page = response.css('li.pager-next > a::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=self.base_url + next_page, callback=self.parse)
def parse_with_stop_date(self, response):
flow_info = response.meta['item']
if not flow_info['to_next_page']:
link_list = response.css('div.region-hard-first').css('div.views-field-title > span > a::attr(href)').extract()
for link in link_list:
flow_info = ImportantFlowData()
flow_info['return_url'] = response.url
if link_list.index(link) == link_list.index(link_list[-1]):
flow_info['is_last_link'] = True
else:
flow_info['is_last_link'] = False
request = scrapy.Request(url=self.base_url + link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
next_page = response.css('li.pager-next > a::attr(href)').extract_first()
if next_page is not None:
flow_info = ImportantFlowData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=self.base_url + next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
item = NoticiasItem()
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.css('div.block-content > h1 > a').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('span.section').extract_first()
if topic is not None : topic = remove_tags(topic)
text = response.css('div.region-article-second').css('div.block-content').css('div.rtejustify').extract_first()
if text is not None:
text = remove_tags(text)
elif text is None or text == '':
text = ''
for p in response.css('div.region-article-second').css('div.block-content').css('div.field-items').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
news_date = news_date[:news_date.find("T")]
news_date = datetime.strptime(news_date, '%Y-%m-%d').date()
if news_date >= self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.css('div.block-content > h1 > a').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('span.section').extract_first()
if topic is not None : topic = remove_tags(topic)
text = response.css('div.region-article-second').css('div.block-content').css('div.rtejustify').extract_first()
if text is not None:
text = remove_tags(text)
elif text is None or text == '':
text = ''
for p in response.css('div.region-article-second').css('div.block-content').css('div.field-items').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = laJornadaBC.settings
......
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class Lajornadabc2Item(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class Lajornadabc2Pipeline(object):
def process_item(self, item, spider):
return item
# -*- coding: utf-8 -*-
# Scrapy settings for laJornadaBC2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornadaBC2'
SPIDER_MODULES = ['laJornadaBC2.spiders']
NEWSPIDER_MODULE = 'laJornadaBC2.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornadaBC2 (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornadaBC2.middlewares.Lajornadabc2SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornadaBC2.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaBC2.pipelines.Lajornadabc2Pipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import scrapy, re
"""
This version downloads every news item available on the site, without needing
a specific date.
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json
It generates a JSON file with all the available news. The 'parse_date_file.py' file
can be used to sort those news items into their respective dates.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = 'noticias'
def start_requests(self):
self.baseURL = 'http://jornadabc.mx'
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo',
'cultura', 'espectaculos', 'deportes']
for section in section_list:
yield scrapy.Request(url=self.baseURL+'/seccion/'+section, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//ul[@class="pager"]/li[@class="pager-last odd last"]/a/@href').extract()
if ( len(pagination) > 0 ):
section = response.url[response.url.rfind('/')+1:]
pagination = pagination[0]
pages = int(pagination[pagination.rfind('=')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/seccion/'+section+'?page='+str(page), callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
section = response.url[response.url.rfind('/')+1:]
section = section[:section.rfind('?')]
if ( section == 'espectaculos' or section == 'deportes' ):
link_list = response.xpath('//*[@class="region region-soft-first"]').css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
else:
link_list = response.xpath('//*[@class="region region-hard-first"]').css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
if len(link_list) <= 0:
link_list = response.xpath('//*[@class="view-content"]').css('div.views-row').xpath('./*[@class="views-field views-field-title"]/span[@class="field-content"]/a/@href').extract()
for link in link_list:
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
## the news dates already include the time zone
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.xpath('//*[@class="block-content"]/h1/a/text()').extract()
if ( len(title) > 0 ):
item['title'] = title[0]
else:
item['title'] = response.xpath('//*[@class="block-content"]/h1/text()').extract_first()
item['topic'] = response.xpath('//span[@class="section"]/text()').extract_first()
for paragraph in response.xpath('//*[@class="field-item even"]/p/text()').extract():
text += paragraph
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
......@@ -67,21 +67,20 @@ class QuotesSpider(scrapy.Spider):
request.meta['item'] = flow_info
yield request
else:
if year is None and month is None and day is None:
self.stop_date = None
for s in section_list:
flow_info = ImportantFlowData()
flow_info['news_section'] = s
elif year is None and month is None and day is None:
self.stop_date = None
for s in section_list:
flow_info = ImportantFlowData()
flow_info['news_section'] = s
request = scrapy.Request(url=self.base_url + s, callback=self.parse)
request.meta['item'] = flow_info
yield request
request = scrapy.Request(url=self.base_url + s, callback=self.parse)
request.meta['item'] = flow_info
yield request
else:
print "Unable to execute this crawler with current given parameters."
print "Enter all parameters: year, month and day, or none of them."
else:
print "Unable to execute this crawler with current given parameters."
print "Enter all parameters: year, month and day, or none of them."
......
......@@ -5,13 +5,15 @@ MEDIA:
Tinta Fresca, Chiapas
USAGE
$ cd tintaFresca
$ cd tintaFresca/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. It's necessary to use the parse_date_files.py file
for the news contained in noticias.json being split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
......@@ -69,8 +71,8 @@ class QuotesSpider(scrapy.Spider):
baseURL = "http://tintafresca.com.mx/"
# section_list = ["letras_en_su_tinta/page1/", "tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/", "rafaga/page1/"]
section_list = ["tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/"]
section_list = ["letras_en_su_tinta/page1/", "tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/", "rafaga/page1/"]
# section_list = ["tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/"]
self.month_parser = dict(Enero='01', Febrero='02', Marzo='03', Abril='04', Mayo='05', Junio='06',
Julio='07', Agosto='08', Septiembre='09', Octubre='10', Noviembre='11', Diciembre='12')
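# NOTE (illustrative, not part of this diff): a month_parser mapping like the one above
# lets a Spanish date such as "30 de Agosto de 2018" be rewritten as an ISO day:
#
#   parts = "30 de Agosto de 2018".split()    # ['30', 'de', 'Agosto', 'de', '2018']
#   iso_day = parts[4] + "-" + self.month_parser[parts[2]] + "-" + parts[0].zfill(2)
#   # -> '2018-08-30'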
......
......@@ -76,7 +76,7 @@ class QuotesSpider(scrapy.Spider):
flow_info = response.meta['item']
for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract():
to_next_page = True
flow_info['to_next_page'] = True
news_link = self.baseURL + link
yield scrapy.Request(url=news_link, callback=self.parse_item)
......@@ -102,7 +102,7 @@ class QuotesSpider(scrapy.Spider):
news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
title = response.css('div.post-title').css('h1').extract_first()
if title is not None : remove_tags(title)
if title is not None : title = remove_tags(title)
topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
if topic is not None : topic = remove_tags(topic)
......
# -*- coding: utf-8 -*-
"""
===========================================================================
THIS VERSION OF Expreso Chiapas IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
NO LONGER ALLOWS ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_hacia_atras FOLDER.
===========================================================================
"""
"""
MEDIA:
Expreso Chiapas, Chiapas
......@@ -31,20 +19,7 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
CLEAN_RE = re.compile(r'\A.*?\sl\s')
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for Chiapas: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
# CLEAN_RE = re.compile(r'\A.*?\sl\s')
......@@ -56,24 +31,23 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "http://expresochiapas.com/noticias/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2)
baseURL = "http://expresochiapas.com/noticias/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//div[@class="mag-box-container"]/ul/li/a/@href').extract():
for link in response.css('div.penci-archive__list_posts').css('h2.entry-title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
nextPage = response.xpath('//div[@class="pages-nav"]/div').css('span.last-page').xpath('./a/@href').extract_first()
if nextPage is not None and nextPage != '':
yield scrapy.Request(url=nextPage, callback=self.parse)
next_page = response.css('div.nav-links').css('a.next::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
......@@ -81,28 +55,24 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
try:
## this date includes time zone ##
res = response.xpath('//article[@id="the-post"]/script[@type="application/ld+json"]').extract_first()
if res is not None:
res = remove_tags(res)
jsonObj = json.loads(res)
dat = jsonObj['datePublished']
except:
dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
item['date'] = dat
news_date = response.css('time.entry-date::attr(datetime)').extract_first()
item['title'] = remove_tags(response.xpath('//div[@class="entry-header"]/h1').extract_first()).strip()
title = response.css('h1.entry-title').extract_first()
if title is not None: title = remove_tags(title)
topic = response.xpath('//div[@class="entry-header"]/h5/a').extract_first()
if topic is not None:
item['topic'] = remove_tags(topic)
topic = response.css('div.penci-entry-categories').css('a').extract_first()
if topic is not None: topic = remove_tags(topic)
for p in response.xpath('//div[@class="entry-content entry clearfix"]/p').extract():
for p in response.css('div.penci-entry-content > p').extract():
p = p.replace("<br>", "\n")
text += remove_tags(p) + "\n"
text = text.strip()
item['text'] = CLEAN_RE.sub('', text)
item['url'] = response.url
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
......@@ -3,7 +3,7 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
......
......@@ -3,7 +3,7 @@
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
......@@ -20,14 +20,14 @@ class LajornadaSpiderMiddleware(object):
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider):
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(response, result, spider):
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
......@@ -35,7 +35,7 @@ class LajornadaSpiderMiddleware(object):
for i in result:
yield i
def process_spider_exception(response, exception, spider):
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
......@@ -43,7 +43,7 @@ class LajornadaSpiderMiddleware(object):
# or Item objects.
pass
def process_start_requests(start_requests, spider):
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
......@@ -54,3 +54,50 @@ class LajornadaSpiderMiddleware(object):
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class LajornadaDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
......@@ -3,7 +3,7 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
......
......@@ -5,9 +5,9 @@
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornada'
......@@ -25,7 +25,7 @@ NEWSPIDER_MODULE = 'laJornada.spiders'
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
......@@ -45,31 +45,31 @@ COOKIES_ENABLED = False
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'laJornada.middlewares.LajornadaSpiderMiddleware': 543,
#}
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'laJornada.middlewares.LajornadaDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
......@@ -82,7 +82,7 @@ ITEM_PIPELINES = {
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
......
# -*- coding: utf-8 -*-
"""
=======================================================================
THIS VERSION OF La Jornada IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
NO LONGER ALLOWS ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_rss FOLDER.
=======================================================================
MEDIA:
La Jornada, CDMX
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd laJornada/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
"""
MEDIA:
La Jornada, CDMX
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re
import scrapy, re, json
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
# re: r'(Twitter:\s+)?(@[\w.%+-]+.)?'
TW_RE = re.compile(r"""
(Twitter: # start of group, contains the string 'Twitter:' (case insensitive)
\s+ # any whitespace (\t\n\r\f), one or more occurrences
)? # end of group, zero or one occurrence of the group
(@ # start of group, contains the character '@'
[\w.%+-]+ # any alphanumeric character plus the signs (.%+-), one or more occurrences
. # any character, except '\n'
)? # end of group, zero or one occurrence of the group
""", re.X|re.I) # flags: verbose|case insensitive
# re: r'(Facebook|Vk):\s+[\w.%+-]+.'
FB_RE = re.compile(r"""
(Facebook|Vk) # group, contains the string 'Facebook' or 'Vk' (case insensitive)
: # contains the character ':'
\s+ # any whitespace (\t\n\r\f), one or more occurrences
[\w.%+-]+ # any alphanumeric character plus the signs (.%+-), one or more occurrences
. # any character, except '\n'
""", re.X|re.I) # flags: verbose|case insensitive
# re: r'\(?(Foro:\s+)?(https?:\/\/)?([w{3}.])?[\w%+-]+(\.[a-zA-Z]{2,6}){1,2}[/\w.#$%&+-]*\)?.'
URL_RE = re.compile(r"""
\(? # the character '(' or not, zero or one occurrence
(Foro: # start of group, contains the string 'Foro:' (case insensitive)
\s+ # any whitespace (\t\n\r\f), one or more occurrences
)? # end of group, zero or one occurrence of the group
(http # start of group, contains the string 'http'
s? # the character 's' or not
:\/\/ # contains the string '://'
)? # end of group, zero or one occurrence of the group
([w{3}.])? # the character 'w' three times and/or a dot (www.), zero or one occurrence
[\w%+-]+ # any alphanumeric character plus the signs (%+-), one or more occurrences
(\. # start of group, contains the character '.'
[a-zA-Z]{2,6} # 2 to 6 letters, lowercase or uppercase
){1,2} # end of group, the group repeats 1 to 2 times
[/\w.#$%&+-]* # followed by '/', any alphanumeric character plus the signs (.#$%&+-), zero or more occurrences
\)? # the character ')' or not, zero or one occurrence
. # any character, except '\n'
""", re.X|re.I) # flags: verbose|case insensitive
# re: r'[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?'
EMAIL_RE = re.compile(r"""
[\w.-]+ # any alphanumeric character plus the signs (.-), one or more repetitions
@ # followed by '@'
[\w-]+ # any alphanumeric character plus the sign '-', one or more repetitions
(\. # start of group, contains '.'
[a-zA-Z]{2,6} # 2 to 6 letters, lowercase or uppercase
){1,2} # end of group, the group repeats 1 to 2 times
\s? # any whitespace (\t\n\r\f), zero or one occurrence
""", re.X|re.I) # flags: verbose|case insensitive
DIVP_RE = re.compile(r'(<div class="(credito-(autor|titulo)|hemero)">.*?<\/div>|<p class="s-s">.{,35}<\/p>|<span class="loc">.*?<\/span>)', re.S)
TRANSLATION_RE = re.compile(r'Traducci.n.*', re.I|re.S)
def clean_raw(rawText):
text = rawText.replace("* * *", '')
text = DIVP_RE.sub('', text)
text = TRANSLATION_RE.sub('', text)
return text
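# NOTE (illustrative, not part of this diff): the helpers above strip promotional noise
# from the article body. TW_RE removes Twitter handles, FB_RE removes Facebook/Vk
# references, EMAIL_RE removes e-mail addresses, URL_RE removes URLs, and clean_raw()
# drops credit/hemero <div> blocks, short <p class="s-s"> paragraphs, <span class="loc">
# tags and anything after "Traduccion". For example:
#
#   sample = u"Texto de la nota. Twitter: @reportero Facebook: reportero.mx"
#   sample = TW_RE.sub('', sample)
#   sample = FB_RE.sub('', sample)
#   # -> u"Texto de la nota. "  (trailing whitespace left for text_cleaning below)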
def text_cleaning(text):
"""
Function for cleaning news text
"""
"""
Removes unnecessary double, triple or larger runs of spaces inside the text. It first splits the text by
line breaks, then splits each segment into words regardless of spacing. The words are then regrouped into
new segments with a single space between them, and the necessary line breaks are added back.
"""
newText = ''
counter = 0
text = text.replace(u'\u0164', '')
text = text.replace("Afp", '')
for segment in text.split("\n"):
counter += 1
if counter == 1:
newText += " ".join(segment.split())
elif counter > 1:
newText += "\n" + " ".join(segment.split())
"""---------------------------------------------------------------------------------------------------"""
class QuotesSpider(scrapy.Spider):
"""
Elimina del texto la info de facebook, twitter, foro y correo electronico.
Basic Scrapy Spider class
"""
newText = TW_RE.sub('', newText)
newText = FB_RE.sub('', newText)
newText = EMAIL_RE.sub('', newText)
newText = URL_RE.sub('', newText)
newText = TRANSLATION_RE.sub('', newText)
"""---------------------------------------------------------------------------------------------------"""
return newText
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para centro de mexico: utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
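# NOTE (illustrative, not part of this diff): the UTC class above is used further down to
# stamp items with an ISO 8601 date in the UTC-6 zone, e.g. (assuming date, datetime and
# time are imported from datetime, as in the older import line above):
#
#   tz = UTC()
#   datetime.combine(date(2017, 3, 22), time()).replace(tzinfo=tz).isoformat('T')
#   # -> '2017-03-22T00:00:00-06:00'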
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.counter = 0
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.jornada.unam.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)+'/'
self.comparison_date_1 = date(2001, 12, 7)
self.comparison_date_2 = date(2002, 1, 8)
self.comparison_date_3 = date(2003, 4, 25)
self.comparison_date_4 = date(2004, 11, 16)
self.comparison_date_5 = date(2004, 12, 12)
self.comparison_date_6 = date(2005, 1, 31)
self.comparison_date_7 = date(2009, 2, 15)
self.date = date(int(year), int(month), int(day))
self.parse_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
# self.section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
# 'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
# for section in section_list:
# for dates on or before 2009/02/15 the page has one particular structure
# for dates after that the structure changes
# if ( requested_date <= comparison_date_1 ):
# yield scrapy.Request(url=self.baseURL+section, callback=self.parse)
# else:
# yield scrapy.Request(url=self.baseURL+section, callback=self.parse_2)
if self.date <= self.comparison_date_2:
section_list = ['index.html', 'edito.html', 'opinion.html', 'politica.html',
'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html']
parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
'politica.html': 'Politica', 'economia.html': 'Economia',
'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
'deportes.html': 'Deportes'}
for s in section_list:
item = NoticiasItem()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html':
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
else:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_2 and self.date <= self.comparison_date_3:
section_list = ['index.html', 'edito.html', 'opinion.html', 'politica.html',
'economia.html', 'cultura.html', 'espectaculos.html', 'estados.html',
'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html',
'index.php', 'edito.php', 'opinion.php', 'politica.php',
'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php']
parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
'politica.html': 'Politica', 'economia.html': 'Economia',
'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos', 'estados.html': 'Estados',
'capital.html': 'Capital', 'mundo.html': 'Mundo', 'soc-jus.html': 'Sociedad',
'deportes.html': 'Deportes',
'index.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
'politica.php': 'Politica', 'economia.php': 'Economia',
'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
'deportes.php': 'Deportes'}
for s in section_list:
item = NoticiasItem()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
else:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_2)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
section_list = ['indexfla.php', 'edito.php', 'opinion.php', 'correo.php', 'politica.php',
'economia.php', 'cultura.php', 'espectaculos.php', 'estados.php',
'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php', 'index.php']
parse_s = {'indexfla.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos', 'estados.php': 'Estados',
'capital.php': 'Capital', 'mundo.php': 'Mundo', 'soc-jus.php': 'Sociedad',
'deportes.php': 'Deportes','index.php': 'Portada'}
for s in section_list:
item = NoticiasItem()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.php' or s == 'correo.php':
if self.date > self.comparison_date_3 and self.date <= self.comparison_date_5:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item_2)
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
else:
request = scrapy.Request(url=self.baseURL+s, callback=self.parse_3)
self.this_date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
request.meta['item'] = item
yield request
self.baseURL = "https://www.jornada.com.mx/" + year + "/" + month.zfill(2) + "/" + self.day.zfill(2) + "/"
elif self.date > self.comparison_date_6:
# print 'first filter'
section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
section_list = ["politica", "mundo", "capital", "cultura", "deportes",
"economia", "sociedad", "estados", "espectaculos"]
for s in section_list:
# for dates before 2009/02/15 and after 2005/01/31 the page has one particular structure
# for dates after that the structure changes
if self.date <= self.comparison_date_7:
yield scrapy.Request(url=self.baseURL+s, callback=self.parse_5)
elif self.date > self.comparison_date_7:
# print 'second filter in ' + self.baseURL + s
yield scrapy.Request(url=self.baseURL+s, callback=self.parse_6)
for s in section_list:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response):
item = response.meta['item']
if self.date <= self.comparison_date_1:
if item['topic'] == 'Portada':
path = '//td[@rowspan="3"]'
else:
if len(response.xpath('//td[@align="center"]').css('a::attr(href)').extract()) > 0:
path = '//td[@align="center"]'
else:
path = '//td[@align="CENTER"]'
elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_2:
if item['topic'] == 'Portada':
path = '//empieza'
else:
path = '//table[@bordercolor="#CCCCCC"]'
for r in response.xpath(path).css('a::attr(href)').extract():
if r[-5:] == '.html':
request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_2(self, response):
item = response.meta['item']
for r in response.xpath('//table[@bordercolor="#CCCCCC"]').css('a::attr(href)').extract():
if r[-5:] == '.html':
request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_3(self, response):
item = response.meta['item']
link_list = []
link_list.extend(response.xpath('//td[@width="100%"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//td[@width="52%"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//td[@width="24%"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//td[@width="646"]').css('a::attr(href)').extract())
link_list.extend(response.xpath('//table[@width="100%"]').css('a::attr(href)').extract())
for r in link_list:
if r[-11:] == '.html&fly=1' or r[-9:] == '.php&fly=' or r[-4:] == '.php':
if self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
if self.date <= self.comparison_date_4:
request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
request.meta['item'] = item
yield request
elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_6:
if r[:4] == 'http' and r[-4:] == '.php':
this_url = r.replace('\n','')
if self.date <= self.comparison_date_5:
request = scrapy.Request(url=this_url, callback=self.parse_item)
elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
request = scrapy.Request(url=this_url, callback=self.parse_item_2)
request.meta['item'] = item
yield request
# elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
# request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item_2)
# request.meta['item'] = item
# yield request
def parse_4(self, response):
print response.url
for r in response.xpath('//td[@width="646"]').css('a::attr(href)').extract():
if r[-4:] == '.php':
print r.replace('\n','')
# request = scrapy.Request(url=r.replace('\n',''), callback=self.parse_item)
# request.meta['item'] = item
# yield request
def parse_5(self, response):
if response.url[:response.url.rfind('/')+1] == self.baseURL: # checks that the same base URL is kept
section = response.url[response.url.rfind('/')+1:]
if section == 'opinion': # the 'opinion' section has a different structure from the others
path_list = ['//*[@id="columnas"]/p/a/@href',
'//*[@id="opinion"]/p/a/@href']
else:
path_list = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href']
for path in path_list:
for link in response.xpath(path).extract():
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_3)
def parse_6(self, response):
if response.url[:response.url.rfind('/')+1] == self.baseURL:
# linkSet = set()
# path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
# '//*[@class="item"]/div/a/@href']
#
# for path in path_list:
# for link in response.xpath(path).extract():
# if link not in linkSet:
# linkSet.add(link)
# yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)
linkSet = set()
linkLst = []
linkLst.extend(response.xpath('//*[@class="itemfirst"]/div/a/@href').extract())
linkLst.extend(response.xpath('//*[@class="item start"]/div/a/@href').extract())
linkLst.extend(response.xpath('//*[@class="item"]/div/a/@href').extract())
for l in linkLst:
link = self.baseURL + l
if not link in linkSet:
linkSet.add(link)
yield scrapy.Request(url=link, callback=self.parse_item_4)
link_set = set(response.css('div.section-cont').css('a.cabeza::attr(href)').extract())
for link in link_set:
yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
def parse_item(self, response):
"""
FECHAS <= 2004-12-12
"""
item = response.meta['item']
flag = True
text = ''
try:
title = remove_tags(response.xpath('//font[@size="5"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="5"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="5"]').extract()[1])
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="4"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="4"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="4"][1]').extract()[1])
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="3"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="3"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//p/font[@size="3"][1]').extract()[1])
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="+1"]').extract_first())
item['title'] = title
except:
try:
title = remove_tags(response.xpath('//font[@size="+0"]').extract_first())
item['title'] = title
except:
if self.date <= date(1999, 10, 3): # on this date the page HTML changes with respect to the later ones
try:
title = remove_tags(response.xpath('//center').extract_first())
item['title'] = title
flag = False
except:
pass
else:
pass
if flag:
if self.date <= self.comparison_date_1:
"""
FECHAS > 1999-10-03 Y FECHAS <= 2001-12-07
"""
for p in response.css('p').extract():
# text += remove_tags(p).replace('\r','') ## does not take into account the first indexes, where the title is
# text = text.replace('\t','')
p = clean_raw(p)
newsText = remove_tags(p)
text += text_cleaning(newsText)
m = re.search(title, text)
if title[-1] == "?": text = text[m.end()+1:]
else: text = text[m.end():]
text = text.lstrip("\n")
text = text.rstrip("\n")
elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_3:
"""
FECHAS > 2001-12-07 Y FECHAS <= 2003-04-25
"""
for p in response.xpath('//table[@bordercolor="#CCCCCC"]').css('p').extract():
# text += remove_tags(p).replace('\r','')
# text = text.replace('\t','')
p = clean_raw(p)
newsText = remove_tags(p)
text += text_cleaning(newsText)
m = re.search(title, text)
if title[-1] == "?": text = text[m.end()+1:]
else: text = text[m.end():]
text = text.lstrip("\n")
text = text.rstrip("\n")
elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_4:
"""
FECHAS > 2003-04-25 Y FECHAS <= 2004-11-16
"""
p = response.css('p').extract()
for i in range(0, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
aux = clean_raw(p[i])
newsText = remove_tags(aux).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_5:
"""
FECHAS > 2004-11-16 Y FECHAS <= 2004-12-12
"""
p = response.css('p').extract()
for i in range(3, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
aux = clean_raw(p[i])
newsText = remove_tags(aux).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
if text == '':
for i in range(0, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
aux = clean_raw(p[i])
newsText = remove_tags(aux).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
else:
"""
FECHAS <= 1999-10-03
"""
# text = remove_tags(response.body)
# text = text[len(title):]
m = re.search(re.escape(title), response.body)  # escape the title before using it as a pattern
body = response.body[m.end():]
body = clean_raw(body)
newsText = remove_tags(body).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
item['text'] = text
item['url'] = response.url
yield item
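The nested try/except cascade at the top of parse_item above walks a fixed list of <font size=...> selectors from largest to smallest until one of them yields a title. As an illustrative refactoring sketch only (extract_title and TITLE_XPATHS are not part of the spider, and the second-match fallbacks of the original are left out), the same fall-through can be written as a loop:

import re

TAG_RE = re.compile(r'<[^>]+>')  # same tag-stripping pattern the spiders use

# Candidate selectors, in the order of preference the try/except chain encodes.
TITLE_XPATHS = [
    '//font[@size="5"]',
    '//p/font[@size="5"]',
    '//font[@size="4"]',
    '//p/font[@size="4"]',
    '//font[@size="3"]',
    '//p/font[@size="3"]',
    '//font[@size="+1"]',
    '//font[@size="+0"]',
]

def extract_title(response):
    """Return the first non-empty title candidate with tags stripped, or None."""
    for xpath in TITLE_XPATHS:
        node = response.xpath(xpath).extract_first()
        if node:
            return TAG_RE.sub('', node)
    return None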
def parse_item_2(self, response):
"""
DATES > 2004-12-12 AND DATES <= 2005-01-31
"""
item = response.meta['item']
text = ''
# titleLst = []
# titleLst.extend(response.xpath('//*[@id="contenido"]/h1/text()').extract())
# titleLst.extend(response.xpath('//h1/text()').extract())
titleSet = set()
titleSet.add(response.xpath('//*[@id="contenido"]/h1').extract_first())
titleSet.add(response.xpath('//h1').extract_first())
for t in titleSet:
if t is not None and t != '':
title = remove_tags(t).replace('\r','')
title = title.replace('\t','')
item['title'] = title
p = response.css('p').extract()
for i in range(4, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
newsText = remove_tags(p[i]).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
if text == '':
for i in range(0, len(p)):
# text += remove_tags(p[i]).replace('\r','')
# text = text.replace('\t','')
newsText = remove_tags(p[i]).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
item['text'] = text
item['url'] = response.url
yield item
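parse_item_2 keeps the "skip the first few <p> blocks, then fall back to all of them when nothing is left" pattern also used in parse_item. A self-contained sketch of that pattern (join_paragraphs is hypothetical; the real code additionally runs the module's text_cleaning helper, which is not shown in this excerpt):

import re

TAG_RE = re.compile(r'<[^>]+>')

def join_paragraphs(paragraphs, skip=4):
    """Join paragraph markup into plain text, skipping the first `skip` blocks.

    Falls back to the full list when skipping leaves an empty result."""
    text = "".join(TAG_RE.sub('', p).strip("\n") for p in paragraphs[skip:])
    if text == '':
        text = "".join(TAG_RE.sub('', p).strip("\n") for p in paragraphs)
    return text

print(join_paragraphs(["<p>nav</p>"] * 4 + ["<p>Body text.</p>"]))  # -> Body text.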
def parse_item_3(self, response):
"""
DATES > 2005-01-31 AND DATES <= 2009-02-15
"""
item = NoticiasItem()
text = ''
titleSet = set()
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
# title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
# if len(title) > 0:
# item['title'] = title[0]
# else:
# item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
titleSet.add(response.xpath('//*[@class="documentContent"]/h1[@class="title"]').extract_first())
titleSet.add(response.xpath('//*[@class="documentContent"]/h1').extract_first())
for t in titleSet:
if t is not None and t != '':
title = remove_tags(t).replace('\r','')
title = title.replace('\t','')
item['title'] = title
item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract_first()
for p in response.xpath('//*[@class="documentContent"]/p').extract():
# text += remove_tags(p).replace('\r','')
# text = text.replace('\t','')
newsText = remove_tags(p).lstrip("\n")
newsText = newsText.rstrip("\n")
text += text_cleaning(newsText)
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
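The item['date'] value built in parse_item_3 (and in the other parse_item_* methods) is midnight on the requested date, tagged with the spider's time zone object self.tz, which is defined elsewhere in the spider and not shown in this excerpt. A worked sketch, assuming a fixed UTC-6 (Mexico City) offset:

from datetime import date, datetime, time, timedelta, tzinfo

class FixedOffset(tzinfo):
    """Fixed UTC-6 offset; an assumption standing in for the spider's self.tz."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return "UTC-6"
    def dst(self, dt):
        return timedelta(0)

stamp = datetime.combine(date(2007, 5, 1), time()).replace(tzinfo=FixedOffset()).isoformat('T')
print(stamp)  # 2007-05-01T00:00:00-06:00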
def parse_item_4(self, response):
"""
DATES > 2009-02-15
"""
d = response.xpath('//*[@class="main-fecha"]/text()').extract_first()
d = d.replace('de', '').replace('  ', ' ').split(' ')  # drop 'de' and collapse the double spaces it leaves behind
newsDate = date(int(d[3]), self.parse_month[d[2].lower()], int(d[1]))
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
d, t = news_date.split("T")
if d != self.this_date:
news_date = d[:d.rfind('-')] + "-" + self.day.zfill(2) + "T" + t
if newsDate == self.date:
item = NoticiasItem()
text = ''
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
# path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
textLst = []
textLst.extend(response.xpath('//*[@class="col"]').extract())
textLst.extend(response.xpath('//*[@class="col col1"]').extract())
textLst.extend(response.xpath('//*[@class="col col2"]').extract())
title = response.css('div.cabeza').extract_first()
if title is not None: title = remove_tags(title)
# item['date'] = self.date
item['date'] = datetime.combine(newsDate, time()).replace(tzinfo=self.tz).isoformat('T')
title = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['title'] = " ".join(title.split())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
topic = response.css('img.title::attr(title)').extract_first()
if topic is not None: topic = remove_tags(topic)
author = response.xpath('//*[@class="credito-autor"]/text()').extract_first()
if author is None or author == '':
author = response.xpath('//*[@class="credito-articulo"]/text()').extract_first()
if author is not None and author != '':
item['author'] = author
for p in response.css('div.text').css('p').extract():
p = p.replace("<br>", "\n")
text += remove_tags(p) + "\n"
location = response.xpath('//p[@class="s-s"]').extract_first()  # guard against a missing element before stripping tags
if location is not None and location != '' and len(remove_tags(location)) <= 35:
item['location'] = remove_tags(location)
for p in textLst:
# text += remove_tags(p).replace('\r', '')
# text = text.replace('\t', '')
p = clean_raw(p)
# newsText = remove_tags(p).lstrip("\n")
# newsText = newsText.rstrip("\n")
# text += text_cleaning(newsText)
text += remove_tags(p)
text = text.lstrip("\n")
text = text.rstrip("\n")
text = text_cleaning(text)
item['text'] = text
item['url'] = response.url
# print item['title']
# print 'title: ' + item['title'] + '\nurl: ' + item['url'] + '\n'
yield item
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
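For dates where the article:published_time meta tag is not usable, parse_item_4 rebuilds the article date from the human-readable main-fecha string via the self.parse_month lookup (defined earlier in the spider, outside this excerpt). A self-contained sketch of that conversion; the PARSE_MONTH mapping and the sample string below are assumptions:

from datetime import date

# Assumed Spanish month-name lookup, standing in for the spider's self.parse_month.
PARSE_MONTH = {
    'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
    'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
    'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12,
}

def parse_main_fecha(raw):
    """'Martes 16 de noviembre de 2004' -> datetime.date(2004, 11, 16)"""
    parts = raw.replace('de', '').replace('  ', ' ').split(' ')
    return date(int(parts[3]), PARSE_MONTH[parts[2].lower()], int(parts[1]))

print(parse_main_fecha('Martes 16 de noviembre de 2004'))  # 2004-11-16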
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = laJornada.settings
......
......@@ -3,7 +3,7 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
......
......@@ -8,7 +8,7 @@
from scrapy import signals
class ElsalvadorSpiderMiddleware(object):
class PorestoSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
......@@ -56,7 +56,7 @@ class ElsalvadorSpiderMiddleware(object):
spider.logger.info('Spider opened: %s' % spider.name)
class ElsalvadorDownloaderMiddleware(object):
class PorestoDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
......
......@@ -3,7 +3,7 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
......
# -*- coding: utf-8 -*-
# Scrapy settings for elSalvador project
# Scrapy settings for porEsto project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
......@@ -9,14 +9,14 @@
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elSalvador'
BOT_NAME = 'porEsto'
SPIDER_MODULES = ['elSalvador.spiders']
NEWSPIDER_MODULE = 'elSalvador.spiders'
SPIDER_MODULES = ['porEsto.spiders']
NEWSPIDER_MODULE = 'porEsto.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elSalvador (+http://www.yourdomain.com)'
#USER_AGENT = 'porEsto (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
......@@ -27,7 +27,7 @@ NEWSPIDER_MODULE = 'elSalvador.spiders'
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -47,13 +47,13 @@ COOKIES_ENABLED = False
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elSalvador.middlewares.ElsalvadorSpiderMiddleware': 543,
# 'porEsto.middlewares.PorestoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elSalvador.middlewares.ElsalvadorDownloaderMiddleware': 543,
# 'porEsto.middlewares.PorestoDownloaderMiddleware': 543,
#}
# Enable or disable extensions
......@@ -65,7 +65,7 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'elSalvador.pipelines.JsonWriterPipeline': 300,
'porEsto.pipelines.JsonWriterPipeline': 300,
}
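ITEM_PIPELINES points at porEsto.pipelines.JsonWriterPipeline, whose body is collapsed out of this diff. A minimal, hypothetical sketch of such a pipeline (the filename handling mirrors the `-s filename=...` flag used to launch these crawlers; the real implementation may differ):

import codecs, json

class JsonWriterPipeline(object):
    """Write each scraped item as one JSON object per line."""

    def open_spider(self, spider):
        filename = spider.settings.get('filename', 'items.json')
        self.file = codecs.open(filename, 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item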
# Enable and configure the AutoThrottle extension (disabled by default)
......
# -*- coding: utf-8 -*-
"""
MEDIA:
Por Esto!, Yucatán
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd porEsto/
$ scrapy crawl noticias --nolog -s filename=2018-08-22.json -a year=2018 -a month=8 -a day=22
"""
import scrapy, re, json
from porEsto.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
baseURL = "http://www.poresto.net/" + year + "/" + month.zfill(2) + "/" + day.zfill(2) + "/"
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
for link in response.css('div.jeg_inner_content').css('h3.jeg_post_title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.jeg_navigation').css('a.next::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.css('h1.jeg_post_title').extract_first()
if title is not None: title = remove_tags(title)
topic = response.css('div.jeg_meta_category > span > a').extract_first()
if topic is not None: topic = remove_tags(topic)
for p in response.css('div.entry-content').css('p').extract():
text += remove_tags(p) + "\n"
if text == '':
text = response.css('div.entry-content').css('div.content-inner').extract_first()
text.replace("<br>", '')
text = remove_tags(text)
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
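The USAGE block above shows the command-line invocation; an equivalent way to launch the same spider from a script, assuming it runs from the porEsto/ project directory so get_project_settings can find scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    settings = get_project_settings()
    settings.set("filename", "2018-08-22.json")  # consumed by the JSON-writing pipeline
    process = CrawlerProcess(settings)
    process.crawl("noticias", year="2018", month="8", day="22")
    process.start()  # blocks until the crawl finishes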
......@@ -4,8 +4,8 @@
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elSalvador.settings
default = porEsto.settings
[deploy]
#url = http://localhost:6800/
project = elSalvador
project = porEsto
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tiempoDigital.settings
[deploy]
#url = http://localhost:6800/
project = tiempoDigital
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TiempodigitalSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from tiempoDigital.items import NoticiasItem
"""
MEDIA:
Tiempo Digital, Oaxaca
USAGE:
scrapy crawl noticias --nolog -s filename=2018-02-04.json -a year=2018 -a month=2 -a day=4
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "http://tiempodigital.mx/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.page-nav').xpath('./a/@href').extract()
if len(pagination) > 0:
try:
pagination = pagination[-2]
except:
pagination = pagination[-1]
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/') + 1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)
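The page count above is derived from the href of the next-to-last pagination link. A worked example with a hypothetical archive URL:

# Hypothetical last-page link taken from div.page-nav:
pagination = "http://tiempodigital.mx/2018/02/04/page/7/"
pagination = pagination.strip('/')                    # ...2018/02/04/page/7
pages = int(pagination[pagination.rfind('/') + 1:])   # 7
print([page + 1 for page in range(1, pages)])         # [2, 3, 4, 5, 6, 7] -> pages requested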
def parse_page(self, response):
for post in response.xpath('//div[@class="td-ss-main-content"]').css('div.item-details'):
item = NoticiasItem()
topic = post.css('div.td-module-meta-info').xpath('./a').extract_first()
if topic is not None:
item['topic'] = remove_tags(topic)
else:
item['topic'] = None
author = post.css('div.td-module-meta-info').xpath('./span[@class="td-post-author-name"]/a').extract_first()
if author is not None:
item['author'] = remove_tags(author)
link = post.css('h3.entry-title').xpath('./a/@href').extract_first()
request = scrapy.Request(url=link, callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
item = response.meta['item']
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//span[@class="td-post-date"]/time/@datetime').extract_first()
item['title'] = remove_tags(response.xpath('//header[@class="td-post-title"]/h1').extract_first()).strip()
for p in response.xpath('//div[@class="td-post-content"]').css('p').extract():
text += remove_tags(p) + "\n"
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
# location = G_RE.sub('', m).strip()
# if len(location) <= 35:
# item['location'] = location
# text = text[text.find(m)+len(m):]
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text)
# text = TAG3_RE.sub('', text)
item['text'] = text.strip()
item['url'] = response.url
yield item
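The commented-out regexes above (kept in the file but currently disabled) were meant to strip the dateline ("CITY, State.- ") and contact noise from the article body. An illustrative run of the LOC_RE / G_RE pair on made-up text:

import re

LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
G_RE = re.compile(r' ?- ?')

text = "\nOaxaca de Juarez, Oax. - El ayuntamiento anuncio nuevas obras."  # invented sample
result = LOC_RE.search(text)
if result:
    m = result.group(0)
    location = G_RE.sub('', m).strip()
    print(location)                        # Oaxaca de Juarez, Oax.
    print(text[text.find(m) + len(m):])    # El ayuntamiento anuncio nuevas obras.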