Commit 834d071c authored by Renán Sosa Guillen

merge foraneos

parents a760e68d 9a8630fb
@@ -473,6 +473,16 @@ Additionally, the following foreign media outlets are available:
     scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
     ```
     No articles were found before 2013.10.04.
+
+* [El Heraldo, Honduras](http://www.elheraldo.hn)
+  Usage:
+  ```bash
+  cd descarga_hacia_atras/foraneos/heraldoHn
+  scrapy crawl noticias --nolog -s filename=noticias.json  # retrieves all available articles
+  scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=9  # retrieves articles back to the given date
+  ```
+  No articles were found before ----.--.--.
+
 * [La Prensa Gráfica, El Salvador](https://www.laprensagrafica.com)
   Usage:
...
@@ -36,6 +36,7 @@
   {"nombre": "Yucatan al Minuto", "crawler": "descarga_hacia_atras/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com"},
   {"nombre": "Yucatan en Corto", "crawler": "descarga_por_dia/yucatanEnCorto", "desde": "02-04-2011", "url": "http://www.yucatanencorto.com/noticias"},
   {"nombre": "Diario Co Latino", "crawler": "descarga_por_dia/foraneos/diarioCoLatino", "desde": "04-10-2013", "url": "https://www.diariocolatino.com"},
+  {"nombre": "El Heraldo Hn", "crawler": "descarga_hacia_atras/foraneos/heraldoHn", "url": "http://www.elheraldo.hn"},
   {"nombre": "La Prensa Grafica", "crawler": "descarga_hacia_atras/foraneos/prensaGrafica", "desde": "05-09-2017", "url": "https://www.laprensagrafica.com"},
   {"nombre": "The San Pedro Sun", "crawler": "descarga_por_dia/foraneos/sanPedroSun", "desde": "21-07-2008", "url": "https://www.sanpedrosun.com"},
   {"nombre": "Tiempo Digital Hn", "crawler": "descarga_por_dia/foraneos/tiempoDigitalHn", "desde": "17-04-2015", "url": "https://tiempo.hn"},
...
@@ -21,6 +21,8 @@ TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
 
+LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')
+
 
 class ImportantData(scrapy.Item):
     CONTINUE_SEARCHING = scrapy.Field()
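For orientation, here is a minimal standalone sketch of what the `LOC` pattern added above, together with the `\u00a0` split used further down in `parse_item`, appears intended to extract: a leading dateline such as "TEGUCIGALPA, Honduras" separated from the article body by a non-breaking space. The sample body string is an assumption for illustration only, not taken from the site.

```python
# -*- coding: utf-8 -*-
import re

LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')  # same pattern as in the spider

# Hypothetical article body; the spider reads the real one from newsData['articleBody'].
text = u'TEGUCIGALPA, Honduras\u00a0El gobierno anuncio nuevas medidas este viernes.'

if text.find(u'\u00a0') >= 0:
    loc = text[:text.find(u'\u00a0')] + "."   # "TEGUCIGALPA, Honduras."
    m = LOC.match(loc)
    if m:
        print(m.group(0))                     # dateline captured as the item's location
    text = text[text.find(u'\u00a0') + 1:]    # body with the dateline stripped
```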
@@ -45,15 +47,15 @@ class QuotesSpider(scrapy.Spider):
         baseURL = "http://www.elheraldo.hn/"
 
-        sectionList = ["economia", "mundo", "tecnologia", "cine", "cultura", "turismo",
-                       "honduras", "sucesos", "espectaculos", "deportes"]
-        # sectionList = ["economia"]
+        sectionList = ["tegucigalpa", "economia", "mundo", "revistas/crimenes",
+                       "pais", "sucesos", "deportes", "entretenimiento"]
+        # sectionList = ["tegucigalpa"]
 
         if self.stopDate is None:
             for s in sectionList:
                 info = ImportantData()
                 info['page'] = 1
-                request = scrapy.Request(url=baseURL + s, callback=self.parse)
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
                 request.meta['item'] = info
                 yield request
@@ -62,27 +64,34 @@ class QuotesSpider(scrapy.Spider):
                 info = ImportantData()
                 info['page'] = 1
                 info['CONTINUE_SEARCHING'] = False
-                request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
                 request.meta['item'] = info
                 yield request
 
     def parse(self, response):
-        print response.url
         searchData = response.meta['item']
         CONTINUE_SEARCHING = True
+        linkSet = set()
 
         if searchData['page'] == 1:
-            searchData['section_url'] = response.url + "/"
-            entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
-            entrySet.remove(searchData['section_url'])
-            linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
+            searchData['section_url'] = response.url
+            linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//section[@id="principal"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
             linkSet.remove(searchData['section_url'])
-            linkSet.union(entrySet)
         else:
-            linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
+            linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
+            linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
             try:
                 linkSet.remove(searchData['section_url'])
             except KeyError:
-                CONTINUE_SEARCHING = False
+                pass
+
+        if len(linkSet) <= 0:
+            CONTINUE_SEARCHING = False
 
         for link in linkSet:
@@ -103,21 +112,27 @@ class QuotesSpider(scrapy.Spider):
         if not CONTINUE_SEARCHING:
             if searchData['page'] == 1:
-                searchData['section_url'] = response.url + "/"
-                entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
-                entrySet.remove(searchData['section_url'])
-                linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
-                linkSet.remove(searchData['section_url'])
-                linkSet.union(entrySet)
-                linkList = list(linkSet)
+                searchData['section_url'] = response.url
+                linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
+                linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
+                linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
+                linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
+                linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
+                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+                linkList.remove(searchData['section_url'])
             else:
-                linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
+                linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
+                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
                 try:
-                    linkSet.remove(searchData['section_url'])
-                    linkList = list(linkSet)
+                    linkList.remove(searchData['section_url'])
                 except KeyError:
-                    linkList = []
+                    pass
+
+            newsList = []
+            for link in linkList:
+                if not link in newsList:
+                    newsList.append(link)
 
             for link in linkList:
                 info = ImportantData()
@@ -142,10 +157,11 @@ class QuotesSpider(scrapy.Spider):
     def parse_item(self, response):
         item = NoticiasItem()
+        d = response.xpath('//time/text()').extract_first()
         res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
         newsData = json.loads(res)
 
-        item['date'] = newsData['datePublished'][:-1]
+        item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
         item['title'] = newsData['headline']
 
         try:
@@ -154,7 +170,15 @@ class QuotesSpider(scrapy.Spider):
             topic = None
         item['topic'] = topic
 
-        item['text'] = newsData['articleBody']
+        text = newsData['articleBody']
+        if text.find(u'\u00a0') >= 0:
+            loc = text[:text.find(u'\u00a0')] + "."
+            m = LOC.match(loc)
+            if m:
+                item['location'] = m.group(0)
+            text = text[text.find(u'\u00a0') + 1:]
+
+        item['text'] = text
         item['url'] = response.url
@@ -162,18 +186,16 @@ class QuotesSpider(scrapy.Spider):
     def parse_item_with_stop_date(self, response):
-        res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
-        newsData = json.loads(res)
-        d = newsData['datePublished']
-        d = d[:d.find("T")]
-        dt = datetime.strptime(d, '%Y-%m-%d').date()
+        d = response.xpath('//time/text()').extract_first()
+        dt = datetime.strptime(d, '%d.%m.%Y').date()
 
         if dt >= self.stopDate:
             info = response.meta['item']
             item = NoticiasItem()
-            text = ''
+            res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
+            newsData = json.loads(res)
 
-            item['date'] = newsData['datePublished'][:-1]
+            item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
             item['title'] = newsData['headline']
 
             try:
@@ -182,7 +204,15 @@ class QuotesSpider(scrapy.Spider):
                 topic = None
             item['topic'] = topic
 
-            item['text'] = newsData['articleBody']
+            text = newsData['articleBody']
+            if text.find(u'\u00a0') >= 0:
+                loc = text[:text.find(u'\u00a0')] + "."
+                m = LOC.match(loc)
+                if m:
+                    item['location'] = m.group(0)
+                text = text[text.find(u'\u00a0')+1:]
+
+            item['text'] = text
             item['url'] = response.url
...
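As a quick reference for the date change in both `parse_item` methods: the old code took the JSON-LD `datePublished` value and dropped its trailing "Z", while the new code parses the visible `<time>` element, which it expects in day.month.year format, and re-serializes it with `isoformat("T")`. The sample values below are assumptions for illustration, not real responses.

```python
from datetime import datetime

# New behaviour: parse the <time> element text (assumed sample value).
d = "09.03.2018"
print(datetime.strptime(d, '%d.%m.%Y').isoformat("T"))  # -> 2018-03-09T00:00:00

# Old behaviour: strip the trailing 'Z' from JSON-LD datePublished (assumed sample value).
datePublished = "2018-03-09T10:15:00Z"
print(datePublished[:-1])                                # -> 2018-03-09T10:15:00
```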