Commit 834d071c authored by Renán Sosa Guillen

merge foraneos

parents a760e68d 9a8630fb
@@ -473,6 +473,16 @@ Additionally, the following foreign outlets are available:
scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
```
No articles were found before 2013.10.04.
* [El Heraldo, Honduras](http://www.elheraldo.hn)
Usage:
```bash
cd descarga_hacia_atras/foraneos/heraldoHn
scrapy crawl noticias --nolog -s filename=noticias.json                                   # fetches every article it can reach
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=9  # fetches articles back to the given date
```
No articles were found before ----.--.--.
* [La Prensa Gráfica, El Salvador](https://www.laprensagrafica.com)
Usage:
......
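A minimal sketch (not part of this commit) of how the `-a year=... -a month=... -a day=...` flags above reach the crawler: Scrapy hands every `-a key=value` pair to the spider's constructor as a keyword argument, so the spider can assemble the `stopDate` it checks later. The class name here is illustrative; only `stopDate` appears in the real spider.

```python
# -*- coding: utf-8 -*-
from datetime import date

import scrapy


class NoticiasSketch(scrapy.Spider):
    """Illustrative spider showing how -a year/month/day could set stopDate."""
    name = "noticias"

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        super(NoticiasSketch, self).__init__(*args, **kwargs)
        # Without the -a flags the crawl is unbounded; with them it stops
        # once articles older than stopDate start appearing.
        if year and month and day:
            self.stopDate = date(int(year), int(month), int(day))
        else:
            self.stopDate = None
```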
@@ -36,6 +36,7 @@
{"nombre": "Yucatan al Minuto", "crawler": "descarga_hacia_atras/yucatanAlMinuto", "desde": "17-01-2017", "url": "http://www.yucatanalminuto.com"},
{"nombre": "Yucatan en Corto", "crawler": "descarga_por_dia/yucatanEnCorto", "desde": "02-04-2011", "url": "http://www.yucatanencorto.com/noticias"},
{"nombre": "Diario Co Latino", "crawler": "descarga_por_dia/foraneos/diarioCoLatino", "desde": "04-10-2013", "url": "https://www.diariocolatino.com"},
{"nombre": "El Heraldo Hn", "crawler": "descarga_hacia_atras/foraneos/heraldoHn", "url": "http://www.elheraldo.hn"},
{"nombre": "La Prensa Grafica", "crawler": "descarga_hacia_atras/foraneos/prensaGrafica", "desde": "05-09-2017", "url": "https://www.laprensagrafica.com"},
{"nombre": "The San Pedro Sun", "crawler": "descarga_por_dia/foraneos/sanPedroSun", "desde": "21-07-2008", "url": "https://www.sanpedrosun.com"},
{"nombre": "Tiempo Digital Hn", "crawler": "descarga_por_dia/foraneos/tiempoDigitalHn", "desde": "17-04-2015", "url": "https://tiempo.hn"},
......
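A hedged sketch of how a driver script might consume the registry above; the file name `crawlers.json` and the loop are assumptions, not code from this repository.

```python
import json

# Hypothetical registry file name; the real path is not shown in this diff.
with open("crawlers.json") as f:
    crawlers = json.load(f)

for media in crawlers:
    # "desde" (earliest date with articles, DD-MM-YYYY) is optional:
    # the "El Heraldo Hn" entry above omits it until that date is known.
    print(media["nombre"], media["crawler"], media.get("desde", "unknown"))
```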
@@ -21,6 +21,8 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')
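# Explanatory note (not in the original commit): LOC is meant to capture a
# leading dateline such as "TEGUCIGALPA, Honduras." at the start of a body:
# an uppercase start, a comma, then a second capitalized word up to the first
# period, e.g. LOC.match(u'TEGUCIGALPA, Honduras.').group(0) yields
# u'TEGUCIGALPA, Honduras.'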
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
@@ -45,15 +47,15 @@ class QuotesSpider(scrapy.Spider):
baseURL = "http://www.elheraldo.hn/"
sectionList = ["economia", "mundo", "tecnologia", "cine", "cultura", "turismo",
"honduras", "sucesos", "espectaculos", "deportes"]
# sectionList = ["economia"]
sectionList = ["tegucigalpa", "economia", "mundo", "revistas/crimenes",
"pais", "sucesos", "deportes", "entretenimiento"]
# sectionList = ["tegucigalpa"]
if self.stopDate is None:
for s in sectionList:
info = ImportantData()
info['page'] = 1
request = scrapy.Request(url=baseURL + s, callback=self.parse)
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = info
yield request
@@ -62,27 +64,34 @@ class QuotesSpider(scrapy.Spider):
info = ImportantData()
info['page'] = 1
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = info
yield request
def parse(self, response):
print(response.url)
searchData = response.meta['item']
CONTINUE_SEARCHING = True
linkSet = set()
if searchData['page'] == 1:
searchData['section_url'] = response.url + "/"
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
searchData['section_url'] = response.url
linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//section[@id="principal"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
try:
linkSet.remove(searchData['section_url'])
except KeyError:
pass
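# Note (added for clarity, not in the original): an empty link set means this
# listing page produced no article URLs, so pagination of the section stops.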
if len(linkSet) <= 0:
CONTINUE_SEARCHING = False
for link in linkSet:
@@ -103,21 +112,27 @@ class QuotesSpider(scrapy.Spider):
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
searchData['section_url'] = response.url + "/"
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
linkList = list(linkSet)
searchData['section_url'] = response.url
linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
linkList.remove(searchData['section_url'])
else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
try:
linkSet.remove(searchData['section_url'])
linkList = list(linkSet)
linkList.remove(searchData['section_url'])
except KeyError:
linkList = []
pass
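# Note (added for clarity, not in the original): build an order-preserving,
# de-duplicated copy of the links; converting to a set would lose the order.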
newsList = []
for link in linkList:
if link not in newsList:
newsList.append(link)
for link in linkList:
info = ImportantData()
@@ -142,10 +157,11 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
d = response.xpath('//time/text()').extract_first()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
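# Note (added for clarity, not in the original): the visible <time> tag holds
# the date as DD.MM.YYYY (e.g. "09.03.2018"); strptime converts it to an
# ISO 8601 string for item['date'].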
item['title'] = newsData['headline']
try:
@@ -154,7 +170,15 @@ class QuotesSpider(scrapy.Spider):
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0') + 1:]
item['text'] = text
item['url'] = response.url
@@ -162,18 +186,16 @@ class QuotesSpider(scrapy.Spider):
def parse_item_with_stop_date(self, response):
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
d = newsData['datePublished']
d = d[:d.find("T")]
dt = datetime.strptime(d, '%Y-%m-%d').date()
d = response.xpath('//time/text()').extract_first()
dt = datetime.strptime(d, '%d.%m.%Y').date()
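# Note (added for clarity, not in the original): articles dated on or after
# stopDate are stored and the crawl goes on; an older article ends the walk.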
if dt >= self.stopDate:
info = response.meta['item']
item = NoticiasItem()
text = ''
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
@@ -182,7 +204,15 @@ class QuotesSpider(scrapy.Spider):
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0')+1:]
item['text'] = text
item['url'] = response.url
......
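A self-contained sketch (not part of the commit) of the dateline handling applied to `articleBody` above: split at the first non-breaking space, try the `LOC` pattern on the prefix, and keep the remainder as the article text. The sample body is invented.

```python
# -*- coding: utf-8 -*-
import re

LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')


def split_dateline(text):
    """Return (location, body); location is None when no dateline matches."""
    nbsp = text.find(u'\u00a0')
    if nbsp < 0:
        return None, text
    m = LOC.match(text[:nbsp] + ".")
    return (m.group(0) if m else None), text[nbsp + 1:]


location, body = split_dateline(u'TEGUCIGALPA, Honduras\u00a0El Congreso aprobó la reforma.')
print(location)  # TEGUCIGALPA, Honduras.
print(body)      # El Congreso aprobó la reforma.
```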