Commit 20e53f8f authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 15cf2c16
......@@ -21,6 +21,8 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return *text* with every HTML tag (anything matching TAG_RE) stripped out."""
    stripped = TAG_RE.sub('', text)
    return stripped
# Pattern for a leading dateline such as "Tegucigalpa, HONDURAS." — two
# capitalized tokens separated by a comma, up to the first period.
# NOTE(review): matched later against the text before the first \u00a0 to
# extract item['location']; presumably articles start with such a dateline
# — confirm against the crawled pages.
LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
......@@ -45,9 +47,9 @@ class QuotesSpider(scrapy.Spider):
baseURL = "http://www.elheraldo.hn/"
# sectionList = ["tegucigalpa", "economia", "opinion", "revistas/crimenes",
# "pais", "sucesos", "deportes", "entretenimiento"]
sectionList = ["tegucigalpa"]
sectionList = ["tegucigalpa", "economia", "mundo", "revistas/crimenes",
"pais", "sucesos", "deportes", "entretenimiento"]
# sectionList = ["tegucigalpa"]
if self.stopDate is None:
for s in sectionList:
......@@ -68,6 +70,7 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
print response.url
searchData = response.meta['item']
CONTINUE_SEARCHING = True
linkSet = set()
......@@ -75,6 +78,8 @@ class QuotesSpider(scrapy.Spider):
searchData['section_url'] = response.url
linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//section[@id="principal"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
linkSet.remove(searchData['section_url'])
......@@ -85,6 +90,8 @@ class QuotesSpider(scrapy.Spider):
try:
linkSet.remove(searchData['section_url'])
except KeyError:
pass
if len(linkSet) <= 0:
CONTINUE_SEARCHING = False
for link in linkSet:
......@@ -106,20 +113,26 @@ class QuotesSpider(scrapy.Spider):
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
searchData['section_url'] = response.url
entrySet = set(response.css('article.entry').css('div.content').css('a::attr(href)').extract())
entrySet.remove(searchData['section_url'])
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkSet.remove(searchData['section_url'])
linkSet.union(entrySet)
linkList = list(linkSet)
linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
linkList.remove(searchData['section_url'])
else:
linkSet = set(response.css('article.grid').css('div.content').css('a::attr(href)').extract())
linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
try:
linkSet.remove(searchData['section_url'])
linkList = list(linkSet)
linkList.remove(searchData['section_url'])
except KeyError:
linkList = []
pass
newsList = []
for link in linkList:
if not link in newsList:
newsList.append(link)
for link in linkList:
info = ImportantData()
......@@ -144,10 +157,11 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
d = response.xpath('//time/text()').extract_first()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
......@@ -156,7 +170,15 @@ class QuotesSpider(scrapy.Spider):
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0') + 1:]
item['text'] = text
item['url'] = response.url
......@@ -164,18 +186,16 @@ class QuotesSpider(scrapy.Spider):
def parse_item_with_stop_date(self, response):
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
d = newsData['datePublished']
d = d[:d.find("T")]
dt = datetime.strptime(d, '%Y-%m-%d').date()
d = response.xpath('//time/text()').extract_first()
dt = datetime.strptime(d, '%d.%m.%Y').date()
if dt >= self.stopDate:
info = response.meta['item']
item = NoticiasItem()
text = ''
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = newsData['datePublished'][:-1]
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
try:
......@@ -184,7 +204,15 @@ class QuotesSpider(scrapy.Spider):
topic = None
item['topic'] = topic
item['text'] = newsData['articleBody']
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0')+1:]
item['text'] = text
item['url'] = response.url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment