Commit bf20a7d5 authored by Renán Sosa Guillen

crawlers

parent 19792a13
@@ -43,8 +43,6 @@ class QuotesSpider(scrapy.Spider):
         self.stopDate = None
         baseURL = "http://www.latribuna.hn/"
-        # self.baseURI = self.baseURL + "/ajax/get_section_news.html?viewmore=%2Fajax%2Fget_section_news.html&page="
-        # self.sectionURI = "&size=6&section="
         sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
                        "ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
@@ -55,14 +53,13 @@ class QuotesSpider(scrapy.Spider):
         # sectionList = ["noticias"]
         for s in sectionList:
-            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
+            yield scrapy.Request(url=baseURL + s, callback=self.parse)
 
     def parse(self, response):
-        # searchData = ImportantData()
         CONTINUE_SEARCHING = True
 
-        linkList = response.xpath('//section[@class="section-67"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
-        linkList.extend(response.xpath('//section[@class="section-67"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
+        linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
+        linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
 
         if self.stopDate is None:
             for link in linkList:
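Note: the selector change above (section.section-67 replaced by div#main) can be sanity-checked offline before re-running the crawler. A minimal sketch, assuming Scrapy is installed; the HTML fragment is hypothetical and only mirrors the structure the chained XPath/CSS expects:

    from scrapy import Selector

    # Hypothetical listing fragment shaped like the markup the new selectors target.
    SAMPLE_LISTING = """
    <div id="main">
      <article class="linkbox">
        <a itemprop="url" href="/noticias/story-1/">Story 1</a>
      </article>
      <div class="bottom-margin">
        <div class="col-sm-6">
          <h3><a itemprop="url" href="/noticias/story-2/">Story 2</a></h3>
        </div>
      </div>
    </div>
    """

    sel = Selector(text=SAMPLE_LISTING)
    # The same two selector chains this commit switches to.
    linkList = sel.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
    linkList.extend(sel.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
    print(linkList)  # expected: ['/noticias/story-1/', '/noticias/story-2/']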
@@ -87,64 +84,21 @@ class QuotesSpider(scrapy.Spider):
                 yield scrapy.Request(url=nextPage, callback=self.parse)
 
-    # def continue_searching(self, response):
-    #     searchData = response.meta['item']
-    #     CONTINUE_SEARCHING = True
-    #
-    #     linkList = response.xpath('//article/div/h1/a/@href').extract()
-    #
-    #     if len(linkList) > 0:
-    #         if self.stopDate is None:
-    #             for link in linkList:
-    #                 yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
-    #
-    #         else:
-    #             for link in linkList:
-    #                 res = DAT_RE.search(link)
-    #                 if res:
-    #                     dat = res.group(0).replace("-", '')
-    #                     newsDate = date(int(dat[:4]), int(dat[4:6]), int(dat[6:]))
-    #                     if newsDate >= self.stopDate:
-    #                         yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
-    #
-    #                     else:
-    #                         CONTINUE_SEARCHING = False
-    #                         break
-    #
-    #         else:
-    #             CONTINUE_SEARCHING = False
-    #
-    #     if CONTINUE_SEARCHING:
-    #         searchData['page'] += 1
-    #         page = searchData['page']
-    #         section = searchData['section']
-    #         url = self.baseURI + str(page) + self.sectionURI + section
-    #         request = scrapy.Request(url=url, callback=self.continue_searching)
-    #         request.meta['item'] = searchData
-    #         yield request
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
         "The date obtained already includes its format and time zone"
-        newsData = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
-        newsDict = json.loads(newsData)
-        item['date'] = newsDict['datePublished']
-        item['title'] = newsDict['headline']
+        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item['title'] = remove_tags(response.xpath('//header/h1[@itemprop="name"]').extract_first())
 
         try:
-            topic = response.xpath('//div[@class="news-line"]/a/text()').extract()[-1]
+            topic = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]
         except:
             topic = None
         item['topic'] = topic
 
-        author = response.css('div.content-author').xpath('./p/meta[@itemprop="name"]/@content').extract_first()
-        if author is not None:
-            item['author'] = author
-
-        for p in response.css('div.news-body').css('p').extract():
+        for p in response.css('div.article-post-content').css('p').extract():
             text += remove_tags(p) + "\n"
 
         item['text'] = text.strip()
...
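The reworked parse_item can be exercised the same way. Below is a minimal sketch of what the new selectors yield, assuming remove_tags is w3lib.html.remove_tags (a common import in these spiders, not shown in this diff); the article fragment is hypothetical:

    from scrapy import Selector
    from w3lib.html import remove_tags

    # Hypothetical article fragment matching the selectors parse_item now uses.
    SAMPLE_ARTICLE = """
    <html><head>
      <meta property="article:published_time" content="2017-08-01T10:15:00-06:00">
    </head><body>
      <header><h1 itemprop="name">Example headline</h1></header>
      <aside class="tags"><ul><li><a href="#">Honduras</a></li></ul></aside>
      <div class="article-post-content"><p>First paragraph.</p><p>Second paragraph.</p></div>
    </body></html>
    """

    sel = Selector(text=SAMPLE_ARTICLE)
    item = {}
    # The meta tag carries an ISO-8601 timestamp with its UTC offset, matching
    # the "already includes format and time zone" note in parse_item.
    item['date'] = sel.xpath('//meta[@property="article:published_time"]/@content').extract_first()
    item['title'] = remove_tags(sel.xpath('//header/h1[@itemprop="name"]').extract_first())
    item['topic'] = sel.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]

    # Same accumulation loop as the committed parse_item.
    text = ''
    for p in sel.css('div.article-post-content').css('p').extract():
        text += remove_tags(p) + "\n"
    item['text'] = text.strip()

    print(item)  # date, title, topic, and text as plain strings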