# -*- coding: utf-8 -*-

"""
    ===============================================================================

    THIS VERSION OF THE La Tribuna Honduras SPIDER IS NOW DEPRECATED, SINCE THE
    SITE'S WEB PAGE CHANGED ITS ACCESS TO ACCESS BY DAY.
    THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia/foraneos FOLDER.

    ===============================================================================
"""

import scrapy, re
from datetime import date
from tribunaHn.items import NoticiasItem

"""
MEDIO:
La Tribuna, Honduras
USO:
// Si se quiere obtener todas las noticias desde las más actuales hasta las más antiguas. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// Si se quiere obtener todas las noticias desde las más actuales hasta una fecha específica. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=29
-------------------------------------------------------------------------------------------------
Después será necesario hacer uso del archivo parse_date_files.py para que las noticias contenidas
en noticias.json sean separadas en archivos por fecha.
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Strip anything that looks like an HTML tag from the given string."""
    return TAG_RE.sub('', text)

DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
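# Matches the yyyy/mm/dd fragment that La Tribuna embeds in its article URLs
# (e.g. http://www.latribuna.hn/2018/02/28/<slug>/ -- assumed URL shape);
# parse() uses it to read each article's date straight from the link.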


class ImportantData(scrapy.Item):
    section = scrapy.Field()
    page = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
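        # The optional stop date arrives as command-line spider arguments
        # ("-a year=... -a month=... -a day=..."); getattr() falls back to None
        # for any argument that was not supplied.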
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        if self.year is not None and self.month is not None and self.day is not None:
            self.stopDate = date(int(self.year), int(self.month), int(self.day))
        else:
            self.stopDate = None

        baseURL = "http://www.latribuna.hn/"
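        # Each entry below is a section index at "http://www.latribuna.hn/<section>";
        # one request is issued per section and parse() walks its pagination.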

        sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
                       "ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
                       "pecadillos-idiomaticos", "pildoritas", "columnistas", "editorial", "tribuna-del-pueblo",
                       "anales-historicos", "cine", "dejando-huellas", "dia-7", "dominicales", "done-un-aula",
                       "especiales-lt", "la-cobra-pregunta", "la-tribuna-agropecuaria", "la-tribuna-cultural",
                       "nuestro-orgullo", "turismo"]
        # sectionList = ["noticias"]

        for s in sectionList:
            yield scrapy.Request(url=baseURL + s, callback=self.parse)


    def parse(self, response):
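        # Section listings are assumed to be ordered newest-first, so once a link
        # older than stopDate shows up, pagination for this section can stop.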
        CONTINUE_SEARCHING = True
        linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
        linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())

        if self.stopDate is None:
            for link in linkList:
                yield scrapy.Request(url=link, callback=self.parse_item)

        else:
            for link in linkList:
                res = DAT_RE.search(link)
                if res:
                    # list() is needed because map() returns an iterator in Python 3
                    dat = list(map(int, res.group(0).split("/")))
                    newsDate = date(dat[0], dat[1], dat[2])
                    if newsDate >= self.stopDate:
                        yield scrapy.Request(url=link, callback=self.parse_item)

                    else:
                        CONTINUE_SEARCHING = False
                        break

        if CONTINUE_SEARCHING:
            nextPage = response.xpath('//span[@class="next"]/a/@href').extract_first()
            if nextPage is not None:
                yield scrapy.Request(url=nextPage, callback=self.parse)


    def parse_item(self, response):
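        # Build a NoticiasItem from a single article page.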
        item = NoticiasItem()
        text = ''

        "La fecha obtenida ya incluye formato y zona horaria"
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = remove_tags(response.xpath('//header/h1[@itemprop="name"]').extract_first())

        # extract_first() yields None when the article has no tags
        item['topic'] = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract_first()

        for p in response.css('div.article-post-content').css('p').extract():
            text += remove_tags(p) + "\n"

        item['text'] = text.strip()

        item['url'] = response.url

        yield item