Commit f16d1798 authored by Renán Sosa Guillen

crawlers

parent eaa37668
......@@ -9,9 +9,11 @@ USAGE:
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. Use the parse_date_files.py script
to split the news contained in noticias.json into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
......@@ -160,15 +162,20 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
news_date = datetime.strptime(news_date.isoformat(), '%Y-%m-%d').isoformat("T")
title = response.css('h1.colorRojo').extract_first()
if title is not None : title = remove_tags(title)
topic = response.xpath('//span[@class="badge"]').extract_first()
if topic is not None : remove_tags(topic)
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.cuerpo_noticia').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = datetime.strptime(news_date.isoformat(), '%Y-%m-%d').isoformat("T")
item['title'] = remove_tags(response.css('h1.colorRojo').extract_first())
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
......
......@@ -3,12 +3,18 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ProcesoItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,52 +5,100 @@
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import os, tempfile, time, sys, logging, dryscrape
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
logger = logging.getLogger(__name__)
class ProcesoSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
def __init__(self, settings):
super(ThreatDefenceRedirectMiddleware, self).__init__(settings)
def process_spider_input(response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# start xvfb to support headless scraping
if 'linux' in sys.platform:
dryscrape.start_xvfb()
# Should return None or raise an exception.
return None
self.dryscrape_session = dryscrape.Session(base_url='https://hemeroteca.proceso.com.mx/')
for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
# seems to be a bug with how webkit-server handles accept-encoding
if key.lower() != 'accept-encoding':
self.dryscrape_session.set_header(key, value)
def process_spider_output(response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
def _redirect(self, redirected, request, spider, reason):
# act normally if this isn't a threat defense redirect
if not self.is_threat_defense_url(redirected.url):
return super(ThreatDefenceRedirectMiddleware, self)._redirect(redirected, request, spider, reason)
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
logger.debug('Proceso threat defense triggered for {0}'.format(request.url))
request.cookies = self.bypass_threat_defense(redirected.url)
request.dont_filter = True # prevents the original link being marked a dupe
return request
def process_spider_exception(response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
def is_threat_defense_url(self, url):
return 'proceso.com.mx' in url
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def bypass_threat_defense(self, url=None):
# only navigate if any explicit url is provided
if url:
self.dryscrape_session.visit(url)
def process_start_requests(start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# solve the captcha if there is one
# captcha_images = self.dryscrape_session.css('img[src *= captcha]')
# if len(captcha_images) > 0:
# return self.solve_captcha(captcha_images[0])
# Must return only requests (not items).
for r in start_requests:
yield r
# click on any explicit retry links
# retry_links = self.dryscrape_session.css('a[href *= threat_defence]')
# if len(retry_links) > 0:
# return self.bypass_threat_defense(retry_links[0].get_attr('href'))
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# otherwise, we're on a redirect page so wait for the redirect and try again
self.wait_for_redirect()
return self.bypass_threat_defense()
def wait_for_redirect(self, url = None, wait = 0.1, timeout=10):
url = url or self.dryscrape_session.url()
for i in range(int(timeout//wait)):
time.sleep(wait)
if self.dryscrape_session.url() != url:
return self.dryscrape_session.url()
logger.error("Maybe {0} isn't a redirect URL?".format(self.dryscrape_session.url()))
raise Exception('Timed out on the zipru redirect page.')
# def solve_captcha(self, img, width=1280, height=800):
# # take a screenshot of the page
# self.dryscrape_session.set_viewport_size(width, height)
# filename = tempfile.mktemp('.png')
# self.dryscrape_session.render(filename, width, height)
# # inject javascript to find the bounds of the captcha
# js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
# rect = self.dryscrape_session.eval_script(js)
# box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))
# # solve the captcha in the screenshot
# image = Image.open(filename)
# os.unlink(filename)
# captcha_image = image.crop(box)
# captcha = pytesseract.image_to_string(captcha_image)
# logger.debug(f'Solved the Zipru captcha: "{captcha}"')
# # submit the captcha
# input = self.dryscrape_session.xpath('//input[@id = "solve_string"]')[0]
# input.set(captcha)
# button = self.dryscrape_session.xpath('//button[@id = "button_submit"]')[0]
# url = self.dryscrape_session.url()
# button.click()
# # try again if it we redirect to a threat defense URL
# if self.is_threat_defense_url(self.wait_for_redirect(url)):
# return self.bypass_threat_defense()
# # otherwise return the cookies as a dict
# cookies = {}
# for cookie_string in self.dryscrape_session.cookies():
# if 'domain=zipru.to' in cookie_string:
# key, value = cookie_string.split(';')[0].split('=')
# cookies[key] = value
# return cookies
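If the commented-out solve_captcha() helper above were re-enabled, it would also need two imports that this file currently lacks (a sketch, assuming Pillow and pytesseract are installed):
# from PIL import Image        # Image.open() / crop() on the rendered screenshot
# import pytesseract           # image_to_string() to OCR the cropped captcha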
......@@ -3,9 +3,73 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class ProcesoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
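As a quick check of what the pipeline writes, a minimal sketch (assuming the crawl was started with -s filename=noticias.json as in the USAGE notes): the output is a single valid JSON array, so it can be read back directly:
import json

with open('noticias.json') as f:      # file produced by -s filename=noticias.json
    news = json.load(f)               # the pipeline writes one JSON array

print("%d items loaded; first date: %s" % (len(news), news[0]['date'] if news else "n/a"))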
......@@ -16,10 +16,10 @@ NEWSPIDER_MODULE = 'proceso.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,22 +27,27 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'User-Agent': USER_AGENT,
# 'Connection': 'Keep-Alive',
# # 'Accept-Encoding': 'gzip, deflate',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
......@@ -54,10 +59,11 @@ SPIDER_MIDDLEWARES = {
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
}
# Enable or disable extensions
......@@ -68,9 +74,9 @@ DOWNLOADER_MIDDLEWARES = {
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'proceso.pipelines.ProcesoPipeline': 300,
#}
ITEM_PIPELINES = {
'proceso.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
# Scrapy settings for proceso project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'proceso'
SPIDER_MODULES = ['proceso.spiders']
NEWSPIDER_MODULE = 'proceso.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
# USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'proceso.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# -*- coding: utf-8 -*-
"""
MEDIA:
Proceso, CDMX
USAGE:
## This crawler uses 'scrapy-splash' because the site's content is loaded via JavaScript. ##
## Read the especs_sitio_proceso.txt file. ##
$ cd proceso/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. Use the parse_date_files.py script
to split the news contained in noticias.json into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current back to a specific year and month. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
"""
import scrapy, re, time, cfscrape
from proceso.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy_splash import SplashRequest, SplashFormRequest
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for CDMX: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
# name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
self.month_parser = {
'enero' : '01', 'febrero' : '02', 'marzo' : '03', 'abril' : '04',
'mayo' : '05', 'junio' : '06', 'julio' : '07', 'agosto' : '08',
'septiembre' : '09', 'octubre' : '10', 'noviembre' : '11', 'diciembre' : '12'
}
self.baseURL = "https://hemeroteca.proceso.com.mx/"
login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
if year is not None and month is not None:
self.stop_date = date(int(year), int(month), 22)
# yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date)
else:
self.stop_date = None
# yield scrapy.Request(url=self.baseURL, callback=self.parse)
# yield scrapy.Request(url=login_url, callback=self.parse_login)
yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html', args={ 'wait': 0.5 })
def parse_login(self, response):
print "parse_login"
# return scrapy.FormRequest.from_response(
# response,
# formdata = {
# 'log' : 'carlos_silvaforne@yahoo.com.mx',
# 'pwd' : 'carlos_silvaforne@'
# },
# callback = self.after_login
# )
return SplashFormRequest.from_response(
response,
formdata = {
'log' : 'carlos_silvaforne@yahoo.com.mx',
'pwd' : 'carlos_silvaforne@'
# 'log' : 'myusr',
# 'pwd' : 'mypwd'
},
callback = self.after_login,
# callback = self.parse_with_stop_date,
dont_click = True
)
def after_login(self, response):
## Check login succeed before going on ##
print "after_login"
# print response.body
if "authentication failed" in response.body:
self.logger.error("Login failed.")
return
else:
print "passed"
token, agent = cfscrape.get_tokens(self.baseURL)  # cookies and matching User-Agent used by the request below
if self.stop_date is None:
pass
# yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
else:
# yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
# yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date)
yield scrapy.Request(
url=self.baseURL,
callback=self.parse_with_stop_date,
cookies=token,
headers={'User-Agent' : agent}
)
def parse_with_stop_date(self, response):
TO_NEXT_PAGE = True
for item in response.css('div.catpor-box > div'):
item_date = item.css('span.catpor-published').extract_first()
if item_date is not None:
item_date = remove_tags(item_date).replace(",", '')
item_date = item_date.split(' ')
item_date[1] = self.month_parser[item_date[1]]
item_date = map(int, item_date)
item_date = date(item_date[2], item_date[1], item_date[0])
if item_date >= self.stop_date:
item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
yield scrapy.Request(url=item_link, callback=self.parse_links)
else:
TO_NEXT_PAGE = False
break
if TO_NEXT_PAGE:
next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
def parse_links(self, response):
for link in response.css('div.post-container > h2 > a::attr(href)').extract():
# print link
yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 })
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
if news_date is not None:
news_date = remove_tags(news_date)
print news_date
d, t = news_date.split(' ')
d = map(int, d.split("-"))
t = map(int, t.split(":"))
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('span.entry-categories').extract_first()
if topic is not None : topic = remove_tags(topic)
for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
text += remove_tags(paragraph) + '\n'
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item