Commit cbeb18b8 authored by Renán Sosa Guillen

crawlers

parent 24040d75
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script para el trackeo del medio PROCESO.
"""
import sys
import os
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
s = {"crawler": "descarga_por_mes/proceso"}
media = s['crawler'][s['crawler'].rfind("/")+1:]
os.chdir(baseDir)
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# rango va de 1976 a 2018
for year in xrange(1976, 2019):
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
filename = str(year) + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename={0} -a year={1}".format(filename, str(year))
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
sys_command = "python3 parse_date_files.py {0} {1}".format(s['crawler'], filename)
os.system(sys_command)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
# os.chdir("..")
@@ -5,100 +5,52 @@
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class ProcesoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
@@ -16,7 +16,7 @@ NEWSPIDER_MODULE = 'proceso.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
@@ -33,38 +33,28 @@ DOWNLOAD_DELAY = 1
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'User-Agent': USER_AGENT,
# 'Connection': 'Keep-Alive',
# # 'Accept-Encoding': 'gzip, deflate',
# 'Accept-Language': 'en',
# }
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# SPIDER_MIDDLEWARES = {
# 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
}
# DOWNLOADER_MIDDLEWARES = {
# 'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
@@ -98,8 +88,3 @@ ITEM_PIPELINES = {
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
@@ -5,303 +5,151 @@ MEDIA:
Proceso, CDMX
USAGE:
## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
## Read especs_sitio_proceso.txt file. ##
$ cd proceso/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. The parse_date_files.py script must then be used
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific year-month date. ##
## Get all the news from the most current to the oldest for a given year. The parse_date_files.py script must
then be used so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
$ scrapy crawl noticias --nolog -s filename=2018.json -a year=2018
"""
import scrapy, re, cfscrape
import scrapy, re, requests
from proceso.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy.http.cookies import CookieJar
from scrapy_splash import SplashRequest, SplashFormRequest
from datetime import date, datetime
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
DATE = re.compile(r'\$\(\"\#iny-date\"\)\.html\(\'(.*?)\'\);')
TITLE = re.compile(r'\$\(\"\#iny-title\"\)\.html\(\'(.*?)\'\);')
TOPIC = re.compile(r'\$\(\"\#iny-category\"\)\.html\(\'(.*?)\'\);')
CONTENT = re.compile(r'content_lines \+= \'(.*?)\'')
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for CDMX: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
script = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
"""
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
self.month_parser = {
'enero' : '01', 'febrero' : '02', 'marzo' : '03', 'abril' : '04',
'mayo' : '05', 'junio' : '06', 'julio' : '07', 'agosto' : '08',
'septiembre' : '09', 'octubre' : '10', 'noviembre' : '11', 'diciembre' : '12'
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.stop_date = date(2018, 9, 15)
"""
Iniciar sesion en el navegador, buscar la noticia deseada y en la seccion Network de las herramientas de desarrolladores buscar la archivo
wp-login-6a28500e1f.js.php?page_id=420235, click derecho y copiar como curl.
Parsear el curl command a python requests.
Pegar en este archivo las cookies y los headers, en self.cookies y self.headers respectivamente.
"""
self.logged_in_link = "https://hemeroteca.proceso.com.mx/wp-login-6a28500e1f.js.php"
self.cookies = {
'__auc': '716d2c791661564eaf19d7da29c',
'_io_un': '',
'__gads': 'ID=f530a1448b07ba8d:T=1537957532:S=ALNI_MbwbTDehpjreM-pfG17_0dfanqayQ',
'__cfduid': 'df2897ebe2d430cacad2c4cbe30238d6e1537957552',
'_ga': 'GA1.3.770661327.1537957227',
'_gid': 'GA1.3.56098531.1545061125',
'CloudFront-Key-Pair-Id': 'APKAJIUNCAFIFZIBZZNQ',
'CloudFront-Policy': 'eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cCo6Ly9yZXZpc3RhLnByb2Nlc28uY29tLm14LzIxNzMvKiIsIkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiSXBBZGRyZXNzIjp7IkFXUzpTb3VyY2VJcCI6IjE3Ny4yMzcuMTQ0Ljk5LzMyIn0sIkFXUzpFcG9jaFRpbWUiOjE1NDUyMjk1MTV9fX1dfQ__',
'CloudFront-Signature': 'jsBurtdKb8cjh33rHDJwH94E3b4aphF3sXbAHPIEAZKcY7VYeE32AxLiCZ5WuGJyTr5h44f67FjRB3chUCShT32XpQ2cEVYneft3i7nRujMo3ElVcxR6GJyjMAfb-2NUMi7Fqi~mQEvD2GS4Dap9utqunObljVf6f0nB2Yw1PEBvhT0kk5VoUDXuSHbS70xqW6TdXR6E8Aatao49Oejx-q20Ke7~jY1L4OKCuoW8ValTXc~c6Jjv13Tv9M3z18x11LlG0Nkxl4l4h9n07wYq~nVkGoi~8ieEiFSbl7Uoo9f37E4FWOfknYFiV8tPMvi7ltC036f6Lah7HevJQvVdXg__',
'cf_clearance': 'd76b9e56d6517fdb2e9083d63e69eaedef66bc3f-1545150003-1800-150',
'__asc': 'fab2e041167c2184ed47905fbc7',
'_gat_gtag_UA_24909634_2': '1',
'_gat_Insticator_Embed_v4': '1',
'wordpress_test_cookie': 'WP+Cookie+check',
'hemeroteca': 'carlos_silvaforne%40yahoo.com.mx%7C1545322845%7CE2Tz5fLMVkSMLh2FNEvKvIsiA2xGncuA4sJdgvipCdB%7Cc29cb11d05da5f7be07ae0cbfee865700e6cd3b0c21dad2e40674668e2d6a9db',
}
self.baseURL = "https://hemeroteca.proceso.com.mx/"
login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
self.headers = {
'DNT': '1',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'es-419,es;q=0.9,en;q=0.8',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
'Accept': '*/*',
'Referer': 'http://hemeroteca.proceso.com.mx/?page_id=420235',
'Connection': 'keep-alive',
}
if year is not None and month is not None:
self.stop_date = date(int(year), int(month), 15)
# yield scrapy.Request(url=login_url, callback=self.parse_with_stop_date)
else:
self.stop_date = None
# yield scrapy.Request(url=login_url, callback=self.parse)
token, agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
print token
print "\n"
yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
headers={'User-Agent' : agent})
def parse_login(self, response):
return SplashFormRequest.from_response(
response,
formdata = {
'log' : 'carlos_silvaforne@yahoo.com.mx',
'pwd' : 'carlos_silvaforne@'
},
callback = self.after_login,
dont_click = True
)
def after_login(self, response):
## Check that the login succeeded before going on ##
print response.headers
print "\n"
print response.real_url
print "\n"
print response.request.headers
print "\n"
cookie_list = response.request.headers.getlist('Cookie')
cfc, cfd = cookie_list[0].split(';')
cfc = cfc.strip().split('=')
cfd = cfd.strip().split('=')
cookies = [cfc[1], cfd[1]]
cookies = {cfc[0]: cfc[1], cfd[0]: cfd[1]}
session_legend = response.css('div.topnav > a').extract()[-1]
print response.css('h1.entry-title').extract_first()
print "\n"
if session_legend is not None:
session_legend = remove_tags(session_legend)
if not "Cerrar" in session_legend:
print "Login failed."
else:
print session_legend
print "\n"
token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
print token
print "\n"
if self.stop_date is None:
yield scrapy.Request(url=self.baseURL, callback=self.parse)
else:
self.meta = response.request.meta
yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
meta=self.meta,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
# request = SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
# endpoint='execute',
# cache_args=['lua_source'],
# args={'lua_source': script},
# headers={'User-Agent': USER_AGENT}
# )
# request.meta['splash']['session_id'] = cookie_list[0]
# yield request
# if "authentication failed" in response.body:
# self.logger.error("Login failed.")
# return
# else:
# # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# if self.stop_date is None:
# yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
# else:
# yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
# yield scrapy.Request(
# url=self.baseURL,
# callback=self.parse_with_stop_date,
# cookies=token,
# headers={'User-Agent' : agent}
# )
def parse_with_stop_date(self, response):
print "parse_with_stop_date"
print "\n"
print response.css('h1.entry-title').extract_first()
# print "\n"
# print response.cookiejar
print "\n"
print response.headers
print "\n"
# self.baseURL = "https://hemeroteca.proceso.com.mx/"
self.baseURL = "https://hemeroteca.proceso.com.mx/?page_id=111058&edicion=mexico&page={0}".format(year)
# session_legend = response.css('div.topnav > a').extract()[-1]
# if session_legend is not None :
# print remove_tags(session_legend)
# print "\n"
# else :
# print "No log in."
TO_NEXT_PAGE = True
yield scrapy.Request(url=self.baseURL, callback=self.parse_magazine)
for item in response.css('div.catpor-box > div'):
item_date = item.css('span.catpor-published').extract_first()
if item_date is not None:
item_date = remove_tags(item_date).replace(",", '')
item_date = item_date.split(' ')
item_date[1] = self.month_parser[item_date[1]]
item_date = map(int, item_date)
item_date = date(item_date[2], item_date[1], item_date[0])
if item_date >= self.stop_date:
item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
print item_link
# token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=cookies)
# yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=token,
# headers={'User-Agent' : agent})
yield SplashRequest(url=item_link, callback=self.parse_links,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
else:
TO_NEXT_PAGE = False
break
if TO_NEXT_PAGE:
next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
if next_page is not None:
# yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
def parse_links(self, response):
print "\n\n"
print response.headers
for link in response.css('div.post-container > h2 > a::attr(href)').extract():
# print link
# token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
# headers={'User-Agent' : agent})
yield SplashRequest(url=link, callback=self.parse_item,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
def parse_magazine(self, response):
for link in response.css('div.catalogo-portadas').css('div.catpor-post-thumb > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_news)
# url_1 = "https://hemeroteca.proceso.com.mx/?page_id=419579"
# yield scrapy.Request(url=url_1, callback=self.parse_news)
def parse_item(self, response):
# if response.url == "https://hemeroteca.proceso.com.mx/?page_id=278958&a51dc26366d99bb5fa29cea4747565fec=420203":
# print response.body
item = NoticiasItem()
text = ''
news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
if news_date is not None:
news_date = remove_tags(news_date)
print news_date
d, t = news_date.split(' ')
d = map(int, d.split("-"))
t = map(int, t.split(":"))
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('span.entry-categories').extract_first()
if topic is not None : topic = remove_tags(topic)
for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
text += remove_tags(paragraph) + '\n'
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_news(self, response):
for news_link in response.css('div.hfeed').css('h2.entry-title > a::attr(href)').extract():
# print news_link
# news_link = response.css('div.hfeed').css('h2.entry-title > a::attr(href)').extract_first()
# news_link = "https://hemeroteca.proceso.com.mx/?page_id=278958&a51dc26366d99bb5fa29cea4747565fec=419589"
news_id = news_link[news_link.rfind("=")+1:]
self.headers['Referer'] = news_link
params = (
('page_id', '278958'),
('a51dc26366d99bb5fa29cea4747565fec', news_id),
)
response = requests.get(self.logged_in_link, headers=self.headers,
params=params, cookies=self.cookies)
# print "\n"
# print response.text + "\n\n"
response_text = response.text.replace("var content_lines = '';", '')
response_text = response_text.replace('content_lines += \'<p class=\\"p1\\">\';', '')
# print response_text
news_date = DATE.findall(response_text)
news_date = news_date[0]
news_date = news_date.split(' ')
news_d = datetime.strptime(news_date[0], '%Y-%m-%d').date()
if news_d <= self.stop_date:
item = NoticiasItem()
news_date = "T".join(news_date)
# print news_date
title = TITLE.findall(response_text)
news_title = title[0].strip()
# print news_title
topic = TOPIC.findall(response_text)
news_topic = topic[0].strip()
# print news_topic
text = ''
matches = CONTENT.findall(response_text)
for match in matches:
adition = remove_tags(match)
if adition != '':
text += adition + "\n"
news_text = text.strip()
# print news_text
print "date: {0}".format(news_date)
item['date'] = news_date
item['title'] = news_title
item['topic'] = news_topic
item['text'] = news_text
item['url'] = news_link
yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
import os, tempfile, time, sys, logging, dryscrape
from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
logger = logging.getLogger(__name__)
class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
def __init__(self, settings):
super(ThreatDefenceRedirectMiddleware, self).__init__(settings)
# start xvfb to support headless scraping
if 'linux' in sys.platform:
dryscrape.start_xvfb()
self.dryscrape_session = dryscrape.Session(base_url='https://hemeroteca.proceso.com.mx/')
for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
# seems to be a bug with how webkit-server handles accept-encoding
if key.lower() != 'accept-encoding':
self.dryscrape_session.set_header(key, value)
def _redirect(self, redirected, request, spider, reason):
# act normally if this isn't a threat defense redirect
if not self.is_threat_defense_url(redirected.url):
return super(ThreatDefenceRedirectMiddleware, self)._redirect(redirected, request, spider, reason)
logger.debug('Proceso threat defense triggered for {0}'.format(request.url))
request.cookies = self.bypass_threat_defense(redirected.url)
request.dont_filter = True # prevents the original link being marked a dupe
return request
def is_threat_defense_url(self, url):
return 'proceso.com.mx' in url
def bypass_threat_defense(self, url=None):
# only navigate if any explicit url is provided
if url:
self.dryscrape_session.visit(url)
# solve the captcha if there is one
# captcha_images = self.dryscrape_session.css('img[src *= captcha]')
# if len(captcha_images) > 0:
# return self.solve_captcha(captcha_images[0])
# click on any explicit retry links
# retry_links = self.dryscrape_session.css('a[href *= threat_defence]')
# if len(retry_links) > 0:
# return self.bypass_threat_defense(retry_links[0].get_attr('href'))
# otherwise, we're on a redirect page so wait for the redirect and try again
self.wait_for_redirect()
return self.bypass_threat_defense()
def wait_for_redirect(self, url = None, wait = 0.1, timeout=10):
url = url or self.dryscrape_session.url()
for i in range(int(timeout//wait)):
time.sleep(wait)
if self.dryscrape_session.url() != url:
return self.dryscrape_session.url()
logger.error("Maybe {0} isn't a redirect URL?".format(self.dryscrape_session.url()))
raise Exception('Timed out on the zipru redirect page.')
# def solve_captcha(self, img, width=1280, height=800):
# # take a screenshot of the page
# self.dryscrape_session.set_viewport_size(width, height)
# filename = tempfile.mktemp('.png')
# self.dryscrape_session.render(filename, width, height)
# # inject javascript to find the bounds of the captcha
# js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
# rect = self.dryscrape_session.eval_script(js)
# box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))
# # solve the captcha in the screenshot
# image = Image.open(filename)
# os.unlink(filename)
# captcha_image = image.crop(box)
# captcha = pytesseract.image_to_string(captcha_image)
# logger.debug(f'Solved the Zipru captcha: "{captcha}"')
# # submit the captcha
# input = self.dryscrape_session.xpath('//input[@id = "solve_string"]')[0]
# input.set(captcha)
# button = self.dryscrape_session.xpath('//button[@id = "button_submit"]')[0]
# url = self.dryscrape_session.url()
# button.click()
# # try again if we redirect to a threat defense URL
# if self.is_threat_defense_url(self.wait_for_redirect(url)):
# return self.bypass_threat_defense()
# # otherwise return the cookies as a dict
# cookies = {}
# for cookie_string in self.dryscrape_session.cookies():
# if 'domain=zipru.to' in cookie_string:
# key, value = cookie_string.split(';')[0].split('=')
# cookies[key] = value
# return cookies
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
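# e.g. "scrapy crawl noticias --nolog -s filename=noticias.json" makes settings.get('filename')
# return "noticias.json", which open_spider() then opens for writing.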
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for proceso project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'proceso'
SPIDER_MODULES = ['proceso.spiders']
NEWSPIDER_MODULE = 'proceso.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'User-Agent': USER_AGENT,
# 'Connection': 'Keep-Alive',
# # 'Accept-Encoding': 'gzip, deflate',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
# 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
# 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'proceso.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Proceso, CDMX
USAGE:
## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
## Read especs_sitio_proceso.txt file. ##
$ cd proceso/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. The parse_date_files.py script must then be used
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific year-month date. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
"""
import scrapy, re, cfscrape
from proceso.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy.http.cookies import CookieJar
from scrapy_splash import SplashRequest, SplashFormRequest
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for CDMX: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
script = """
function main(splash)
splash:init_cookies(splash.args.cookies)
assert(splash:go{
splash.args.url,
headers=splash.args.headers,
http_method=splash.args.http_method,
body=splash.args.body,
})
assert(splash:wait(0.5))
local entries = splash:history()
local last_response = entries[#entries].response
return {
url = splash:url(),
headers = last_response.headers,
http_status = last_response.status,
cookies = splash:get_cookies(),
html = splash:html(),
}
end
"""
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
self.month_parser = {
'enero' : '01', 'febrero' : '02', 'marzo' : '03', 'abril' : '04',
'mayo' : '05', 'junio' : '06', 'julio' : '07', 'agosto' : '08',
'septiembre' : '09', 'octubre' : '10', 'noviembre' : '11', 'diciembre' : '12'
}
self.baseURL = "https://hemeroteca.proceso.com.mx/"
login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
if year is not None and month is not None:
self.stop_date = date(int(year), int(month), 15)
# yield scrapy.Request(url=login_url, callback=self.parse_with_stop_date)
else:
self.stop_date = None
# yield scrapy.Request(url=login_url, callback=self.parse)
token, agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
print token
print "\n"
yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
headers={'User-Agent' : agent})
def parse_login(self, response):
return SplashFormRequest.from_response(
response,
formdata = {
'log' : 'carlos_silvaforne@yahoo.com.mx',
'pwd' : 'carlos_silvaforne@'
},
callback = self.after_login,
dont_click = True
)
def after_login(self, response):
## Check that the login succeeded before going on ##
print response.headers
print "\n"
print response.real_url
print "\n"
print response.request.headers
print "\n"
cookie_list = response.request.headers.getlist('Cookie')
cfc, cfd = cookie_list[0].split(';')
cfc = cfc.strip().split('=')
cfd = cfd.strip().split('=')
cookies = [cfc[1], cfd[1]]
cookies = {cfc[0]: cfc[1], cfd[0]: cfd[1]}
session_legend = response.css('div.topnav > a').extract()[-1]
print response.css('h1.entry-title').extract_first()
print "\n"
if session_legend is not None:
session_legend = remove_tags(session_legend)
if not "Cerrar" in session_legend:
print "Login failed."
else:
print session_legend
print "\n"
token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
print token
print "\n"
if self.stop_date is None:
yield scrapy.Request(url=self.baseURL, callback=self.parse)
else:
self.meta = response.request.meta
yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
meta=self.meta,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
# request = SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
# endpoint='execute',
# cache_args=['lua_source'],
# args={'lua_source': script},
# headers={'User-Agent': USER_AGENT}
# )
# request.meta['splash']['session_id'] = cookie_list[0]
# yield request
# if "authentication failed" in response.body:
# self.logger.error("Login failed.")
# return
# else:
# # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# if self.stop_date is None:
# yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
# else:
# yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
# yield scrapy.Request(
# url=self.baseURL,
# callback=self.parse_with_stop_date,
# cookies=token,
# headers={'User-Agent' : agent}
# )
def parse_with_stop_date(self, response):
print "parse_with_stop_date"
print "\n"
print response.css('h1.entry-title').extract_first()
# print "\n"
# print response.cookiejar
print "\n"
print response.headers
print "\n"
# session_legend = response.css('div.topnav > a').extract()[-1]
# if session_legend is not None :
# print remove_tags(session_legend)
# print "\n"
# else :
# print "No log in."
TO_NEXT_PAGE = True
for item in response.css('div.catpor-box > div'):
item_date = item.css('span.catpor-published').extract_first()
if item_date is not None:
item_date = remove_tags(item_date).replace(",", '')
item_date = item_date.split(' ')
item_date[1] = self.month_parser[item_date[1]]
item_date = map(int, item_date)
item_date = date(item_date[2], item_date[1], item_date[0])
if item_date >= self.stop_date:
item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
print item_link
# token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=cookies)
# yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=token,
# headers={'User-Agent' : agent})
yield SplashRequest(url=item_link, callback=self.parse_links,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
else:
TO_NEXT_PAGE = False
break
if TO_NEXT_PAGE:
next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
if next_page is not None:
# yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
def parse_links(self, response):
print "\n\n"
print response.headers
for link in response.css('div.post-container > h2 > a::attr(href)').extract():
# print link
# token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
# yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
# headers={'User-Agent' : agent})
yield SplashRequest(url=link, callback=self.parse_item,
endpoint='execute',
cache_args=['lua_source'],
args={'lua_source': script},
headers={'User-Agent': USER_AGENT}
)
def parse_item(self, response):
# if response.url == "https://hemeroteca.proceso.com.mx/?page_id=278958&a51dc26366d99bb5fa29cea4747565fec=420203":
# print response.body
item = NoticiasItem()
text = ''
news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
if news_date is not None:
news_date = remove_tags(news_date)
print news_date
d, t = news_date.split(' ')
d = map(int, d.split("-"))
t = map(int, t.split(":"))
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('span.entry-categories').extract_first()
if topic is not None : topic = remove_tags(topic)
for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
text += remove_tags(paragraph) + '\n'
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = proceso.settings
[deploy]
#url = http://localhost:6800/
project = proceso
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Takes as input a json file containing news items with different dates.
Writes the news into folders separated by year.
Usage:
python parse_date_files.py <crawler_path> <filename>
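For example (illustrative year), the monthly download script invokes it as:
python3 parse_date_files.py descarga_por_mes/proceso 2018.json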
......
# -*- coding: utf-8 -*-
import json, os, sys
from datetime import datetime
from collections import OrderedDict
"""
Date parsing for the news downloaded by the 'descarga_hacia_atras' type crawlers.
Usage:
python parse_date_files.py <crawler_name>
......