merge with dev

093e0e82 · Renán Sosa Guillen · fc91c136 · 15cbb498 · 093e0e82 · 093e0e82
Commit 093e0e82 authored Oct 17, 2018 by Renán Sosa Guillen
105 changed files
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/__init__.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/__init__.py
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/items.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """One scraped news article; each field below becomes a key of the exported record."""
+    # The spider in this project fills title, text, date, topic and url;
+    # location and author are declared but not assigned by it.
+    # Headline of the article
+    title = scrapy.Field()
+    # Body of the article
+    text = scrapy.Field()
+    # Publication date of the article
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    # Newspaper section the article was listed under
+    topic = scrapy.Field()
+    # Address the article was downloaded from
+    url = scrapy.Field()
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/middlewares.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class DiariodechiapasSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook hands its input on unchanged."""
+    # Scrapy acts as if a hook that is not defined does not modify the passed
+    # objects, so none of the methods below is mandatory.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Scrapy calls this to build the middleware; the new instance is
+        # subscribed to the spider_opened signal before being returned.
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        # Runs for each response on its way into the spider.
+        # Returning None lets the response through; raising would reject it.
+        return
+    def process_spider_output(self, response, result, spider):
+        # Runs on what the spider returned for a response.
+        # Every Request, dict or Item is forwarded as-is, in order.
+        for produced in result:
+            yield produced
+    def process_spider_exception(self, response, exception, spider):
+        # Runs when the spider or a process_spider_input() of another
+        # middleware raises. None means "not handled here"; an iterable of
+        # results would be the alternative.
+        return None
+    def process_start_requests(self, start_requests, spider):
+        # Same idea as process_spider_output(), but for the spider's start
+        # requests, which have no response attached. Only requests may be
+        # yielded (never items).
+        for start_request in start_requests:
+            yield start_request
+    def spider_opened(self, spider):
+        # Signal handler: record which spider has just been opened.
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class DiariodechiapasDownloaderMiddleware(object):
+    """Pass-through downloader middleware: requests, responses and errors are left untouched."""
+    # Scrapy acts as if a hook that is not defined does not modify the passed
+    # objects, so none of the methods below is mandatory.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Scrapy calls this to build the middleware; the new instance is
+        # subscribed to the spider_opened signal before being returned.
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        # Runs for each request that goes through the downloader middleware.
+        # Allowed outcomes:
+        # - None: keep processing this request (what happens here)
+        # - a Response object
+        # - a Request object
+        # - raising IgnoreRequest, which triggers the process_exception()
+        #   methods of the installed downloader middleware
+        return
+    def process_response(self, request, response, spider):
+        # Runs on the response returned by the downloader.
+        # Allowed outcomes:
+        # - a Response object (here: the very same one that came in)
+        # - a Request object
+        # - raising IgnoreRequest
+        unchanged = response
+        return unchanged
+    def process_exception(self, request, exception, spider):
+        # Runs when a download handler or a process_request() of another
+        # downloader middleware raises.
+        # Allowed outcomes:
+        # - None: keep processing this exception (what happens here)
+        # - a Response object: stops the process_exception() chain
+        # - a Request object: stops the process_exception() chain
+        return None
+    def spider_opened(self, spider):
+        # Signal handler: record which spider has just been opened.
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/pipelines.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """
+    Item pipeline that stores every scraped item in a single JSON array file.
+    The output path comes from the "filename" setting (e.g. -s filename=noticias.json).
+    Items are written one per line, separated by ",\n", between "[" and "]".
+    """
+    # Keys are written in this order; fields an item does not carry are left out.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline with the value of the "filename" setting (None when it is not set)."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Create (or truncate) the output file and open the JSON array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Close the JSON array and release the file handle."""
+        try:
+            self.file.write("]")
+        finally:
+            # Close the handle even when the final write fails.
+            self.file.close()
+    def process_item(self, item, spider):
+        """
+        Append ``item`` to the output file and return it unchanged.
+        Raises whatever json.dumps raises for a value it cannot serialize; in
+        that case nothing is written and the file stays well-formed.
+        """
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not populated for this item: leave it out of the record.
+                continue
+        # Serialize before touching the counter or the file, so a failing item
+        # cannot leave a dangling separator behind.
+        line = json.dumps(OrderedDict(row))
+        self.counter += 1
+        if self.counter > 1:
+            self.file.write(",\n")
+        self.file.write(line)
+        return item
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/settings.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for diarioDeChiapas project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# Project name, and the package that holds its spiders
+BOT_NAME = 'diarioDeChiapas'
+SPIDER_MODULES = ['diarioDeChiapas.spiders']
+NEWSPIDER_MODULE = 'diarioDeChiapas.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'diarioDeChiapas (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# NOTE(review): the line below is commented out, so this file does not set
+# ROBOTSTXT_OBEY at all -- confirm that is intended.
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# Active override: 0.5 instead of the default 0 noted above
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'diarioDeChiapas.middlewares.DiariodechiapasSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'diarioDeChiapas.middlewares.DiariodechiapasDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# JsonWriterPipeline is the only pipeline enabled for this project
+ITEM_PIPELINES = {
+   'diarioDeChiapas.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/__init__.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/noticias.py
+++ b/descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Diario de Chiapas, Chiapas
+USAGE
+    $ cd diarioDeChiapas
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to the oldest. Afterwards, run parse_date_files.py
+    to split the news contained in noticias.json into files by date. ##
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
+"""
+import scrapy, re, json
+from datetime import datetime, date
+from diarioDeChiapas.items import NoticiasItem
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` markup tag deleted; the text between tags is kept."""
+    without_markup = re.sub(TAG_RE, '', text)
+    return without_markup
+class ImportantData(scrapy.Item):
+    """
+    Bookkeeping carried in request.meta['item'] between the spider's callbacks.
+    It steers the crawl and is never exported as a news item.
+    """
+    # True when a listing page is re-requested only to follow its "next page" link
+    to_next_page = scrapy.Field()
+    # True for the article whose URL equals the last link of its listing page
+    is_last_link = scrapy.Field()
+    # Tag-stripped heading of the listing page; becomes the article's topic
+    news_section = scrapy.Field()
+    # URL of the listing page the article was linked from
+    return_url   = scrapy.Field()
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider for the section landing pages of Diario de Chiapas.
+    Without arguments it follows the pagination of every section and emits every
+    article it finds. With -a year=Y -a month=M -a day=D it emits only articles
+    published on or after that date, and it asks for the next listing page only
+    after the last article of the current page has passed the date check.
+    """
+    name = "noticias"
+    def start_requests(self):
+        """Request the landing page of every section, in the mode selected by the arguments."""
+        year  = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        day   = getattr(self, "day", None)
+        if year is not None and month is not None and day is not None:
+            self.stopDate = date(int(year), int(month), int(day))
+        else:
+            self.stopDate = None
+        baseURL = "http://www.diariodechiapas.com/landing/"
+        section_list = ["editorial", "portada", "metropoli", "region", "la-roja",
+                       "deportes", "boga", "ae", "trascendio"]
+        for s in section_list:
+            if self.stopDate is None:
+                yield scrapy.Request(url=baseURL + s, callback=self.parse)
+            else:
+                flow_info = ImportantData()
+                flow_info['to_next_page'] = False
+                request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
+                request.meta['item'] = flow_info
+                yield request
+    def _read_listing(self, response):
+        """Return (section name or None, list of article URLs) for a listing page."""
+        link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()
+        section   = response.xpath('//section[@class="wrapper"]/h1').extract_first()
+        if section is not None:
+            section = remove_tags(section)
+        return section, link_list
+    def _next_page(self, response):
+        """Return the URL behind the pager's "next" link, or None on the last page."""
+        return response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
+    def _build_item(self, response, section):
+        """Build the NoticiasItem for an article page listed under ``section``."""
+        title = response.xpath('//section[@class="single__content"]/h1').extract_first()
+        if title is not None:
+            title = remove_tags(title)
+        paragraphs = response.xpath('//section[@class="single__content"]').css('p').extract()
+        text = "\n".join(remove_tags(p) for p in paragraphs)
+        item = NoticiasItem()
+        ## News item info ##
+        item['date']  = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item['topic'] = section
+        item['title'] = title
+        item['text']  = text.strip()
+        item['url']   = response.url
+        return item
+    def parse(self, response):
+        """Full crawl: request every article of a listing page, then the next listing page."""
+        section, link_list = self._read_listing(response)
+        for link in link_list:
+            flow_info = ImportantData()
+            flow_info['news_section'] = section
+            request = scrapy.Request(url=link, callback=self.parse_item)
+            request.meta['item'] = flow_info
+            yield request
+        next_page = self._next_page(response)
+        if next_page is not None:
+            yield scrapy.Request(url=next_page, callback=self.parse)
+    def parse_with_stop_date(self, response):
+        """
+        Stop-date crawl of a listing page.
+        First visit (to_next_page is False): request every article on the page.
+        Second visit (to_next_page is True): only follow the "next page" link.
+        """
+        flow_info = response.meta['item']
+        if flow_info['to_next_page']:
+            next_page = self._next_page(response)
+            if next_page is not None:
+                flow_info['to_next_page'] = False
+                request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
+                request.meta['item'] = flow_info
+                yield request
+            return
+        section, link_list = self._read_listing(response)
+        for link in link_list:
+            flow_info = ImportantData()
+            flow_info['news_section'] = section
+            flow_info['return_url'] = response.url
+            # Compared by URL on purpose: if the last link also appears earlier
+            # on the page, the earlier request carries the flag too.
+            flow_info['is_last_link'] = (link == link_list[-1])
+            request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+            request.meta['item'] = flow_info
+            yield request
+    def parse_item(self, response):
+        """Full crawl: emit the article found on ``response``."""
+        flow_info = response.meta['item']
+        yield self._build_item(response, flow_info['news_section'])
+    def parse_item_with_stop_date(self, response):
+        """
+        Stop-date crawl: emit the article only when it was published on or after
+        the stop date; for the last article of a page, also re-request the listing
+        page so that its "next page" link gets followed.
+        """
+        published = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        if published is None:
+            # Without a date the article cannot be compared against the stop date.
+            self.logger.warning("No publication date found in %s; article skipped", response.url)
+            return
+        # Keep only the part before 'T'; partition() leaves a date-only value
+        # intact instead of chopping off its last character.
+        news_date = datetime.strptime(published.partition('T')[0], '%Y-%m-%d').date()
+        if news_date < self.stopDate:
+            return
+        flow_info = response.meta['item']
+        yield self._build_item(response, flow_info['news_section'])
+        if flow_info['is_last_link']:
+            flow_info['to_next_page'] = True
+            # dont_filter: this URL was already downloaded once as a listing page.
+            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
+            request.meta['item'] = flow_info
+            yield request
--- a/descarga_hacia_atras/diarioDeChiapas/scrapy.cfg
+++ b/descarga_hacia_atras/diarioDeChiapas/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = diarioDeChiapas.settings
+[deploy]
+#url = http://localhost:6800/
+project = diarioDeChiapas
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/__init__.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/__init__.py
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/items.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """One scraped news article; each field below becomes a key of the exported record."""
+    # Headline of the article
+    title = scrapy.Field()
+    # Body of the article
+    text = scrapy.Field()
+    # Publication date of the article
+    date = scrapy.Field()
+    # NOTE(review): location and author look unused by this project's spider -- confirm
+    location = scrapy.Field()
+    author = scrapy.Field()
+    # Newspaper section / category of the article
+    topic = scrapy.Field()
+    # Address the article was downloaded from
+    url = scrapy.Field()
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/middlewares.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class DiarioindependienteSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook hands its input on unchanged."""
+    # Scrapy acts as if a hook that is not defined does not modify the passed
+    # objects, so none of the methods below is mandatory.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Scrapy calls this to build the middleware; the new instance is
+        # subscribed to the spider_opened signal before being returned.
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        # Runs for each response on its way into the spider.
+        # Returning None lets the response through; raising would reject it.
+        return
+    def process_spider_output(self, response, result, spider):
+        # Runs on what the spider returned for a response.
+        # Every Request, dict or Item is forwarded as-is, in order.
+        for produced in result:
+            yield produced
+    def process_spider_exception(self, response, exception, spider):
+        # Runs when the spider or a process_spider_input() of another
+        # middleware raises. None means "not handled here"; an iterable of
+        # results would be the alternative.
+        return None
+    def process_start_requests(self, start_requests, spider):
+        # Same idea as process_spider_output(), but for the spider's start
+        # requests, which have no response attached. Only requests may be
+        # yielded (never items).
+        for start_request in start_requests:
+            yield start_request
+    def spider_opened(self, spider):
+        # Signal handler: record which spider has just been opened.
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class DiarioindependienteDownloaderMiddleware(object):
+    """Pass-through downloader middleware: requests, responses and errors are left untouched."""
+    # Scrapy acts as if a hook that is not defined does not modify the passed
+    # objects, so none of the methods below is mandatory.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Scrapy calls this to build the middleware; the new instance is
+        # subscribed to the spider_opened signal before being returned.
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        # Runs for each request that goes through the downloader middleware.
+        # Allowed outcomes:
+        # - None: keep processing this request (what happens here)
+        # - a Response object
+        # - a Request object
+        # - raising IgnoreRequest, which triggers the process_exception()
+        #   methods of the installed downloader middleware
+        return
+    def process_response(self, request, response, spider):
+        # Runs on the response returned by the downloader.
+        # Allowed outcomes:
+        # - a Response object (here: the very same one that came in)
+        # - a Request object
+        # - raising IgnoreRequest
+        unchanged = response
+        return unchanged
+    def process_exception(self, request, exception, spider):
+        # Runs when a download handler or a process_request() of another
+        # downloader middleware raises.
+        # Allowed outcomes:
+        # - None: keep processing this exception (what happens here)
+        # - a Response object: stops the process_exception() chain
+        # - a Request object: stops the process_exception() chain
+        return None
+    def spider_opened(self, spider):
+        # Signal handler: record which spider has just been opened.
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/pipelines.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """
+    Item pipeline that stores every scraped item in a single JSON array file.
+    The output path comes from the "filename" setting (e.g. -s filename=noticias.json).
+    Items are written one per line, separated by ",\n", between "[" and "]".
+    """
+    # Keys are written in this order; fields an item does not carry are left out.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline with the value of the "filename" setting (None when it is not set)."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Create (or truncate) the output file and open the JSON array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Close the JSON array and release the file handle."""
+        try:
+            self.file.write("]")
+        finally:
+            # Close the handle even when the final write fails.
+            self.file.close()
+    def process_item(self, item, spider):
+        """
+        Append ``item`` to the output file and return it unchanged.
+        Raises whatever json.dumps raises for a value it cannot serialize; in
+        that case nothing is written and the file stays well-formed.
+        """
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not populated for this item: leave it out of the record.
+                continue
+        # Serialize before touching the counter or the file, so a failing item
+        # cannot leave a dangling separator behind.
+        line = json.dumps(OrderedDict(row))
+        self.counter += 1
+        if self.counter > 1:
+            self.file.write(",\n")
+        self.file.write(line)
+        return item
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/settings.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for diarioIndependiente project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# Project name, and the package that holds its spiders
+BOT_NAME = 'diarioIndependiente'
+SPIDER_MODULES = ['diarioIndependiente.spiders']
+NEWSPIDER_MODULE = 'diarioIndependiente.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'diarioIndependiente (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# NOTE(review): the line below is commented out, so this file does not set
+# ROBOTSTXT_OBEY at all -- confirm that is intended.
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# Active override: 0.5 instead of the default 0 noted above
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'diarioIndependiente.middlewares.DiarioindependienteSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'diarioIndependiente.middlewares.DiarioindependienteDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# JsonWriterPipeline is the only pipeline enabled for this project
+ITEM_PIPELINES = {
+   'diarioIndependiente.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/spiders/__init__.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/spiders/noticias.py
+++ b/descarga_hacia_atras/diarioIndependiente/diarioIndependiente/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Diario El Independiente, Baja California Sur
+USAGE:
+    $ cd diarioIndependiente/
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to the oldest. Afterwards, run parse_date_files.py
+    to split the news contained in noticias.json into files by date. ##
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
+"""
+import scrapy, re, json
+from datetime import datetime, date
+from diarioIndependiente.items import NoticiasItem
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` markup tag deleted; the text between tags is kept."""
+    without_markup = re.sub(TAG_RE, '', text)
+    return without_markup
+class ImportantFlowData(scrapy.Item):
+    """
+    Bookkeeping carried in request.meta['item'] between the spider's callbacks.
+    It steers the crawl; the spider's visible callbacks never yield it as a news item.
+    """
+    # Read by parse_with_stop_date: when true, only the "next page" link is followed
+    to_next_page = scrapy.Field()
+    # True for the article whose URL equals the last link of its listing page
+    is_last_link = scrapy.Field()
+    # URL of the listing page the article was linked from
+    return_url   = scrapy.Field()
+class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
+    name = "noticias"
+    def start_requests(self):
+        """
+        Emit the first request(s) of the crawl.
+        Without a complete year/month/day argument set, every section front page
+        is requested for a full crawl. Otherwise the stop date is stored and the
+        single listing for that year and month is requested.
+        """
+        site = "https://www.diarioelindependiente.mx/"
+        year  = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        day   = getattr(self, "day", None)
+        if year is None or month is None or day is None:
+            self.stop_date = None
+            sections = ("la-paz", "los-cabos", "policiaca", "deportes", "cultura", "nacional",
+                        "internacional", "opinion", "espectaculos", "tecnologia")
+            for section in sections:
+                yield scrapy.Request(url=site + section, callback=self.parse)
+            return
+        self.stop_date = date(int(year), int(month), int(day))
+        # NOTE(review): year and month go into the URL exactly as typed (e.g. "8",
+        # not "08") -- confirm the site accepts that form.
+        listing_url = site + year + "/" + month + "/"
+        flow_info = ImportantFlowData()
+        flow_info['to_next_page'] = False
+        request = scrapy.Request(url=listing_url, callback=self.parse_with_stop_date)
+        request.meta['item'] = flow_info
+        yield request
+    def parse(self, response):
+        """
+        Full crawl of a section: handle its first page, then request pages 2..N.
+        N is read from the "page=" value of the second-to-last pager link.
+        """
+        # The first page is already in hand: parse it directly instead of
+        # downloading the same URL a second time.
+        for request in self.parse_page(response):
+            yield request
+        pagination = response.css('div.paginacion').xpath('./ul/li/a/@href').extract()
+        # The page count sits in the second-to-last link, so two links are the minimum.
+        if len(pagination) < 2:
+            return
+        last_page_link = pagination[-2]
+        try:
+            pages = int(last_page_link[last_page_link.rfind('=') + 1:])
+        except ValueError:
+            self.logger.warning("Cannot read the page count from %r in %s", last_page_link, response.url)
+            return
+        # range() instead of xrange() so the spider runs on Python 2 and Python 3.
+        for page in range(2, pages + 1):
+            yield scrapy.Request(url=response.url + "?page=" + str(page), callback=self.parse_page)
+    def parse_page(self, response):
+        """Request every article linked from one listing page."""
+        articles = response.xpath('//div[@id="colNoticias"]').css('article.card__article')
+        for article_url in articles.xpath('./h2/a/@href').extract():
+            yield scrapy.Request(url=article_url, callback=self.parse_item)
+    def parse_with_stop_date(self, response):
+        """
+        Stop-date crawl of a listing page.
+        When to_next_page is set, only the pager's rel="next" link is followed.
+        Otherwise every article on the page is requested, each tagged with the
+        listing URL and with whether it is the last link of the page.
+        """
+        state = response.meta['item']
+        if state['to_next_page']:
+            following = response.css('div.paginacion').xpath('./ul/li/a[@rel="next"]/@href').extract_first()
+            if following is not None:
+                state['to_next_page'] = False
+                next_request = scrapy.Request(url=following, callback=self.parse_with_stop_date)
+                next_request.meta['item'] = state
+                yield next_request
+            return
+        link_list = response.xpath('//div[@id="colNoticias"]').css('article.card__article').xpath('./h2/a/@href').extract()
+        for link in link_list:
+            article_state = ImportantFlowData()
+            article_state['return_url'] = response.url
+            # Compared by URL, so a link equal to the last one is flagged as well.
+            article_state['is_last_link'] = (link == link_list[-1])
+            article_request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+            article_request.meta['item'] = article_state
+            yield article_request
+    def parse_item(self, response):
+        """
+        Build a ``NoticiasItem`` from an article page (mode without a stop date).
+        The date comes from the ``<meta name="date">`` tag (``YYYY-MM-DD``); pages where
+        it is missing or malformed are skipped with a warning. Title and topic are left
+        as ``None`` when their element is absent instead of crashing ``remove_tags``.
+        """
+        raw_date = response.xpath('//meta[@name="date"]/@content').extract_first()
+        try:
+            news_date = datetime.strptime(raw_date, '%Y-%m-%d').isoformat("T")
+        except (TypeError, ValueError):
+            # TypeError: meta tag absent (None); ValueError: content in another format.
+            self.logger.warning("Skipping %s: missing or malformed date %r", response.url, raw_date)
+            return
+        title = response.css('h1.colorRojo').extract_first()
+        if title is not None:
+            title = remove_tags(title)
+        topic = response.xpath('//span[@class="badge"]').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic)
+        paragraphs = response.css('div.cuerpo_noticia').css('p').extract()
+        text = ''.join(remove_tags(p) + "\n" for p in paragraphs)
+        ## News item info ##
+        item = NoticiasItem()
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
+    def parse_item_with_stop_date(self, response):
+        """
+        Build a ``NoticiasItem`` only if the article is dated on or after ``self.stop_date``.
+        After yielding the item of a request flagged ``is_last_link``, the listing page is
+        requested again (``dont_filter=True``) with ``to_next_page`` set so that
+        ``parse_with_stop_date`` moves on to the next page. Articles older than the stop
+        date, or without a readable date, yield nothing and do not continue pagination.
+        """
+        raw_date = response.xpath('//meta[@name="date"]/@content').extract_first()
+        try:
+            parsed = datetime.strptime(raw_date, '%Y-%m-%d')
+        except (TypeError, ValueError):
+            # TypeError: meta tag absent (None); ValueError: content in another format.
+            self.logger.warning("Skipping %s: missing or malformed date %r", response.url, raw_date)
+            return
+        if parsed.date() < self.stop_date:
+            return
+        flow_info = response.meta['item']
+        title = response.css('h1.colorRojo').extract_first()
+        if title is not None:
+            title = remove_tags(title)
+        topic = response.xpath('//span[@class="badge"]').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic)
+        paragraphs = response.css('div.cuerpo_noticia').css('p').extract()
+        text = ''.join(remove_tags(p) + "\n" for p in paragraphs)
+        ## News item info ##
+        item = NoticiasItem()
+        # Midnight of the parsed day, same value the former strptime round trip produced.
+        item['date']  = parsed.isoformat("T")
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
+        if flow_info['is_last_link']:
+            flow_info['to_next_page'] = True
+            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
+            request.meta['item'] = flow_info
+            yield request
--- a/descarga_hacia_atras/diarioIndependiente/scrapy.cfg
+++ b/descarga_hacia_atras/diarioIndependiente/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = diarioIndependiente.settings
+[deploy]
+#url = http://localhost:6800/
+project = diarioIndependiente
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/__init__.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/__init__.py
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/items.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Scrapy item holding the fields stored for one news article."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/middlewares.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class HeraldochihuahuaSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook forwards what it receives unchanged."""
+    # Hooks that are not defined are treated by Scrapy as "do not modify".
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Factory used by Scrapy; also subscribes the instance to spider_opened.
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        # Runs for each response on its way into the spider.
+        # None means "continue"; raising an exception is the alternative.
+        return None
+    def process_spider_output(self, response, result, spider):
+        # Runs on what the spider returned for a response.
+        # Must produce an iterable of Request, dict or Item objects.
+        for element in result:
+            yield element
+    def process_spider_exception(self, response, exception, spider):
+        # Runs when the spider or a process_spider_input() hook raised.
+        # Returns None (alternatively an iterable of Response, dict or Item objects).
+        return None
+    def process_start_requests(self, start_requests, spider):
+        # Same role as process_spider_output() but for the start requests,
+        # which have no response attached. Only requests may be produced.
+        for start_request in start_requests:
+            yield start_request
+    def spider_opened(self, spider):
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class HeraldochihuahuaDownloaderMiddleware(object):
+    """Pass-through downloader middleware: requests and responses are left untouched."""
+    # Hooks that are not defined are treated by Scrapy as "do not modify".
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Factory used by Scrapy; also subscribes the instance to spider_opened.
+        instance = cls()
+        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
+        return instance
+    def process_request(self, request, spider):
+        # Runs for each request going through the downloader middleware.
+        # Allowed outcomes: None (keep processing), a Response, a Request,
+        # or raising IgnoreRequest (process_exception() hooks are then called).
+        return None
+    def process_response(self, request, response, spider):
+        # Runs on the downloaded response. Allowed outcomes: a Response,
+        # a Request, or raising IgnoreRequest. Here it is returned as received.
+        return response
+    def process_exception(self, request, exception, spider):
+        # Runs when a download handler or a process_request() hook raised.
+        # None keeps the exception chain going; a Response or Request would stop it.
+        return None
+    def spider_opened(self, spider):
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/pipelines.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """
+    Item pipeline that writes every scraped item into a single JSON array file.
+    The output path is read from the ``filename`` setting (``-s filename=...``).
+    """
+    # Keys are emitted in this fixed order; fields an item never set are left out.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Instantiate the pipeline with the value of the ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and write the opening bracket of the JSON array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket and close the file, even if that write fails."""
+        try:
+            self.file.write("]")
+        finally:
+            self.file.close()
+    def process_item(self, item, spider):
+        """
+        Append ``item`` to the JSON array and return it unchanged.
+        Only the fields present in ``item`` are serialized, in ``FIELD_ORDER``.
+        """
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not populated for this item: omit it from the output.
+                continue
+        line = json.dumps(OrderedDict(row))
+        # Every item after the first is preceded by a comma and a newline.
+        if self.counter > 0:
+            line = ",\n" + line
+        self.counter += 1
+        self.file.write(line)
+        return item
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/settings.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for heraldoChihuahua project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'heraldoChihuahua'
+SPIDER_MODULES = ['heraldoChihuahua.spiders']
+NEWSPIDER_MODULE = 'heraldoChihuahua.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'heraldoChihuahua (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'heraldoChihuahua.middlewares.HeraldochihuahuaSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'heraldoChihuahua.middlewares.HeraldochihuahuaDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'heraldoChihuahua.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/spiders/__init__.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/spiders/noticias.py
+++ b/descarga_hacia_atras/heraldoChihuahua/heraldoChihuahua/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    El Heraldo de Chihuahua, Chihuahua
+USAGE
+    $ cd heraldoChihuahua
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to the oldest. The parse_date_files.py script must then be
+    used to split the news contained in noticias.json into one file per date. ##
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
+"""
+import scrapy, re, json, urllib
+from datetime import datetime, date, tzinfo, timedelta
+from collections import OrderedDict
+from heraldoChihuahua.items import NoticiasItem
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every match of ``TAG_RE`` (a ``<...>`` tag) deleted."""
+    return re.sub(TAG_RE, '', text)
+DAY_NUM = re.compile(r'[0-9]{1,2}')
+class UTC(tzinfo):
+    """
+    Fixed-offset time zone (UTC-7) attached to the scraped news dates.
+    """
+    def utcoffset(self, dt):
+        ## Time zone for Chihuahua: UTC-7 ##
+        return timedelta(hours=-7)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-7'
+    def dst(self, dt):
+        ## The offset is fixed, so no daylight-saving adjustment is reported. Without
+        ## this method tzinfo.dst() raises NotImplementedError (e.g. from timetuple()). ##
+        return timedelta(0)
+class ImportantData(scrapy.Item):
+    """
+    Useful data for the flow of the implementation
+    """
+    # False: the listing callback requests the page's articles; True: it requests the next batch
+    to_next_page = scrapy.Field()
+    # True when the article link equals the last link of its listing page
+    is_last_link = scrapy.Field()
+    # Section slug appended to the base URL
+    news_section = scrapy.Field()
+    # Value sent as the widget "offset" parameter (starts at 8, grows by 4 per batch)
+    news_page    = scrapy.Field()
+    # URL of the listing page on which the article link was found
+    return_url   = scrapy.Field()
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider for El Heraldo de Chihuahua.
+    Without ``year``/``month``/``day`` arguments every section is crawled through all of
+    its "load more" batches. With them, only articles dated on or after that day are
+    kept, and a section keeps loading batches only while the last article of the
+    current batch is on or after that day.
+    """
+    name = "noticias"
+    def start_requests(self):
+        """Read the optional stop date and issue one listing request per section."""
+        self.tz = UTC()
+        year    = getattr(self, "year", None)
+        month   = getattr(self, "month", None)
+        day     = getattr(self, "day", None)
+        if year is not None and month is not None and day is not None:
+            self.stopDate = date(int(year), int(month), int(day))
+        else:
+            self.stopDate = None
+        self.baseURL = "https://www.elheraldodechihuahua.com.mx/"
+        section_list = ["local", "policiaca", "mexico", "republica", "mundo", "finanzas",
+                        "analisis", "gossip", "circulos", "cultura", "doble-via", "deportes"]
+        self.month_parser = dict(enero='01', febrero='02', marzo='03',      abril='04',   mayo='05',      junio='06',
+                                 julio='07', agosto='08',  septiembre='09', octubre='10', noviembre='11', diciembre='12')
+        stop_mode = self.stopDate is not None
+        callback = self.parse_with_stop_date if stop_mode else self.parse
+        for s in section_list:
+            flow_info = ImportantData()
+            if stop_mode:
+                # Only the stop-date flow reads this flag.
+                flow_info['to_next_page'] = False
+            flow_info['news_page'] = 8
+            flow_info['news_section'] = s
+            request = scrapy.Request(url=self.baseURL + s, callback=callback)
+            request.meta['item'] = flow_info
+            yield request
+    def _article_links(self, response):
+        """Return the article URLs of a listing response, falling back to any ``h4.title`` link."""
+        link_list = response.css('div.hard-news-row').css('h4.title > a::attr(href)').extract()
+        if not link_list:
+            link_list = response.css('h4.title > a::attr(href)').extract()
+        return link_list
+    def _load_more_request(self, flow_info, callback):
+        """Advance the offset in ``flow_info`` by 4 and build the request for the next widget batch."""
+        flow_info['news_page'] += 4
+        params = OrderedDict()
+        params['widgetContentId'] = '148'
+        params['widgetName']      = 'viewPicker'
+        params['offset']          = str(flow_info['news_page'])
+        next_load = self.baseURL + flow_info['news_section'] + "/widget/?" + urllib.urlencode(params)
+        request = scrapy.Request(url=next_load, callback=callback)
+        request.meta['item'] = flow_info
+        return request
+    def _parse_news_date(self, response):
+        """
+        Return the publication day as a ``date``, or ``None`` when it cannot be read.
+        The text after the first ``/`` of ``p.published-date`` is split on `` de `` into
+        day, Spanish month name and year.
+        """
+        raw = response.css('p.published-date').extract_first()
+        if raw is None:
+            return None
+        parts = remove_tags(raw).split('/')
+        if len(parts) < 2:
+            return None
+        fields = parts[1].strip().replace('\n', '').split(' de ')
+        if len(fields) < 3:
+            return None
+        day = DAY_NUM.search(fields[0])
+        month = self.month_parser.get(fields[1].lower())
+        if day is None or month is None:
+            return None
+        try:
+            return date(int(fields[2]), int(month), int(day.group()))
+        except ValueError:
+            # Non-numeric year or an impossible calendar day.
+            return None
+    def _build_item(self, response, news_date):
+        """Assemble the ``NoticiasItem`` of an article page published on ``news_date``."""
+        topic = response.css('div.breadcrumb > a').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic).strip("\n")
+        title = response.css('h1.title').extract_first()
+        if title is not None:
+            title = remove_tags(title).strip("\n")
+        paragraphs = response.css('div.content-body').xpath('./div[contains(@id, "content-body")]/p').extract()
+        text = ''.join(remove_tags(p) + "\n" for p in paragraphs)
+        ## News item info ##
+        item = NoticiasItem()
+        # Midnight of the publication day, tagged with the spider's fixed time zone.
+        item['date']  = datetime(news_date.year, news_date.month, news_date.day, tzinfo=self.tz).isoformat("T")
+        item['topic'] = topic
+        item['title'] = title
+        item['text']  = text.strip()
+        item['url']   = response.url
+        return item
+    def parse(self, response):
+        """Request every article of a listing and, if a "load more" block exists, the next batch."""
+        flow_info = response.meta['item']
+        for link in self._article_links(response):
+            yield scrapy.Request(url=link, callback=self.parse_item)
+        if response.css('div.load-more').extract_first() is not None:
+            yield self._load_more_request(flow_info, self.parse)
+    def parse_with_stop_date(self, response):
+        """
+        Listing callback for stop-date mode.
+        While ``to_next_page`` is false, one request per article is emitted, each with its
+        own flow data; once it is true, the next widget batch is requested instead.
+        """
+        flow_info = response.meta['item']
+        if not flow_info['to_next_page']:
+            link_list = self._article_links(response)
+            for link in link_list:
+                link_info = ImportantData()
+                link_info['return_url']   = response.url
+                link_info['news_page']    = flow_info['news_page']
+                link_info['news_section'] = flow_info['news_section']
+                # Compared by value: any link equal to the final one is flagged.
+                link_info['is_last_link'] = (link == link_list[-1])
+                request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+                request.meta['item'] = link_info
+                yield request
+        elif response.css('div.load-more').extract_first() is not None:
+            flow_info['to_next_page'] = False
+            yield self._load_more_request(flow_info, self.parse_with_stop_date)
+    def parse_item(self, response):
+        """Yield the article's item; pages without a readable publication date yield nothing."""
+        news_date = self._parse_news_date(response)
+        if news_date is not None:
+            yield self._build_item(response, news_date)
+    def parse_item_with_stop_date(self, response):
+        """
+        Yield the article's item only when it is dated on or after ``self.stopDate``.
+        If the request was flagged ``is_last_link``, the listing page is then requested
+        again (``dont_filter=True``) with ``to_next_page`` set, which triggers the next batch.
+        """
+        news_date = self._parse_news_date(response)
+        if news_date is None or news_date < self.stopDate:
+            return
+        flow_info = response.meta['item']
+        yield self._build_item(response, news_date)
+        if flow_info['is_last_link']:
+            flow_info['to_next_page'] = True
+            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
+            request.meta['item'] = flow_info
+            yield request
--- a/descarga_hacia_atras/heraldoChihuahua/scrapy.cfg
+++ b/descarga_hacia_atras/heraldoChihuahua/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = heraldoChihuahua.settings
+[deploy]
+#url = http://localhost:6800/
+project = heraldoChihuahua
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/__init__.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/__init__.py
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/items.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class LajornadamayaItem(scrapy.Item):
+    """Empty placeholder item: it declares no fields."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/middlewares.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class LajornadamayaSpiderMiddleware(object):
+    """
+    Pass-through spider middleware: every hook forwards what it receives unchanged.
+    The hooks are instance methods and therefore take ``self``; they were previously
+    declared without it, so any call on an instance raised TypeError.
+    """
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create the middleware instance.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+        # Should return None or raise an exception.
+        return None
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/pipelines.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+class LajornadamayaPipeline(object):
+    """No-op pipeline: each item is handed back exactly as received."""
+    def process_item(self, item, spider):
+        unchanged = item
+        return unchanged
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/settings.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for laJornadaMaya project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'laJornadaMaya'
+SPIDER_MODULES = ['laJornadaMaya.spiders']
+NEWSPIDER_MODULE = 'laJornadaMaya.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'laJornadaMaya (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'laJornadaMaya.middlewares.LajornadamayaSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'laJornadaMaya.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'laJornadaMaya.pipelines.LajornadamayaPipeline': 300,
+#}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/spiders/__init__.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/spiders/noticias.py
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/spiders/noticias.py
+import scrapy, json, re
+from datetime import datetime, date, timedelta, tzinfo
+"""
+This version downloads the news for a given date.
+USAGE:
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+Not recommended for dates more than one month old.
+"""
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+	"""Return ``text`` with every match of ``TAG_RE`` (a ``<...>`` tag) deleted."""
+	return re.sub(TAG_RE, '', text)
+class UTC(tzinfo):
+	"""Fixed-offset time zone (UTC-6) attached to the scraped news dates."""
+	def utcoffset(self, dt):
+		# time zone for Yucatan (central Mexico): UTC-6
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# time zone name
+		return 'UTC-6'
+	def dst(self, dt):
+		# The offset is fixed, so no daylight-saving adjustment is reported. Without
+		# this method tzinfo.dst() raises NotImplementedError (e.g. from timetuple()).
+		return timedelta(0)
+class NoticiasItem(scrapy.Item):
+	"""Scrapy item holding the fields stored for one news article."""
+	title = scrapy.Field()
+	text = scrapy.Field()
+	date = scrapy.Field()
+	location = scrapy.Field()
+	author = scrapy.Field()
+	topic = scrapy.Field()
+	url = scrapy.Field()
+class QuotesSpider(scrapy.Spider):
+	# Spider "noticias" for www.lajornadamaya.mx: it keeps the articles whose
+	# date equals the one passed with -a year=... -a month=... -a day=...
+	name = "noticias"
+	def start_requests(self):
+		# For every section this yields the front-page request (handled by
+		# parse_2) followed by the '?p=N' listing requests (parse / parse_page).
+		# self.found = False
+		# self.flag = False
+		self.tz = UTC()
+		self.year = getattr(self, 'year', None)
+		self.month = getattr(self, 'month', None)
+		self.day = getattr(self, 'day', None)
+		# int(None) raises TypeError here when any of the three arguments is missing.
+		self.req_date = date(int(self.year), int(self.month), int(self.day))
+		self.date_format = "%Y-%m-%d"
+		self.baseURL = 'https://www.lajornadamaya.mx'
+		section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional',
+						'internacional', 'opinion']
+		# section_list = ['deportes']
+		for section in section_list:
+			self.section = section
+			for count in range(0,2):
+				if ( count == 0 ):
+					# first pass: the section front page
+					yield scrapy.Request(url=self.baseURL+'/'+section, callback=self.parse_2)
+				elif (count == 1):
+					# second pass: the paginated listing of the same section
+					# self.section = section
+					self.page = 0
+					self.flag = False
+					self.found = False
+					page = -1
+					if not ( section == 'opinion' ):
+						# Keep requesting listing pages until a parse() callback sets self.flag.
+						# NOTE(review): this only terminates if callbacks run while this
+						# generator is still being consumed - confirm against Scrapy's scheduling.
+						while True:
+							if ( self.flag ):
+								self.flag = False
+								break
+							page+=1
+							yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(page), callback=self.parse)
+						if ( self.found ):
+							self.found = False
+							# step back one page (two when possible) from the page recorded by
+							# parse(), then request six consecutive pages with parse_page
+							# NOTE(review): when self.page is 0 the range below starts at p=-1.
+							self.page -= 1
+							if ( self.page > 0 ):
+								self.page -= 1
+							for pag in range(self.page, self.page+6):
+								yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(pag), callback=self.parse_page, dont_filter=True)
+					else:
+						# the opinion section is requested through a single '/notas?opinion' URL
+						yield scrapy.Request(url=self.baseURL+'/notas?opinion', callback=self.parse_page)
+	def parse_2(self, response): # for the first news items
+		# Collects the hrefs of the h1/h2 "title" links and follows those whose
+		# leading path component is the requested date.
+		path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
+		link_list = []
+		for path in path_list:
+			link_list += response.xpath(path).extract()
+		for link in link_list:
+			# the text before the last '/' of the link is compared with year-MM-DD
+			# (month and day zero-padded, year used as given)
+			if ( link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) ):
+				item = NoticiasItem()
+				d = link[:link.rfind('/')]
+				# 10 characters: 'YYYY-MM-DD'; 19 characters: 'YYYY-MM-DD HH:MM:SS'
+				if len(d) == 10:
+					# indexing the result of map() relies on Python 2, where map returns a list
+					d = map(int, d.split('-'))
+					d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
+				elif len(d) == 19:
+					d, t = d.split(' ')
+					d = map(int, d.split('-'))
+					t = map(int, t.split(':'))
+					d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
+				item['date'] = d
+				# topic is the last path component of the section URL, title-cased
+				item['topic'] = response.url[response.url.rfind('/')+1:].title()
+				# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
+				request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
+				# the partially filled item travels with the request to parse_item_2
+				request.meta['item'] = item
+				yield request
+	def parse(self, response): # for the JSON responses
+		# Probes one listing page: it only updates self.page / self.found /
+		# self.flag, which start_requests reads; it yields nothing.
+		json_response = json.loads(response.text)
+		if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
+			json_list = json_response
+		else:
+			json_list = json_response['articles']
+		for line in json_list:
+			# only the part of 'publishDate' before its last space is parsed as a date
+			this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
+			this_date = this_date.date()
+			if ( this_date == self.req_date ):
+				# remember the page number (the text after the last '=' of the URL)
+				self.page = int(response.url[response.url.rfind('=')+1:])
+				self.found = True
+				self.flag = True
+				break
+			elif ( this_date < self.req_date ):
+				# an entry older than the requested date: signal the paging loop to stop
+				self.flag = True
+				break
+	def parse_item_2(self, response): # for the first news items
+		# Completes the item started in parse_2 with title, text and url.
+		item = response.meta['item']
+		# item = NoticiasItem()
+		text = ''
+		# item['date'] = response.url[:response.url.rfind('/')][response.url[:response.url.rfind('/')].rfind('/')+1:]
+		# item['topic'] = self.section.title()
+		item['title'] = response.xpath('//article/h1/text()').extract_first()
+		# the text is every element with class "txt", tags removed, one per line
+		for paragraph in response.xpath('//*[@class="txt"]').extract():
+			text += remove_tags(paragraph) + '\n'
+		item['text'] = text
+		item['url'] = response.url
+		# Python 2 print statement: writes the title to stdout
+		print item['title']
+		yield item
+	def parse_page(self, response): # for the JSON responses
+		# Builds one item (date, topic, title) per JSON entry dated on the
+		# requested day and requests the article page to obtain its text.
+		json_response = json.loads(response.text)
+		if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
+			# topic: URL text after the last '/' up to two characters before the last '='
+			topic = response.url[response.url.rfind('/')+1:response.url.rfind('=')-2].title()
+			json_list = json_response
+		else:
+			json_list = json_response['articles']
+			topic = 'Opinion'
+		for line in json_list:
+			this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
+			this_date = this_date.date()
+			if ( this_date == self.req_date ):
+				item = NoticiasItem()
+				# item['date'] = line['publishDate']
+				d = line['publishDate']
+				# 10 characters: 'YYYY-MM-DD'; 19 characters: 'YYYY-MM-DD HH:MM:SS'
+				if len(d) == 10:
+					d = map(int, d.split('-'))
+					d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
+				elif len(d) == 19:
+					d, t = d.split(' ')
+					d = map(int, d.split('-'))
+					t = map(int, t.split(':'))
+					d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
+				item['date'] = d
+				item['topic'] = topic
+				item['title'] = line['name']
+				if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
+					request = scrapy.Request(url=self.baseURL+line['url'], callback=self.parse_item)
+				else:
+					# opinion entries: the URL is built from the date part of 'publishDate' and 'uriComponent'
+					request = scrapy.Request(url=self.baseURL+'/'+line['publishDate'][:line['publishDate'].rfind(' ')]+'/'+line['uriComponent'], callback=self.parse_item)
+				request.meta['item'] = item
+				yield request
+	def parse_item(self, response): # for the JSON responses
+		# Completes the item started in parse_page with text and url.
+		item = response.meta['item']
+		text = ''
+		for paragraph in response.xpath('//*[@class="txt"]').extract():
+			text += remove_tags(paragraph) + '\n'
+		item['text'] = text
+		item['url'] = response.url
+		# Python 2 print statement: writes the title to stdout
+		print item['title']
+		yield item
--- a/descarga_hacia_atras/laJornadaMaya_deprecated/scrapy.cfg
+++ b/descarga_hacia_atras/laJornadaMaya_deprecated/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+[settings]
+default = laJornadaMaya.settings
+[deploy]
+#url = http://localhost:6800/
+project = laJornadaMaya
--- a/descarga_hacia_atras/tintaFresca/scrapy.cfg
+++ b/descarga_hacia_atras/tintaFresca/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = tintaFresca.settings
+[deploy]
+#url = http://localhost:6800/
+project = tintaFresca
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/__init__.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/__init__.py
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/items.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+# Container for one scraped news article; every attribute declares a scrapy.Field.
+class NoticiasItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/middlewares.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class TintafrescaSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook forwards what it receives."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened,
+                                signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Accept every response headed for the spider (always returns None)."""
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Yield each element of `result` unchanged and in order."""
+        for produced in result:
+            yield produced
+    def process_spider_exception(self, response, exception, spider):
+        """Take no action on spider exceptions (returns None)."""
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Yield each start request unchanged and in order."""
+        for start_request in start_requests:
+            yield start_request
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class TintafrescaDownloaderMiddleware(object):
+    """Pass-through downloader middleware: requests and responses are not altered."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened,
+                                signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        """Return None for every request, i.e. never replace or drop it."""
+        return None
+    def process_response(self, request, response, spider):
+        """Hand back the very same response object that was received."""
+        return response
+    def process_exception(self, request, exception, spider):
+        """Take no action on download exceptions (returns None)."""
+        return None
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/pipelines.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """Item pipeline that stores every scraped item in a single JSON array file.
+    The output path is read from the ``filename`` Scrapy setting, e.g.
+    ``scrapy crawl noticias -s filename=noticias.json``.
+    """
+    # Keys are written in this order; keys an item does not carry are left out.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the pipeline with the value of the ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and start the JSON array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Terminate the JSON array and close the output file."""
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        """Append `item` to the output file as one JSON object and return it."""
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not populated for this item: skip it. Only KeyError is
+                # expected here; anything else is a real error and must surface.
+                continue
+        # Serialise before writing so a failure leaves the file untouched.
+        encoded = json.dumps(OrderedDict(row))
+        self.counter += 1
+        # Every object after the first is preceded by a separator so the file
+        # remains a valid JSON array.
+        if self.counter > 1:
+            encoded = ",\n" + encoded
+        self.file.write(encoded)
+        return item
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/settings.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for tintaFresca project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'tintaFresca'
+SPIDER_MODULES = ['tintaFresca.spiders']
+NEWSPIDER_MODULE = 'tintaFresca.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'tintaFresca (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# (left commented out, so this project does not set ROBOTSTXT_OBEY itself)
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'tintaFresca.middlewares.TintafrescaSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'tintaFresca.middlewares.TintafrescaDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# JsonWriterPipeline reads its output path from the 'filename' setting, which
+# the spider's usage notes pass on the command line as -s filename=<file>.
+ITEM_PIPELINES = {
+   'tintaFresca.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/spiders/__init__.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_hacia_atras/tintaFresca/tintaFresca/spiders/noticias.py
+++ b/descarga_hacia_atras/tintaFresca/tintaFresca/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Tinta Fresca, Chiapas
+USAGE
+    $ cd tintaFresca
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news, from the most recent to the oldest. Afterwards, run parse_date_files.py
+    to split the news stored in noticias.json into files by date. ##
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
+"""
+import scrapy, re, json
+from datetime import datetime, date, tzinfo, timedelta
+from tintaFresca.items import NoticiasItem
+# Matches one markup tag: '<', at least one character other than '>', then '>'.
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return `text` with every substring matched by TAG_RE deleted."""
+    without_markup = re.sub(TAG_RE, '', text)
+    return without_markup
+class UTC(tzinfo):
+    """
+    Fixed-offset time zone used to stamp the scraped dates.
+    Despite its name the class models UTC-6 (six hours behind UTC) and never
+    applies a daylight-saving adjustment.
+    """
+    def utcoffset(self, dt):
+        ## Time zone for Chiapas: UTC-6 ##
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-6'
+    def dst(self, dt):
+        ## tzinfo.dst() raises NotImplementedError unless it is overridden, which
+        ## makes timetuple()/dst() fail on datetimes carrying this zone. The zone
+        ## is a fixed offset, so report a zero DST adjustment. ##
+        return timedelta(0)
+class ImportantData(scrapy.Item):
+    """
+    Useful data for the flow of the implementation
+    """
+    # True when the listing page is being re-requested only to follow its "next page" link
+    to_next_page = scrapy.Field()
+    # True on the request for the last article link of a listing page
+    is_last_link = scrapy.Field()
+    # only referenced from commented-out code in the spider below
+    news_section = scrapy.Field()
+    # URL of the listing page the article link was taken from
+    return_url   = scrapy.Field()
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider for the Tinta Fresca news site.
+    Without arguments every listing page of every section is followed. With
+    -a year=... -a month=... -a day=... only articles dated on or after that
+    day are kept, and a section stops paginating as soon as the last article
+    linked from a listing page is older than that day.
+    """
+    name = "noticias"
+    def start_requests(self):
+        """Yield the first listing request of every section."""
+        self.tz = UTC()
+        year = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        day = getattr(self, "day", None)
+        # The stop date is only active when all three arguments were supplied.
+        if year is not None and month is not None and day is not None:
+            self.stopDate = date(int(year), int(month), int(day))
+        else:
+            self.stopDate = None
+        baseURL = "http://tintafresca.com.mx/"
+        # The "letras_en_su_tinta/page1/" and "rafaga/page1/" sections are currently disabled.
+        section_list = ["tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/"]
+        # Month name found in the article date -> two-digit month number.
+        self.month_parser = dict(Enero='01', Febrero='02', Marzo='03',      Abril='04',   Mayo='05',      Junio='06',
+                                 Julio='07', Agosto='08',  Septiembre='09', Octubre='10', Noviembre='11', Diciembre='12')
+        for s in section_list:
+            if self.stopDate is None:
+                yield scrapy.Request(url=baseURL + s, callback=self.parse)
+                continue
+            flow_info = ImportantData()
+            flow_info['to_next_page'] = False
+            request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
+            request.meta['item'] = flow_info
+            yield request
+    def parse(self, response):
+        """Request every article linked from a listing page, then the next page."""
+        for link in response.css('ul.tintas').css('a.leer::attr(href)').extract():
+            request = scrapy.Request(url=link, callback=self.parse_item)
+            request.meta['item'] = ImportantData()
+            yield request
+        next_page = response.xpath('//ul[@class="pagination"]/li[3]/a/@href').extract_first()
+        if next_page is not None:
+            yield scrapy.Request(url=next_page, callback=self.parse)
+    def parse_with_stop_date(self, response):
+        """Listing callback used when a stop date is active.
+        A listing page is visited twice: first (to_next_page False) to request
+        its articles, and again (to_next_page True, re-requested by
+        parse_item_with_stop_date) to follow its pagination link.
+        """
+        flow_info = response.meta['item']
+        if flow_info['to_next_page']:
+            next_page = response.xpath('//ul[@class="pagination"]/li[3]/a/@href').extract_first()
+            if next_page is not None:
+                flow_info['to_next_page'] = False
+                request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
+                request.meta['item'] = flow_info
+                yield request
+            return
+        link_list = response.css('ul.tintas').css('a.leer::attr(href)').extract()
+        for link in link_list:
+            link_info = ImportantData()
+            link_info['return_url'] = response.url
+            # Compared by value (not by position) so that a link repeated
+            # earlier on the page still carries the flag.
+            link_info['is_last_link'] = (link == link_list[-1])
+            request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+            request.meta['item'] = link_info
+            yield request
+    def _parse_news_date(self, response):
+        """Return the article date as a `date`, or None when it cannot be read."""
+        raw = response.xpath('//div[@class="balazo"]').extract_first()
+        if raw is None:
+            return None
+        # When the block contains a <br>, the date is taken from the part after it.
+        br_at = raw.find('<br>')
+        if br_at > -1:
+            raw = raw[br_at:]
+        parts = remove_tags(raw).split('/')
+        try:
+            # The text is expected as day/MonthName/year.
+            day = int(parts[0])
+            month = int(self.month_parser[parts[1].strip()])
+            year = int(parts[2])
+            return date(year, month, day)
+        except (IndexError, KeyError, ValueError):
+            return None
+    def _build_item(self, response, news_date):
+        """Build the NoticiasItem of an article page dated `news_date`."""
+        topic = response.css('div.seccion > h3.left > a').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic)
+        if topic is None:
+            title = None
+        elif topic == "Letras en su Tinta":
+            # This section has no headline: the summary paragraphs are joined
+            # into sentences instead.
+            lines = [remove_tags(line) for line in response.css('div.sumario > p').extract()]
+            if lines:
+                title = ". ".join(lines) + "."
+            else:
+                title = ''
+        else:
+            title = response.css('div.titulo > h1').extract_first()
+            if title is not None:
+                title = remove_tags(title)
+        text = "".join([remove_tags(p) + "\n" for p in response.css('div.contenido > p').extract()])
+        item = NoticiasItem()
+        ## News item info ##
+        item['date']  = datetime(news_date.year, news_date.month, news_date.day, tzinfo=self.tz).isoformat("T")
+        item['topic'] = topic
+        item['title'] = title
+        item['text']  = text.strip()
+        item['url']   = response.url
+        return item
+    def parse_item(self, response):
+        """Yield the item of one article page (no stop date)."""
+        news_date = self._parse_news_date(response)
+        if news_date is None:
+            self.logger.warning("Skipping %s: article date missing or not understood", response.url)
+            return
+        yield self._build_item(response, news_date)
+    def parse_item_with_stop_date(self, response):
+        """Yield the item of one article page if it is not older than the stop date.
+        When the article is the last link of its listing page, the listing page
+        is requested again so that parse_with_stop_date moves on to the next page.
+        """
+        news_date = self._parse_news_date(response)
+        if news_date is None:
+            self.logger.warning("Skipping %s: article date missing or not understood", response.url)
+            return
+        if news_date < self.stopDate:
+            # Older than requested: drop it and do not continue paginating from here.
+            return
+        flow_info = response.meta['item']
+        yield self._build_item(response, news_date)
+        if flow_info['is_last_link']:
+            flow_info['to_next_page'] = True
+            # dont_filter: the listing URL was already fetched once.
+            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
+            request.meta['item'] = flow_info
+            yield request
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/__init__.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/__init__.py
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/items.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+# Container for one scraped news article; every attribute declares a scrapy.Field.
+class NoticiasItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/middlewares.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class CuartopoderSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook forwards what it receives."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened,
+                                signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Accept every response headed for the spider (always returns None)."""
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Yield each element of `result` unchanged and in order."""
+        for produced in result:
+            yield produced
+    def process_spider_exception(self, response, exception, spider):
+        """Take no action on spider exceptions (returns None)."""
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Yield each start request unchanged and in order."""
+        for start_request in start_requests:
+            yield start_request
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class CuartopoderDownloaderMiddleware(object):
+    """Pass-through downloader middleware: requests and responses are not altered."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened,
+                                signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        """Return None for every request, i.e. never replace or drop it."""
+        return None
+    def process_response(self, request, response, spider):
+        """Hand back the very same response object that was received."""
+        return response
+    def process_exception(self, request, exception, spider):
+        """Take no action on download exceptions (returns None)."""
+        return None
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/pipelines.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """Item pipeline that stores every scraped item in a single JSON array file.
+    The output path is read from the ``filename`` Scrapy setting, which can be
+    given on the command line as ``-s filename=<file>``.
+    """
+    # Keys are written in this order; keys an item does not carry are left out.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the pipeline with the value of the ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and start the JSON array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Terminate the JSON array and close the output file."""
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        """Append `item` to the output file as one JSON object and return it."""
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not populated for this item: skip it. Only KeyError is
+                # expected here; anything else is a real error and must surface.
+                continue
+        # Serialise before writing so a failure leaves the file untouched.
+        encoded = json.dumps(OrderedDict(row))
+        self.counter += 1
+        # Every object after the first is preceded by a separator so the file
+        # remains a valid JSON array.
+        if self.counter > 1:
+            encoded = ",\n" + encoded
+        self.file.write(encoded)
+        return item
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/settings.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for cuartoPoder project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'cuartoPoder'
+SPIDER_MODULES = ['cuartoPoder.spiders']
+NEWSPIDER_MODULE = 'cuartoPoder.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'cuartoPoder (+http://www.yourdomain.com)'
+# Obey robots.txt rules (the setting below is commented out, so this project does not override Scrapy's default)
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'cuartoPoder.middlewares.CuartopoderSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'cuartoPoder.middlewares.CuartopoderDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'cuartoPoder.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/spiders/__init__.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py
+++ b/descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Cuarto Poder, Chiapas
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd cuartoPoder/
+    $ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
+"""
+import scrapy, re
+from cuartoPoder.items import NoticiasItem
+from datetime import datetime, timedelta, tzinfo
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    return TAG_RE.sub('', text)
+class UTC(tzinfo):
+    """
+    Class for Time Zone
+    """
+    def utcoffset(self, dt):
+        ## Time zone for Chiapas: UTC-6 ##
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-6'
+class ImportantData(scrapy.Item):
+    """
+    Useful data for the flow of the implementation
+
+    An instance is stored under request.meta['item'] by start_requests() and
+    read back by parse() to decide whether to request another listing page.
+    """
+    # Flag that parse() checks before it requests the next listing page.
+    to_next_page = scrapy.Field()
+    # Page number placed in the "p=" query parameter of the next listing request.
+    next_page = scrapy.Field()
+class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
+    name = "noticias"
+    def start_requests(self):
+        self.tz    = UTC()
+        self.year  = getattr(self, "year", None)
+        self.month = getattr(self, "month", None)
+        self.day   = getattr(self, "day", None)
+        self.baseURL = "http://www.cuartopoder.mx"
+        first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
+        self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"
+        flow_info = ImportantData()
+        flow_info['to_next_page'] = False
+        flow_info['next_page'] = 2
+        request = scrapy.Request(url=first_URL, callback=self.parse)
+        request.meta['item'] = flow_info
+        yield request
+    def parse(self, response):
+        flow_info = response.meta['item']
+        for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract():
+            to_next_page = True
+            news_link = self.baseURL + link
+            yield scrapy.Request(url=news_link, callback=self.parse_item)
+        if flow_info['to_next_page']:
+            page = flow_info['next_page']
+            page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))        
+            flow_info['to_next_page'] = False
+            flow_info['next_page'] += 1
+            request = scrapy.Request(url=page_URL, callback=self.parse)
+            request.meta['item'] = flow_info
+            yield request
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
+        title = response.css('div.post-title').css('h1').extract_first()
+        if title is not None : remove_tags(title)
+        topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
+        if topic is not None : topic = remove_tags(topic)
+        for p in response.css('div.post-content').css('p').extract():
+            p = remove_tags(p)
+            text += p + "\n"
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
--- a/descarga_por_dia/cuartoPoder/scrapy.cfg
+++ b/descarga_por_dia/cuartoPoder/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = cuartoPoder.settings
+[deploy]
+#url = http://localhost:6800/
+project = cuartoPoder
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/__init__.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/__init__.py
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/items.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Container for one scraped news article; every attribute declares an item field."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    # Headline with the HTML tags removed (None when the page has no matching heading).
+    title = scrapy.Field()
+    # Article paragraphs with the tags removed, one paragraph per line.
+    text = scrapy.Field()
+    # ISO-8601 string built from the requested year/month/day with a -06:00 offset.
+    date = scrapy.Field()
+    # Declared for the output schema; the diarioPuntual spider does not fill it.
+    location = scrapy.Field()
+    # Declared for the output schema; the diarioPuntual spider does not fill it.
+    author = scrapy.Field()
+    # The diarioPuntual spider always stores None here.
+    topic = scrapy.Field()
+    # URL of the response the article was parsed from.
+    url = scrapy.Field()
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/middlewares.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class DiariopuntualSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+        # Should return None or raise an exception.
+        return None
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+class DiariopuntualDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/pipelines.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+        return item
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/settings.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for diarioPuntual project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'diarioPuntual'
+SPIDER_MODULES = ['diarioPuntual.spiders']
+NEWSPIDER_MODULE = 'diarioPuntual.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'diarioPuntual (+http://www.yourdomain.com)'
+# Obey robots.txt rules (the setting below is commented out, so this project does not override Scrapy's default)
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'diarioPuntual.middlewares.DiariopuntualSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'diarioPuntual.middlewares.DiariopuntualDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'diarioPuntual.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/spiders/__init__.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/diarioPuntual/diarioPuntual/spiders/noticias.py
+++ b/descarga_por_dia/diarioPuntual/diarioPuntual/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Puntual, EDOMEX
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd diarioPuntual/
+    $ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
+"""
+import scrapy, re
+from diarioPuntual.items import NoticiasItem
+from datetime import datetime, timedelta, tzinfo
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    return TAG_RE.sub('', text)
+class UTC(tzinfo):
+    """
+    Class for Time Zone
+    """
+    def utcoffset(self, dt):
+        ## Time zone for EDOMEX: UTC-6 ##
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-6'
+class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
+    name = "noticias"
+    def start_requests(self):
+        self.tz    = UTC()
+        self.year  = getattr(self, "year", None)
+        self.month = getattr(self, "month", None)
+        self.day   = getattr(self, "day", None)
+        baseURL = "http://diario-puntual.com.mx/{0}/{1}/{2}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
+        yield scrapy.Request(url=baseURL, callback=self.parse)
+    def parse(self, response):
+        for link in response.css('div.post-column').css('h2.posttitle > a::attr(href)').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+        next_page = response.css('div.archive-pagination').xpath('./a[@class="next page-numbers"]/@href').extract_first()
+        if next_page is not None:
+            yield scrapy.Request(url=next_page, callback=self.parse)
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
+        title = response.css('div.post-container').css('h1.post-title').extract_first()
+        if title is not None : title = remove_tags(title)
+        topic = None
+        for p in response.css('div.post-column > article').css('p').extract():
+            p = remove_tags(p)
+            text += p + "\n"
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
--- a/descarga_por_dia/diarioPuntual/scrapy.cfg
+++ b/descarga_por_dia/diarioPuntual/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = diarioPuntual.settings
+[deploy]
+#url = http://localhost:6800/
+project = diarioPuntual
--- a/descarga_por_dia/elComentario/elComentario/__init__.py
+++ b/descarga_por_dia/elComentario/elComentario/__init__.py
--- a/descarga_por_dia/elComentario/elComentario/items.py
+++ b/descarga_por_dia/elComentario/elComentario/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Container for one scraped news article; every attribute declares an item field."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    # Headline with the HTML tags removed (None when the page has no matching heading).
+    title = scrapy.Field()
+    # Article paragraphs with the tags removed, one paragraph per line.
+    text = scrapy.Field()
+    # Content of the page's "article:published_time" meta tag (None when absent).
+    date = scrapy.Field()
+    # Declared for the output schema; the elComentario spider does not fill it.
+    location = scrapy.Field()
+    # Declared for the output schema; the elComentario spider does not fill it.
+    author = scrapy.Field()
+    # Text of the first "a.theme" element with the tags removed (None when absent).
+    topic = scrapy.Field()
+    # URL of the response the article was parsed from.
+    url = scrapy.Field()
--- a/descarga_por_dia/elComentario/elComentario/middlewares.py
+++ b/descarga_por_dia/elComentario/elComentario/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class ElcomentarioSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+        # Should return None or raise an exception.
+        return None
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+class ElcomentarioDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/descarga_por_dia/elComentario/elComentario/pipelines.py
+++ b/descarga_por_dia/elComentario/elComentario/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+        return item
--- a/descarga_por_dia/elComentario/elComentario/settings.py
+++ b/descarga_por_dia/elComentario/elComentario/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for elComentario project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'elComentario'
+SPIDER_MODULES = ['elComentario.spiders']
+NEWSPIDER_MODULE = 'elComentario.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'elComentario (+http://www.yourdomain.com)'
+# Obey robots.txt rules (the setting below is commented out, so this project does not override Scrapy's default)
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'elComentario.middlewares.ElcomentarioSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'elComentario.middlewares.ElcomentarioDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'elComentario.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/elComentario/elComentario/spiders/__init__.py
+++ b/descarga_por_dia/elComentario/elComentario/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/elComentario/elComentario/spiders/noticias.py
+++ b/descarga_por_dia/elComentario/elComentario/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    El Comentario, Colima
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd elComentario/
+    $ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
+"""
+import scrapy, re
+from elComentario.items import NoticiasItem
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    return TAG_RE.sub('', text)
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider that downloads the articles listed on the El Comentario archive
+    page of one date. The date comes from the ``year``, ``month`` and ``day``
+    spider arguments (``-a year=2018 -a month=9 -a day=5``).
+    """
+    name = "noticias"
+    def start_requests(self):
+        """
+        Yield the request for the archive page of the requested date.
+        Raises ValueError when ``year``, ``month`` or ``day`` was not supplied.
+        """
+        year  = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        day   = getattr(self, "day", None)
+        # Without this check a missing argument surfaced as an obscure
+        # AttributeError on None.zfill (or as the text "None" inside the URL).
+        if year is None or month is None or day is None:
+            raise ValueError("missing date: scrapy crawl noticias -a year=YYYY -a month=M -a day=D")
+        # str() keeps zfill working if the values are handed over as integers.
+        baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, str(month).zfill(2), str(day).zfill(2))
+        yield scrapy.Request(url=baseURL, callback=self.parse)
+    def parse(self, response):
+        """Request every article linked from an archive page, then the next archive page."""
+        links = response.css('div.articles').xpath('./article/div[@class="cnt"]/h3/a/@href').extract()
+        for link in links:
+            # urljoin leaves absolute URLs untouched and resolves relative ones.
+            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)
+        next_page = response.css('div.post-pagination').xpath('./a[@title="Next page"]/@href').extract_first()
+        if next_page is not None:
+            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
+    def parse_item(self, response):
+        """
+        Build one NoticiasItem (date, title, topic, text, url) from an article page.
+        Title and topic stay None when their node is not found.
+        """
+        item = NoticiasItem()
+        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        title = response.xpath('//header/h1').extract_first()
+        if title is not None:
+            title = remove_tags(title)
+        topic = response.css('a.theme').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic)
+        # One line per <p> of the article body, markup removed.
+        paragraphs = [remove_tags(p) for p in response.css('div.pf-content').css('p').extract()]
+        text = "\n".join(paragraphs).strip()
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text
+        item['url']   = response.url
+        yield item
--- a/descarga_por_dia/elComentario/scrapy.cfg
+++ b/descarga_por_dia/elComentario/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = elComentario.settings
+[deploy]
+#url = http://localhost:6800/
+project = elComentario
--- a/descarga_por_dia/elSur/elSur/__init__.py
+++ b/descarga_por_dia/elSur/elSur/__init__.py
--- a/descarga_por_dia/elSur/elSur/items.py
+++ b/descarga_por_dia/elSur/elSur/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Container for the data scraped from one news article."""
+    # Each scrapy.Field() below declares one key the item accepts.
+    # The spider in spiders/noticias.py assigns date, title, topic, text and url;
+    # location and author are declared here but not assigned by that spider.
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_por_dia/elSur/elSur/middlewares.py
+++ b/descarga_por_dia/elSur/elSur/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class ElsurSpiderMiddleware(object):
+    """
+    Pass-through spider middleware: every hook hands its input on unchanged.
+    """
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and connect it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Return None for every response on its way into the spider."""
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Re-yield, unchanged and in order, everything in ``result``."""
+        for produced in result:
+            yield produced
+    def process_spider_exception(self, response, exception, spider):
+        """Return None, i.e. this middleware does not handle the exception."""
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Re-yield the spider's start requests unchanged and in order."""
+        for request in start_requests:
+            yield request
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class ElsurDownloaderMiddleware(object):
+    """
+    Pass-through downloader middleware: requests, responses and download
+    exceptions are all handed on without modification.
+    """
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and connect it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        """Return None for every request, so its processing simply continues."""
+        return None
+    def process_response(self, request, response, spider):
+        """Return the very response object that was received."""
+        return response
+    def process_exception(self, request, exception, spider):
+        """Return None, so the exception keeps being processed elsewhere."""
+        return None
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_por_dia/elSur/elSur/pipelines.py
+++ b/descarga_por_dia/elSur/elSur/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """
+    Item pipeline that writes every scraped item into a single JSON array file.
+    The output path is taken from the ``filename`` setting
+    (``-s filename=...`` on the command line).
+    """
+    # Keys copied into each output record, in the order they are written.
+    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        """Remember the path of the output file; nothing is opened yet."""
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the value of the crawler's ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open (truncating) the output file and write the opening bracket."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket and close the output file."""
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        """
+        Append ``item`` to the output file as one JSON object and return it.
+        Only the keys in _FIELD_ORDER are written; keys the item does not
+        hold are left out of the record.
+        """
+        row = []
+        for key in self._FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not set on this item. Only this case is skipped;
+                # any other error is no longer silently swallowed.
+                continue
+        line = OrderedDict(row)
+        self.counter += 1
+        # Every record after the first is preceded by ",\n".
+        separator = "" if self.counter == 1 else ",\n"
+        self.file.write(separator + json.dumps(line))
+        return item
--- a/descarga_por_dia/elSur/elSur/settings.py
+++ b/descarga_por_dia/elSur/elSur/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for elSur project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# Project name and the package that holds this project's spiders.
+BOT_NAME = 'elSur'
+SPIDER_MODULES = ['elSur.spiders']
+NEWSPIDER_MODULE = 'elSur.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'elSur (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# NOTE(review): the line below is commented out, so this project does not set
+# ROBOTSTXT_OBEY itself -- confirm that this is intended.
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# Active override: 0.5 instead of the default 0 mentioned above.
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+# Active override: cookies are switched off for this project.
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'elSur.middlewares.ElsurSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'elSur.middlewares.ElsurDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# Active: JsonWriterPipeline (elSur/pipelines.py) is the only pipeline enabled;
+# it writes the scraped items to the file named by the "filename" setting.
+ITEM_PIPELINES = {
+   'elSur.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/elSur/elSur/spiders/__init__.py
+++ b/descarga_por_dia/elSur/elSur/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/elSur/elSur/spiders/noticias.py
+++ b/descarga_por_dia/elSur/elSur/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    El Sur, Guerrero
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd elSur/
+    $ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
+"""
+import scrapy, re
+from elSur.items import NoticiasItem
+# Matches one markup tag: "<", one or more characters other than ">", then ">".
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every substring matched by TAG_RE deleted."""
+    stripped = re.sub(TAG_RE, '', text)
+    return stripped
+# Credit headers that parse_item deletes from the article text:
+#   HEAD_RE_1 -- a "Texto: ... / Foto: ..." (or "... y Foto: ...") line,
+#   HEAD_RE_2 -- a "Texto y foto: ..." line,
+# each optionally followed by text ending in "<day> de <month> de <year>.".
+HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
+HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider that downloads the articles listed on the El Sur archive page of
+    one date. The date comes from the ``year``, ``month`` and ``day`` spider
+    arguments (``-a year=2018 -a month=9 -a day=5``).
+    """
+    name = "noticias"
+    def start_requests(self):
+        """
+        Yield the request for the archive page of the requested date.
+        Raises ValueError when ``year``, ``month`` or ``day`` was not supplied.
+        """
+        year  = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        day   = getattr(self, "day", None)
+        # Without this check a missing argument surfaced as an obscure
+        # AttributeError on None.zfill (or as the text "None" inside the URL).
+        if year is None or month is None or day is None:
+            raise ValueError("missing date: scrapy crawl noticias -a year=YYYY -a month=M -a day=D")
+        # str() keeps zfill working if the values are handed over as integers.
+        baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, str(month).zfill(2), str(day).zfill(2))
+        yield scrapy.Request(url=baseURL, callback=self.parse)
+    def parse(self, response):
+        """Request every article linked from an archive page, then the next archive page."""
+        for link in response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract():
+            # urljoin leaves absolute URLs untouched and resolves relative ones.
+            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)
+        # The first two pagination entries are skipped. Slicing, unlike two
+        # consecutive "del pag_lst[0]", cannot raise IndexError when the list
+        # holds fewer than two entries.
+        pag_lst = response.css('div.dslc-pagination > ul > li')[2:]
+        next_page = None
+        for li_obj in pag_lst:
+            li = remove_tags(li_obj.extract())
+            # The first entry whose text is not a plain number supplies the link to follow.
+            if not li.isdigit():
+                next_page = li_obj.xpath('./a/@href').extract_first()
+                break
+        if next_page is not None:
+            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
+    def parse_item(self, response):
+        """
+        Build one NoticiasItem (date, title, topic, text, url) from an article page.
+        Title and topic stay None when their node is not found.
+        """
+        item = NoticiasItem()
+        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        title = response.css('div.dslc-tp-title > h1').extract_first()
+        if title is not None:
+            title = remove_tags(title)
+        topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic)
+        # One line per <p> of the article body, markup removed.
+        paragraphs = response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract()
+        text = "".join(remove_tags(p) + "\n" for p in paragraphs)
+        # Drop the dateline text, when the page has one, from the body.
+        dateline = response.css('span.dateline').extract_first()
+        if dateline is not None:
+            dateline = remove_tags(dateline)
+            text = text.replace(dateline, '')
+        # Non-breaking spaces become plain spaces before the header patterns run.
+        text = text.replace(u'\u00a0', ' ')
+        text = HEAD_RE_1.sub('', text)
+        text = HEAD_RE_2.sub('', text)
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
--- a/descarga_por_dia/elSur/scrapy.cfg
+++ b/descarga_por_dia/elSur/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = elSur.settings
+[deploy]
+#url = http://localhost:6800/
+project = elSur
--- a/descarga_por_dia/foraneos/diarioCoLatino/diarioCoLatino/spiders/noticias.py
+++ b/descarga_por_dia/foraneos/diarioCoLatino/diarioCoLatino/spiders/noticias.py
 # -*- coding: utf-8 -*-
-import scrapy, re
-from diarioCoLatino.items import NoticiasItem
 """
-MEDIO:
+MEDIA:
-Diario Co Latino, El Salvador
+    Diario Co Latino, El Salvador
-USO:
-scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd diarioCoLatino/
+    $ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
 """
+import scrapy, re
+from diarioCoLatino.items import NoticiasItem
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
    return TAG_RE.sub('', text)
@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
 EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
    name = "noticias"
    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
    def parse_page(self, response):
-        for link in response.xpath('//div[@class="post-listing"]/article/h2/a/@href').extract():
+        for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)
    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
-        "La fecha obtenida ya incluye formato y zona horaria"
+        # La fecha obtenida ya incluye formato y zona horaria
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
+        news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
-        item['topic'] = None
+        news_topic = None
        for p in response.xpath('//div[@class="entry"]/p').extract():
            text += remove_tags(p) + "\n"
@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider):
        text = "\n" + text
        """ Obtiene autor """
+        news_author = None
        res = AUTH_RE.match(text)
        if res:
            m = res.group(0)
-            item['author'] = m[m.find('Por')+len('Por'):].strip()
+            news_author = m[m.find('Por')+len('Por'):].strip()
            text = text[text.find(m) + len(m):].strip()
            text = "\n" + text
        """ Elimina twitter """
+        news_twitter = None
        res = TW_RE.search(text)
        if res:
            m = res.group(0)
-            item['twitter'] = m.strip()
+            news_twitter = m.strip()
            text = text[text.find(m) + len(m):].strip()
            text = "\n" + text
        """ Obtiene lugar """
+        news_loc = None
        res = LOC_RE.match(text)
        if res:
            m = res.group(0)
            if m[m.find('/') + 1:].strip().lower() != 'dpa':
-                item['location'] = m[:m.find('/')].strip()
+                news_loc = m[:m.find('/')].strip()
                text = text[text.find(m) + len(m):].strip()
                text = "\n" + text
            else:
@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider):
                text = "\n" + text
        """ Elimina correo """
+        news_email = None
        res = EM_RE.search(text)
        if res:
            m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
            # text = text[text.find(m) + len(m):].strip()
            text = text.replace(m, '').strip()
            text = "\n" + text
@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider):
        res = EM_RE.search(text)
        if res:
            m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
            # text = text[text.find(m) + len(m):].strip()
            text = text.replace(m, '').strip()
            text = "\n" + text
@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider):
        text = "\n" + text
        text = text.replace("\nCo Latino\n", '').strip()
-        item['text'] = text.strip()
+        ## News item info ##
+        item['date']   = news_date
+        item['title']  = news_title
+        item['topic']  = news_topic
+        item['author'] = news_author
+        item['twitter'] = news_twitter
+        item['location'] = news_loc
+        item['email'] = news_email
+        item['text']  = text.strip()
        item['url']   = response.url
        yield item
--- a/descarga_por_dia/surDeCampeche/scrapy.cfg
+++ b/descarga_por_dia/surDeCampeche/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = surDeCampeche.settings
+[deploy]
+#url = http://localhost:6800/
+project = surDeCampeche
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/__init__.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/__init__.py
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/items.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Container for the data scraped from one news article."""
+    # Each scrapy.Field() below declares one key the item accepts.
+    # The spider in spiders/noticias.py assigns date, title, topic, text and url;
+    # location and author are declared here but not assigned by that spider.
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/middlewares.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class SurdecampecheSpiderMiddleware(object):
+    """
+    Pass-through spider middleware: every hook hands its input on unchanged.
+    """
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and connect it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Return None for every response on its way into the spider."""
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Re-yield, unchanged and in order, everything in ``result``."""
+        for produced in result:
+            yield produced
+    def process_spider_exception(self, response, exception, spider):
+        """Return None, i.e. this middleware does not handle the exception."""
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Re-yield the spider's start requests unchanged and in order."""
+        for request in start_requests:
+            yield request
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class SurdecampecheDownloaderMiddleware(object):
+    """
+    Pass-through downloader middleware: requests, responses and download
+    exceptions are all handed on without modification.
+    """
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and connect it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        """Return None for every request, so its processing simply continues."""
+        return None
+    def process_response(self, request, response, spider):
+        """Return the very response object that was received."""
+        return response
+    def process_exception(self, request, exception, spider):
+        """Return None, so the exception keeps being processed elsewhere."""
+        return None
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/pipelines.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """
+    Item pipeline that writes every scraped item into a single JSON array file.
+    The output path is taken from the ``filename`` setting
+    (``-s filename=...`` on the command line).
+    """
+    # Keys copied into each output record, in the order they are written.
+    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        """Remember the path of the output file; nothing is opened yet."""
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the value of the crawler's ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open (truncating) the output file and write the opening bracket."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket and close the output file."""
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        """
+        Append ``item`` to the output file as one JSON object and return it.
+        Only the keys in _FIELD_ORDER are written; keys the item does not
+        hold are left out of the record.
+        """
+        row = []
+        for key in self._FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # Field not set on this item. Only this case is skipped;
+                # any other error is no longer silently swallowed.
+                continue
+        line = OrderedDict(row)
+        self.counter += 1
+        # Every record after the first is preceded by ",\n".
+        separator = "" if self.counter == 1 else ",\n"
+        self.file.write(separator + json.dumps(line))
+        return item
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/settings.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for surDeCampeche project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# Project name and the package that holds this project's spiders.
+BOT_NAME = 'surDeCampeche'
+SPIDER_MODULES = ['surDeCampeche.spiders']
+NEWSPIDER_MODULE = 'surDeCampeche.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'surDeCampeche (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# NOTE(review): the line below is commented out, so this project does not set
+# ROBOTSTXT_OBEY itself -- confirm that this is intended.
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# Active override: 0.5 instead of the default 0 mentioned above.
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+# Active override: cookies are switched off for this project.
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'surDeCampeche.middlewares.SurdecampecheSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'surDeCampeche.middlewares.SurdecampecheDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# Active: JsonWriterPipeline (surDeCampeche/pipelines.py) is the only pipeline
+# enabled; it writes the scraped items to the file named by the "filename" setting.
+ITEM_PIPELINES = {
+   'surDeCampeche.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/spiders/__init__.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/surDeCampeche/surDeCampeche/spiders/noticias.py
+++ b/descarga_por_dia/surDeCampeche/surDeCampeche/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    El Sur de Campeche, Campeche
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd surDeCampeche/
+    $ scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
+"""
+import scrapy, re
+from surDeCampeche.items import NoticiasItem
+# Matches one markup tag: "<", one or more characters other than ">", then ">".
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every substring matched by TAG_RE deleted."""
+    return TAG_RE.sub('', text)
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider that downloads the articles listed on the El Sur de Campeche
+    archive pages of one date. The date comes from the ``year``, ``month``
+    and ``day`` spider arguments (``-a year=2018 -a month=8 -a day=10``).
+    """
+    name = "noticias"
+    def start_requests(self):
+        """
+        Store the archive URL of the requested date in ``self.baseURL`` and request it.
+        Raises ValueError when ``year``, ``month`` or ``day`` was not supplied.
+        """
+        year  = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        day   = getattr(self, "day", None)
+        # Without this check a missing argument surfaced as a TypeError from
+        # concatenating a string with None.
+        if year is None or month is None or day is None:
+            raise ValueError("missing date: scrapy crawl noticias -a year=YYYY -a month=M -a day=D")
+        # str() keeps the concatenation working if the values are integers.
+        self.baseURL = "http://www.elsur.mx/" + str(year) + "/" + str(month) + "/" + str(day)
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        """Send the first archive page, and every further numbered page, to parse_page."""
+        # The page already downloaded is requested again for parse_page;
+        # dont_filter keeps the duplicate filter from discarding it.
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.xpath('//a[@class="page-numbers"]/@href').extract()
+        if pagination:
+            # The page count is the last path segment of the last pagination link.
+            pages = pagination[-1].rstrip("/")
+            pages = int(pages[pages.rfind("/") + 1:])
+            for page in range(2, pages + 1):
+                yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)
+    def parse_page(self, response):
+        """Request every article linked from one archive page."""
+        for link in response.css('div.news_box_inner_content').css('div.news_box_item_content').xpath('./h3/a/@href').extract():
+            # urljoin leaves absolute URLs untouched and resolves relative ones.
+            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)
+    def parse_item(self, response):
+        """
+        Build one NoticiasItem (date, title, topic, text, url) from an article page.
+        Title and topic stay None when their node is not found.
+        """
+        item = NoticiasItem()
+        # The date obtained already includes its format and time zone.
+        item['date']  = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        # A missing title used to crash inside remove_tags(None) and lose the item.
+        title = response.xpath('//div[@class="post_title_wrapper"]/h1').extract_first()
+        if title is not None:
+            title = remove_tags(title)
+        item['title'] = title
+        # Explicit None check instead of a bare "except:" around remove_tags.
+        topic = response.css('span.blog_meta_category').css('a').extract_first()
+        if topic is not None:
+            topic = remove_tags(topic)
+        item['topic'] = topic
+        # One line per <p> of the article body, markup removed.
+        paragraphs = response.css('div.entry-content').css('p').extract()
+        item['text'] = "".join(remove_tags(p) + "\n" for p in paragraphs)
+        item['url'] = response.url
+        yield item
--- a/descarga_por_dia/tribunaCampeche/scrapy.cfg
+++ b/descarga_por_dia/tribunaCampeche/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = tribunaCampeche.settings
+[deploy]
+#url = http://localhost:6800/
+project = tribunaCampeche
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/__init__.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/__init__.py
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/items.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Container for one scraped news article.
+
+    Declares the fields ``title``, ``text``, ``date``, ``location``,
+    ``author``, ``topic`` and ``url``; each is a plain ``scrapy.Field()``
+    with no extra metadata.
+    """
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/middlewares.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class TribunacampecheSpiderMiddleware(object):
+    """Pass-through spider middleware.
+
+    Every hook hands back what it receives without modifying it; the only
+    side effect is an info log line when a spider opens. Hooks that are not
+    defined are treated by Scrapy as not modifying the passed objects.
+    """
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Hook for each response on its way into the spider.
+
+        Should return None or raise an exception; this one always returns None.
+        """
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Yield every element of ``result`` unchanged.
+
+        Must produce an iterable of Request, dict or Item objects.
+        """
+        for element in result:
+            yield element
+    def process_spider_exception(self, response, exception, spider):
+        """Hook for exceptions raised by a spider or by ``process_spider_input()``.
+
+        Should return None or an iterable of Response, dict or Item objects;
+        this one returns None.
+        """
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Yield every start request unchanged (requests only, never items)."""
+        for request in start_requests:
+            yield request
+    def spider_opened(self, spider):
+        """Log the name of the spider that was just opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class TribunacampecheDownloaderMiddleware(object):
+    """Pass-through downloader middleware.
+
+    Requests and responses go through untouched; the only side effect is an
+    info log line when a spider opens. Hooks that are not defined are treated
+    by Scrapy as not modifying the passed objects.
+    """
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        """Hook for each request going through the downloader middleware.
+
+        A hook of this kind must return None (continue processing), a
+        Response, a Request, or raise IgnoreRequest. This one returns None.
+        """
+        return None
+    def process_response(self, request, response, spider):
+        """Return the downloaded ``response`` unchanged.
+
+        A hook of this kind must return a Response, a Request, or raise
+        IgnoreRequest.
+        """
+        return response
+    def process_exception(self, request, exception, spider):
+        """Hook for exceptions from a download handler or ``process_request()``.
+
+        Returning None lets exception processing continue; returning a
+        Response or Request would stop the ``process_exception()`` chain.
+        This one returns None.
+        """
+        return None
+    def spider_opened(self, spider):
+        """Log the name of the spider that was just opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/pipelines.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """Item pipeline that writes every item into a single JSON array file.
+
+    The output path is read from the ``filename`` setting, i.e. the value
+    passed as ``-s filename=...`` on the command line.
+    """
+    # Order in which populated item fields are written to each JSON object.
+    _FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        """Remember the path of the output file."""
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Instantiate the pipeline with the value of the ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and write the opening bracket of the array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket of the array and close the file."""
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        """Append ``item`` to the output file as one JSON object.
+
+        Only the fields listed in ``_FIELD_ORDER`` are written, in that
+        order; fields missing from the item are left out.
+
+        Returns:
+            The same ``item``, unchanged.
+        """
+        row = []
+        for field in self._FIELD_ORDER:
+            try:
+                row.append((field, item[field]))
+            except KeyError:
+                # Field not populated on this item: omit it from the object.
+                continue
+        # Serialize before touching the counter so a failed dump cannot
+        # leave a separator pending for an object that was never written.
+        line = json.dumps(OrderedDict(row))
+        self.counter += 1
+        # Every object after the first is preceded by a separator.
+        separator = "" if self.counter == 1 else ",\n"
+        self.file.write(separator + line)
+        return item
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/settings.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for tribunaCampeche project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'tribunaCampeche'
+SPIDER_MODULES = ['tribunaCampeche.spiders']
+NEWSPIDER_MODULE = 'tribunaCampeche.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'tribunaCampeche (+http://www.yourdomain.com)'
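+# Obey robots.txt rules (NOT active: with the line below commented out, Scrapy falls back to its default, ROBOTSTXT_OBEY = False)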
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'tribunaCampeche.middlewares.TribunacampecheSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'tribunaCampeche.middlewares.TribunacampecheDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'tribunaCampeche.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/spiders/__init__.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/tribunaCampeche/tribunaCampeche/spiders/noticias.py
+++ b/descarga_por_dia/tribunaCampeche/tribunaCampeche/spiders/noticias.py
+# -*- coding: utf-8 -*-
+import scrapy, re
+from tribunaCampeche.items import NoticiasItem
+"""
+MEDIO:
+Tribuna, Campeche
+USO:
+scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
+"""
+# Matches one markup tag: "<", one or more characters other than ">", then ">".
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
+    stripped = re.sub(TAG_RE, '', text)
+    return stripped
+class QuotesSpider(scrapy.Spider):
+    """Spider that downloads the articles Tribuna (Campeche) published on one day.
+
+    The day is selected with the ``year``, ``month`` and ``day`` spider
+    arguments, e.g. ``-a year=2018 -a month=8 -a day=10``.
+    """
+    name = "noticias"
+    def start_requests(self):
+        """Build the archive URL for the requested day and request it.
+
+        Raises:
+            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
+        """
+        date_parts = [getattr(self, arg, None) for arg in ("year", "month", "day")]
+        if None in date_parts:
+            # Fail with an actionable message instead of the TypeError that
+            # concatenating None into the URL would raise.
+            raise ValueError("year, month and day spider arguments are required "
+                             "(-a year=YYYY -a month=M -a day=D)")
+        self.baseURL = "http://tribunacampeche.com/" + "/".join(str(part) for part in date_parts)
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        """Handle the first archive page and schedule the remaining ones."""
+        # The first page is already downloaded: extract its links from this
+        # response instead of requesting the same URL a second time.
+        for request in self.parse_page(response):
+            yield request
+        pagination = response.xpath('//a[@class="page-numbers"]/@href').extract()
+        if pagination:
+            # The final path segment of the last pagination link is read as
+            # the total number of pages.
+            last_link = pagination[-1].rstrip("/")
+            pages = int(last_link[last_link.rfind("/") + 1:])
+            for page in range(2, pages + 1):
+                yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)
+    def parse_page(self, response):
+        """Request every article linked from one archive listing page."""
+        links = response.css('div.vw-post-box').css('div.vw-post-box-inner').xpath('./h3/a/@href').extract()
+        for link in links:
+            yield scrapy.Request(url=link, callback=self.parse_item)
+    def parse_item(self, response):
+        """Build and yield one ``NoticiasItem`` from an article page.
+
+        ``title`` and ``topic`` are set to None when their node is not found.
+        """
+        item = NoticiasItem()
+        # The extracted date already includes its format and time zone.
+        item['date'] = response.xpath('//time[@itemprop="datePublished"]/@datetime').extract_first()
+        # extract_first() returns None when nothing matches; keep that None
+        # rather than letting remove_tags() fail and lose the whole item.
+        title = response.xpath('//h1[@class="entry-title"]').extract_first()
+        item['title'] = remove_tags(title) if title is not None else None
+        topic = response.css('article.vw-main-post').xpath('./div[@class="vw-post-categories"]/div/a').extract_first()
+        item['topic'] = remove_tags(topic) if topic is not None else None
+        paragraphs = response.css('div.vw-post-content').css('p').extract()
+        item['text'] = "".join(remove_tags(p) + "\n" for p in paragraphs)
+        item['url'] = response.url
+        yield item
--- a/descarga_por_mes/proceso/proceso/items.py
+++ b/descarga_por_mes/proceso/proceso/items.py
@@ -3,12 +3,18 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://doc.scrapy.org/en/latest/topics/items.html
 import scrapy
-class ProcesoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_por_mes/proceso/proceso/middlewares.py
+++ b/descarga_por_mes/proceso/proceso/middlewares.py
@@ -5,52 +5,100 @@
 # See documentation in:
 # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
-from scrapy import signals
+import os, tempfile, time, sys, logging, dryscrape
+from scrapy.downloadermiddlewares.redirect import RedirectMiddleware
+logger = logging.getLogger(__name__)
-class ProcesoSpiderMiddleware(object):
-    # Not all methods need to be defined. If a method is not defined,
-    # scrapy acts as if the spider middleware does not modify the
-    # passed objects.
-    @classmethod
+class ThreatDefenceRedirectMiddleware(RedirectMiddleware):
-    def from_crawler(cls, crawler):
+    def __init__(self, settings):
-        # This method is used by Scrapy to create your spiders.
+        super(ThreatDefenceRedirectMiddleware, self).__init__(settings)
-        s = cls()
-        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
-        return s
-    def process_spider_input(response, spider):
+        # start xvfb to support headless scraping
-        # Called for each response that goes through the spider
+        if 'linux' in sys.platform:
-        # middleware and into the spider.
+            dryscrape.start_xvfb()
-        # Should return None or raise an exception.
+        self.dryscrape_session = dryscrape.Session(base_url='https://hemeroteca.proceso.com.mx/')
-        return None
+        for key, value in settings['DEFAULT_REQUEST_HEADERS'].items():
+            # seems to be a bug with how webkit-server handles accept-encoding
+            if key.lower() != 'accept-encoding':
+                self.dryscrape_session.set_header(key, value)
-    def process_spider_output(response, result, spider):
+    def _redirect(self, redirected, request, spider, reason):
-        # Called with the results returned from the Spider, after
+        # act normally if this isn't a threat defense redirect
-        # it has processed the response.
+        if not self.is_threat_defense_url(redirected.url):
+            return super(ThreatDefenceRedirectMiddleware, self)._redirect(redirected, request, spider, reason)
-        # Must return an iterable of Request, dict or Item objects.
+        logger.debug('Proceso threat defense triggered for {0}'.format(request.url))
-        for i in result:
+        request.cookies = self.bypass_threat_defense(redirected.url)
-            yield i
+        request.dont_filter = True # prevents the original link being marked a dupe
+        return request
-    def process_spider_exception(response, exception, spider):
+    def is_threat_defense_url(self, url):
-        # Called when a spider or process_spider_input() method
+        return 'proceso.com.mx' in url
-        # (from other spider middleware) raises an exception.
-        # Should return either None or an iterable of Response, dict
+    def bypass_threat_defense(self, url=None):
-        # or Item objects.
+        # only navigate if any explicit url is provided
-        pass
+        if url:
+            self.dryscrape_session.visit(url)
-    def process_start_requests(start_requests, spider):
+        # solve the captcha if there is one
-        # Called with the start requests of the spider, and works
+        # captcha_images = self.dryscrape_session.css('img[src *= captcha]')
-        # similarly to the process_spider_output() method, except
+        # if len(captcha_images) > 0:
-        # that it doesn’t have a response associated.
+        #     return self.solve_captcha(captcha_images[0])
-        # Must return only requests (not items).
+        # click on any explicit retry links
-        for r in start_requests:
+        # retry_links = self.dryscrape_session.css('a[href *= threat_defence]')
-            yield r
+        # if len(retry_links) > 0:
+        #     return self.bypass_threat_defense(retry_links[0].get_attr('href'))
-    def spider_opened(self, spider):
+        # otherwise, we're on a redirect page so wait for the redirect and try again
-        spider.logger.info('Spider opened: %s' % spider.name)
+        self.wait_for_redirect()
+        return self.bypass_threat_defense()
+    def wait_for_redirect(self, url = None, wait = 0.1, timeout=10):
+        """Poll the dryscrape session until its URL differs from ``url``.
+
+        Args:
+            url: URL the session is expected to leave; defaults to the
+                session's current URL.
+            wait: seconds slept before each poll.
+            timeout: approximate total number of seconds to keep polling.
+
+        Returns:
+            The session URL observed once it no longer equals ``url``.
+
+        Raises:
+            Exception: if the URL is still unchanged when the polls run out.
+        """
+        url = url or self.dryscrape_session.url()
+        # Round instead of floor-dividing: 10 // 0.1 evaluates to 99.0 in
+        # floating point, which silently dropped the last poll.
+        attempts = int(round(timeout / float(wait)))
+        for _ in range(attempts):
+            time.sleep(wait)
+            current_url = self.dryscrape_session.url()
+            if current_url != url:
+                return current_url
+        logger.error("Maybe {0} isn't a redirect URL?".format(self.dryscrape_session.url()))
+        # The message previously named "zipru", a site this project does not crawl.
+        raise Exception('Timed out on the proceso redirect page.')
+    # def solve_captcha(self, img, width=1280, height=800):
+    #     # take a screenshot of the page
+    #     self.dryscrape_session.set_viewport_size(width, height)
+    #     filename = tempfile.mktemp('.png')
+    #     self.dryscrape_session.render(filename, width, height)
+    #     # inject javascript to find the bounds of the captcha
+    #     js = 'document.querySelector("img[src *= captcha]").getBoundingClientRect()'
+    #     rect = self.dryscrape_session.eval_script(js)
+    #     box = (int(rect['left']), int(rect['top']), int(rect['right']), int(rect['bottom']))
+    #     # solve the captcha in the screenshot
+    #     image = Image.open(filename)
+    #     os.unlink(filename)
+    #     captcha_image = image.crop(box)
+    #     captcha = pytesseract.image_to_string(captcha_image)
+    #     logger.debug(f'Solved the Zipru captcha: "{captcha}"')
+    #     # submit the captcha
+    #     input = self.dryscrape_session.xpath('//input[@id = "solve_string"]')[0]
+    #     input.set(captcha)
+    #     button = self.dryscrape_session.xpath('//button[@id = "button_submit"]')[0]
+    #     url = self.dryscrape_session.url()
+    #     button.click()
+    #     # try again if it we redirect to a threat defense URL
+    #     if self.is_threat_defense_url(self.wait_for_redirect(url)):
+    #         return self.bypass_threat_defense()
+    #     # otherwise return the cookies as a dict
+    #     cookies = {}
+    #     for cookie_string in self.dryscrape_session.cookies():
+    #         if 'domain=zipru.to' in cookie_string:
+    #             key, value = cookie_string.split(';')[0].split('=')
+    #             cookies[key] = value
+    #     return cookies
--- a/descarga_por_mes/proceso/proceso/pipelines.py
+++ b/descarga_por_mes/proceso/proceso/pipelines.py
@@ -3,9 +3,73 @@
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class ProcesoPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
--- a/descarga_por_mes/proceso/proceso/settings.py
+++ b/descarga_por_mes/proceso/proceso/settings.py
@@ -16,10 +16,10 @@ NEWSPIDER_MODULE = 'proceso.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,22 +27,27 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 1
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = True
+COOKIES_DEBUG = True
+SPLASH_COOKIES_DEBUG = True
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#     'User-Agent': USER_AGENT,
+#     'Connection': 'Keep-Alive',
+#     # 'Accept-Encoding': 'gzip, deflate',
 #     'Accept-Language': 'en',
-#}
+# }
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
@@ -54,10 +59,11 @@ SPIDER_MIDDLEWARES = {
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 DOWNLOADER_MIDDLEWARES = {
-   # 'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
+    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+    # 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
 }
 # Enable or disable extensions
@@ -68,9 +74,9 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+ITEM_PIPELINES = {
-#    'proceso.pipelines.ProcesoPipeline': 300,
+   'proceso.pipelines.JsonWriterPipeline': 300,
-#}
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html

--- a/descarga_por_mes/proceso/proceso/settings_org.py
+++ b/descarga_por_mes/proceso/proceso/settings_org.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for proceso project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'proceso'
+SPIDER_MODULES = ['proceso.spiders']
+NEWSPIDER_MODULE = 'proceso.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'proceso (+http://www.yourdomain.com)'
+# USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
+# Obey robots.txt rules
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 1
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+SPIDER_MIDDLEWARES = {
+   # 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
+   'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
+}
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+   # 'proceso.middlewares.MyCustomDownloaderMiddleware': 543,
+   'scrapy_splash.SplashCookiesMiddleware': 723,
+   'scrapy_splash.SplashMiddleware': 725,
+   'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+}
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'proceso.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
+SPLASH_URL = 'http://localhost:8050/'
+DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
--- a/descarga_por_mes/proceso/proceso/spiders/noticias.py
+++ b/descarga_por_mes/proceso/proceso/spiders/noticias.py
-import scrapy, re
+# -*- coding: utf-8 -*-
-from datetime import datetime, date, timedelta, tzinfo
-from scrapy_splash import SplashRequest
 """
-Para este sitio se hace uso de 'scrapy-splash' porque el contenido es cargado a traves de javascript
+MEDIA:
+    Proceso, CDMX
+USAGE:
+    ## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
+    ## Read especs_sitio_proceso.txt file. ##
+    $ cd proceso/
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news, from the most recent to the oldest. Afterwards, run parse_date_files.py
+    to split the news contained in noticias.json into files by date. ##
-USO:
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific year-month date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
 """
+import scrapy, re, cfscrape
+from proceso.items import NoticiasItem
+from datetime import datetime, date, timedelta, tzinfo
+from scrapy.http.cookies import CookieJar
+from scrapy_splash import SplashRequest, SplashFormRequest
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
    return TAG_RE.sub('', text)
-class UTC(tzinfo):
-	"""clase para el 'time zone' (zona horaria)"""
+class UTC(tzinfo):
+    """
+    Class for Time Zone
+    """
    def utcoffset(self, dt):
-		# zona horaria para el centro de mexico: utc-6
+        ## Time zone for CDMX: UTC-6 ##
        return timedelta(hours=-6)
    def tzname(self, dt):
-		# nombre de la zona horaria
+        ## Time zone name ##
        return 'UTC-6'
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
-class NoticiasItem(scrapy.Item):
+script = """
-	title = scrapy.Field()
+function main(splash)
-	text = scrapy.Field()
+  splash:init_cookies(splash.args.cookies)
-	date = scrapy.Field()
+  assert(splash:go{
-	location = scrapy.Field()
+    splash.args.url,
-	author = scrapy.Field()
+    headers=splash.args.headers,
-	topic = scrapy.Field()
+    http_method=splash.args.http_method,
-	url = scrapy.Field()
+    body=splash.args.body,
+    })
+  assert(splash:wait(0.5))
+  local entries = splash:history()
+  local last_response = entries[#entries].response
+  return {
+    url = splash:url(),
+    headers = last_response.headers,
+    http_status = last_response.status,
+    cookies = splash:get_cookies(),
+    html = splash:html(),
+  }
+end
+"""
 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
    name = "noticias"
    def start_requests(self):
+        """
+        Read the optional 'year' and 'month' spider arguments, set the stop date and
+        request the login page through Splash (handled by parse_login).
+        """
        self.tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
-		# day = getattr(self, 'day', None)
-		parse_month = {'1': 'enero',       '2': 'febrero',  '3': 'marzo',      '4': 'abril',
-					   '5': 'mayo',        '6': 'junio',    '7': 'julio',      '8': 'agosto',
-					   '9': 'septiembre', '10': 'octubre', '11': 'noviembre', '12': 'diciembre'}
-		self.date = parse_month[month]+' de '+year
+        ## Spanish month name -> two-digit month number (used by parse_with_stop_date) ##
+        self.month_parser = {
+            'enero'      : '01', 'febrero' : '02',  'marzo'     : '03',  'abril'     : '04',
+            'mayo'       : '05', 'junio'   : '06',  'julio'     : '07',  'agosto'    : '08',
+            'septiembre' : '09', 'octubre' : '10',  'noviembre' : '11',  'diciembre' : '12'
+        }
+        self.baseURL = "https://hemeroteca.proceso.com.mx/"
+        login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
+        if year is not None and month is not None:
+            ## The stop date is always day 15 of the requested month ##
+            self.stop_date = date(int(year), int(month), 15)
+            # yield scrapy.Request(url=login_url, callback=self.parse_with_stop_date)
+        else:
+            ## No year/month given: crawl without a stop date ##
+            self.stop_date = None
+            # yield scrapy.Request(url=login_url, callback=self.parse)
+        ## get_tokens returns the cookies and the user agent that are sent with the login request
+        ## (presumably to get past the site's anti-bot check -- confirm) ##
+        token, agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
+        print token
+        print "\n"
+        yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
+                    headers={'User-Agent' : agent})
+    def parse_login(self, response):
+        return SplashFormRequest.from_response(
+            response,
+            formdata = {
+                'log' : 'carlos_silvaforne@yahoo.com.mx',
+                'pwd' : 'carlos_silvaforne@'
+            },
+            callback = self.after_login,
+            dont_click = True
+        )
+    def after_login(self, response):
+        """
+        Check the response of the login form and, when a session is open, request the
+        archive front page (self.baseURL) to start the crawl.
+        """
+        ## Check login succeed before going on ##
+        print response.headers
+        print "\n"
+        print response.real_url
+        print "\n"
+        print response.request.headers
+        print "\n"
+        cookie_list = response.request.headers.getlist('Cookie')
+        ## NOTE(review): assumes the request carried a Cookie header holding exactly two
+        ## ';'-separated cookies; otherwise the lines below raise IndexError/ValueError ##
+        cfc, cfd = cookie_list[0].split(';')
+        cfc = cfc.strip().split('=')
+        cfd = cfd.strip().split('=')
+        ## NOTE(review): the list is overwritten right away and 'cookies' is not used again in this method ##
+        cookies = [cfc[1], cfd[1]]
+        cookies = {cfc[0]: cfc[1], cfd[0]: cfd[1]}
+        ## NOTE(review): [-1] raises IndexError when there is no 'div.topnav > a' element,
+        ## so the None check below never sees None ##
+        session_legend = response.css('div.topnav > a').extract()[-1]
+        print response.css('h1.entry-title').extract_first()
+        print "\n"
+        if session_legend is not None:
+            session_legend = remove_tags(session_legend)
-		self.baseURL='http://hemeroteca.proceso.com.mx/?page_id=111058&edicion=mexico&page='
+            ## The last top-navigation link must contain "Cerrar" for the login to count as successful ##
+            if not "Cerrar" in session_legend:
+                print "Login failed."
-		yield scrapy.Request(url=self.baseURL+self.year, callback=self.parse)
+            else:
+                print session_legend
+                print "\n"
+                ## NOTE(review): token is only printed; it is not attached to the requests below ##
+                token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
+                print token
+                print "\n"
+                if self.stop_date is None:
+                    ## NOTE(review): no parse() method is defined in the visible new version of this class ##
+                    yield scrapy.Request(url=self.baseURL, callback=self.parse)
-	def parse(self, response):
+                else:
-		for post in response.css('div.catpor-box'):
+                    ## Reuse the meta of the login request for the next Splash request ##
+                    self.meta = response.request.meta
-			post_date = post.xpath('./div/span[@class="catpor-published clearfix"]/text()').extract_first()
+                    yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
-			post_date = post_date[post_date.find('d')+3:]
+                        meta=self.meta,
+                        endpoint='execute',
+                        cache_args=['lua_source'],
+                        args={'lua_source': script},
+                        headers={'User-Agent': USER_AGENT}
+                    )
+                    # request = SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
+                    #     endpoint='execute',
+                    #     cache_args=['lua_source'],
+                    #     args={'lua_source': script},
+                    #     headers={'User-Agent': USER_AGENT}
+                    # )
+                    # request.meta['splash']['session_id'] = cookie_list[0]
+                    # yield request
-			if post_date == self.date:
-				link = post.xpath('./div/div/a/@href').extract_first()
-				yield scrapy.Request(url=link, callback=self.parse_2)
-	def parse_2(self, response):
+        # if "authentication failed" in response.body:
-		for link in response.xpath('//*[@class="post-container clearfix"]/h2/a/@href').extract():
+        #     self.logger.error("Login failed.")
-			# yield scrapy.Request(url=link, callback=self.parse_item)
+        #     return
-			yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 })
+        # else:
+        #     # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
+        #     if self.stop_date is None:
+        #         yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
+        #     else:
+        #         yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
+                # yield scrapy.Request(
+                #     url=self.baseURL,
+                #     callback=self.parse_with_stop_date,
+                #     cookies=token,
+                #     headers={'User-Agent' : agent}
+                # )
+    def parse_with_stop_date(self, response):
+        """
+        Walk a listing page: request every entry dated on or after self.stop_date
+        (handled by parse_links) and follow the "next" page link until an older entry is found.
+        """
+        print "parse_with_stop_date"
+        print "\n"
+        print response.css('h1.entry-title').extract_first()
+        # print "\n"
+        # print response.cookiejar
+        print "\n"
+        print response.headers
+        print "\n"
+        # session_legend = response.css('div.topnav > a').extract()[-1]
+        # if session_legend is not None :
+        #     print remove_tags(session_legend)
+        #     print "\n"
+        # else :
+        #     print "No log in."
+        ## Stays True while no entry older than the stop date has been seen on this page ##
+        TO_NEXT_PAGE = True
+        for item in response.css('div.catpor-box > div'):
+            item_date = item.css('span.catpor-published').extract_first()
+            if item_date is not None:
+                ## Presumably the text reads "<day> <month name>, <year>" -- confirm against the site.
+                ## It is turned into [day, month, year] integers and then into a date ##
+                item_date    = remove_tags(item_date).replace(",", '')
+                item_date    = item_date.split(' ')
+                item_date[1] = self.month_parser[item_date[1]]
+                item_date    = map(int, item_date)
+                item_date    = date(item_date[2], item_date[1], item_date[0])
+                if item_date >= self.stop_date:
+                    item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
+                    print item_link
+                    # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
+                    # yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=cookies)
+                    # yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=token,
+                    #                         headers={'User-Agent' : agent})
+                    yield SplashRequest(url=item_link, callback=self.parse_links,
+                        endpoint='execute',
+                        cache_args=['lua_source'],
+                        args={'lua_source': script},
+                        headers={'User-Agent': USER_AGENT}
+                    )
+                else:
+                    ## First entry older than the stop date: stop here and do not paginate
+                    ## (assumes the listing is ordered from newest to oldest) ##
+                    TO_NEXT_PAGE = False
+                    break
+        if TO_NEXT_PAGE:
+            next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
+            if next_page is not None:
+                # yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
+                yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
+                    endpoint='execute',
+                    cache_args=['lua_source'],
+                    args={'lua_source': script},
+                    headers={'User-Agent': USER_AGENT}
+                )
+    def parse_links(self, response):
+        print "\n\n"
+        print response.headers
+        for link in response.css('div.post-container > h2 > a::attr(href)').extract():
+            # print link
+            # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
+            # yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
+            #         headers={'User-Agent' : agent})
+            yield SplashRequest(url=link, callback=self.parse_item,
+                endpoint='execute',
+                cache_args=['lua_source'],
+                args={'lua_source': script},
+                headers={'User-Agent': USER_AGENT}
+            )
    def parse_item(self, response):
+        # if response.url == "https://hemeroteca.proceso.com.mx/?page_id=278958&a51dc26366d99bb5fa29cea4747565fec=420203":
+        #     print response.body
        item = NoticiasItem()
        text = ''
-		d = response.xpath('//div[@id="primary"]/div/div/div/div/span[@class="published"]/text()').extract_first()
+        news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
-		d, t = d.split(' ')
+        if news_date is not None:
-		d = map(int, d.split('-'))
+            news_date = remove_tags(news_date)
-		t = map(int, t.split(':'))
+            print news_date
-		d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
+            d, t = news_date.split(' ')
-		item['date'] = d
+            d = map(int, d.split("-"))
+            t = map(int, t.split(":"))
+            news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
-		item['title'] = response.xpath('//div[@id="primary"]/div/h1/text()').extract_first()
+        title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
-		item['topic'] = response.xpath('//span[@class="entry-categories"]/text()').extract_first()
+        if title is not None : title = remove_tags(title)
-		for paragraph in response.xpath('//div[@id="primary"]/div/div/div/div[@class="entry-content"]/div/p').extract():
+        topic = response.css('span.entry-categories').extract_first()
+        if topic is not None : topic = remove_tags(topic)
+        for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
            text += remove_tags(paragraph) + '\n'
-		item['text'] = text
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
        item['url']   = response.url
-		# print item['title']
        yield item
--- a/descarga_por_mes/proceso/proceso/spiders/noticias_org.py
+++ b/descarga_por_mes/proceso/proceso/spiders/noticias_org.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Proceso, CDMX
+USAGE:
+    ## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
+    ## Read especs_sitio_proceso.txt file. ##
+    $ cd proceso/
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news, from the most recent to the oldest. Afterwards, run parse_date_files.py
+    to split the news contained in noticias.json into files by date. ##
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific year-month date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
+"""
+import scrapy, re, time, cfscrape
+from proceso.items import NoticiasItem
+from datetime import datetime, date, timedelta, tzinfo
+from scrapy_splash import SplashRequest, SplashFormRequest
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    return TAG_RE.sub('', text)
+class UTC(tzinfo):
+    """
+    Class for Time Zone
+    """
+    def utcoffset(self, dt):
+        ## Time zone for CDMX: UTC-6 ##
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-6'
+class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
+    # name = "noticias"
+    def start_requests(self):
+        self.tz = UTC()
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        self.month_parser = {
+            'enero'      : '01', 'febrero' : '02',  'marzo'     : '03',  'abril'     : '04',
+            'mayo'       : '05', 'junio'   : '06',  'julio'     : '07',  'agosto'    : '08',
+            'septiembre' : '09', 'octubre' : '10',  'noviembre' : '11',  'diciembre' : '12'
+        }
+        self.baseURL = "https://hemeroteca.proceso.com.mx/"
+        login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
+        if year is not None and month is not None:
+            self.stop_date = date(int(year), int(month), 22)
+            # yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date)
+        else:
+            self.stop_date = None
+            # yield scrapy.Request(url=self.baseURL, callback=self.parse)
+        # yield scrapy.Request(url=login_url, callback=self.parse_login)
+        yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html', args={ 'wait': 0.5 })
+    def parse_login(self, response):
+        print "parse_login"
+        # return scrapy.FormRequest.from_response(
+        #     response,
+        #     formdata = {
+        #         'log' : 'carlos_silvaforne@yahoo.com.mx',
+        #         'pwd' : 'carlos_silvaforne@'
+        #     },
+        #     callback = self.after_login
+        # )
+        return SplashFormRequest.from_response(
+            response,
+            formdata = {
+                'log' : 'carlos_silvaforne@yahoo.com.mx',
+                'pwd' : 'carlos_silvaforne@'
+                # 'log' : 'myusr',
+                # 'pwd' : 'mypwd'
+            },
+            callback = self.after_login,
+            # callback = self.parse_with_stop_date,
+            dont_click = True
+        )
+    def after_login(self, response):
+        ## Check login succeed before going on ##
+        print "after_login"
+        # print response.body
+        if "authentication failed" in response.body:
+            self.logger.error("Login failed.")
+            return
+        else:
+            print "passed"
+            # token, agent = cfscrape.get_tokens(self.baseURL)
+            if self.stop_date is None:
+                pass
+            #     yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
+            else:
+                # yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
+                # yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date)
+                yield scrapy.Request(
+                    url=self.baseURL,
+                    callback=self.parse_with_stop_date,
+                    cookies=token,
+                    headers={'User-Agent' : agent}
+                )
+    def parse_with_stop_date(self, response):
+        TO_NEXT_PAGE = True
+        for item in response.css('div.catpor-box > div'):
+            item_date = item.css('span.catpor-published').extract_first()
+            if item_date is not None:
+                item_date    = remove_tags(item_date).replace(",", '')
+                item_date    = item_date.split(' ')
+                item_date[1] = self.month_parser[item_date[1]]
+                item_date    = map(int, item_date)
+                item_date    = date(item_date[2], item_date[1], item_date[0])
+                if item_date >= self.stop_date:
+                    item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
+                    yield scrapy.Request(url=item_link, callback=self.parse_links)
+                else:
+                    TO_NEXT_PAGE = False
+                    break
+        if TO_NEXT_PAGE:
+            next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
+            if next_page is not None:
+                yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
+    def parse_links(self, response):
+        for link in response.css('div.post-container > h2 > a::attr(href)').extract():
+            # print link
+            yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 })
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
+        if news_date is not None:
+            news_date = remove_tags(news_date)
+            print news_date
+            d, t = news_date.split(' ')
+            d = map(int, d.split("-"))
+            t = map(int, t.split(":"))
+            news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
+        title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
+        if title is not None : title = remove_tags(title)
+        topic = response.css('span.entry-categories').extract_first()
+        if topic is not None : topic = remove_tags(topic)
+        for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
+            text += remove_tags(paragraph) + '\n'
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/__init__.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/__init__.py
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/items.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """
+    Container for the fields of one scraped news item. The same seven keys are the
+    ones written out by JsonWriterPipeline in pipelines.py.
+    """
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/middlewares.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class ProcesoPruebaSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+        # Should return None or raise an exception.
+        return None
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+class ProcesoPruebaDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/pipelines.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+        return item
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/settings.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for proceso_prueba project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'proceso_prueba'
+SPIDER_MODULES = ['proceso_prueba.spiders']
+NEWSPIDER_MODULE = 'proceso_prueba.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
+# Obey robots.txt rules
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 1
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Cookies stay enabled (the Scrapy default); the two *_DEBUG flags below turn on cookie logging
+COOKIES_ENABLED = True
+COOKIES_DEBUG = True
+SPLASH_COOKIES_DEBUG = True
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+SPIDER_MIDDLEWARES = {
+   # 'proceso_prueba.middlewares.ProcesoPruebaSpiderMiddleware': 543,
+   'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
+}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy_splash.SplashCookiesMiddleware': 723,
+    'scrapy_splash.SplashMiddleware': 725,
+    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'proceso_prueba.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
+SPLASH_URL = 'http://localhost:8050/'
+DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/spiders/__init__.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_mes/proceso_prueba/proceso_prueba/spiders/noticias.py
+++ b/descarga_por_mes/proceso_prueba/proceso_prueba/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Proceso, CDMX
+USAGE:
+    ## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
+    ## Read especs_sitio_proceso.txt file. ##
+    $ cd proceso_prueba/
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most recent to the oldest. It's necessary to use the parse_date_files.py file
+    to split the news contained in noticias.json into files by date. ##
+    $ scrapy crawl noticias --nolog -s filename=noticias.json
+    ------------------------------------------------------------------------------------------------------------
+    ## Get all the news from the most current to a specific year-month date. ##
+    $ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
+"""
+import scrapy, re, cfscrape
+from proceso_prueba.items import NoticiasItem
+from datetime import datetime, date, timedelta, tzinfo
+from scrapy.http.cookies import CookieJar
+from scrapy_splash import SplashRequest, SplashFormRequest
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    ## Delete every "<...>" tag matched by TAG_RE and return the remaining text ##
+    return re.sub(TAG_RE, '', text)
+class UTC(tzinfo):
+    """
+    Fixed-offset time zone used for the news dates: UTC-6, with no daylight saving correction.
+    """
+    def utcoffset(self, dt):
+        ## Time zone for CDMX: UTC-6 ##
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-6'
+    def dst(self, dt):
+        ## Fixed offset, so the DST adjustment is always zero. Without this override the
+        ## base tzinfo.dst() raises NotImplementedError (e.g. from datetime.timetuple()). ##
+        return timedelta(0)
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
+script = """
+function main(splash)
+  splash:init_cookies(splash.args.cookies)
+  assert(splash:go{
+    splash.args.url,
+    headers=splash.args.headers,
+    http_method=splash.args.http_method,
+    body=splash.args.body,
+    })
+  assert(splash:wait(0.5))
+  local entries = splash:history()
+  local last_response = entries[#entries].response
+  return {
+    url = splash:url(),
+    headers = last_response.headers,
+    http_status = last_response.status,
+    cookies = splash:get_cookies(),
+    html = splash:html(),
+  }
+end
+"""
+class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
+    name = "noticias"
+    def start_requests(self):
+        """
+        Initialise the spider state and request the login page through Splash.
+        The optional spider arguments 'year' and 'month' (-a year=... -a month=...) set the
+        stop date to day 15 of that month; unless both are given, no stop date is used.
+        """
+        self.tz = UTC()
+        stop_year = getattr(self, 'year', None)
+        stop_month = getattr(self, 'month', None)
+        ## Spanish month name -> two-digit month number ##
+        month_names = [
+            'enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio',
+            'julio', 'agosto', 'septiembre', 'octubre', 'noviembre', 'diciembre'
+        ]
+        self.month_parser = dict((name, '%02d' % number) for number, name in enumerate(month_names, 1))
+        self.baseURL = "https://hemeroteca.proceso.com.mx/"
+        login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"
+        if stop_year is None or stop_month is None:
+            self.stop_date = None
+        else:
+            self.stop_date = date(int(stop_year), int(stop_month), 15)
+        ## cfscrape returns the cookies and the user agent to send along with the requests ##
+        tokens, agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
+        self.token = tokens
+        self.agent = agent
+        print "token"
+        print self.token
+        yield SplashRequest(
+            url=login_url,
+            callback=self.parse_login,
+            endpoint='render.html',
+            args={ 'wait': 0.5 },
+            cookies=self.token,
+            headers={'User-Agent' : self.agent}
+        )
+    def parse_login(self, response):
+        """
+        Fill in and submit the login form contained in the login page response.
+        The credentials can be given as spider arguments
+        (-a login_user=... -a login_password=...); the literals below are only the fallback.
+        """
+        ## NOTE(review): these fallback credentials are committed in the source; they should be
+        ## removed from the repository once every caller passes them as arguments. ##
+        login_user = getattr(self, 'login_user', 'carlos_silvaforne@yahoo.com.mx')
+        login_password = getattr(self, 'login_password', 'carlos_silvaforne@')
+        return SplashFormRequest.from_response(
+            response,
+            method="POST",
+            formdata = {
+                'log' : login_user,
+                'pwd' : login_password
+            },
+            callback = self.after_login,
+            dont_click = True
+        )
+    def after_login(self, response):
+        """
+        Callback for the submitted login form: print debug information and request a
+        fixed archive page through the Splash 'execute' endpoint.
+        """
+        print "\nresponse.request.headers"
+        print response.request.meta
+        print "\nresponse.headers"
+        print response.headers
+        ## extract() returns a list that may be empty, so test it before taking the last link;
+        ## indexing [-1] directly raised IndexError and never reached the "No log in." branch. ##
+        topnav_links = response.css('div.topnav > a').extract()
+        if topnav_links :
+            print "\nsession_legend"
+            print remove_tags(topnav_links[-1])
+        else :
+            print "No log in."
+        item_link = "https://hemeroteca.proceso.com.mx/?page_id=420325"
+        yield SplashRequest(url=item_link, callback=self.parse_links,
+            meta=response.request.meta,
+            endpoint='execute',
+            cache_args=['lua_source'],
+            args={'lua_source': script},
+            headers={'User-Agent': self.agent}
+        )
+    def parse_links(self, response):
+        ## Debug callback: prints the response body and the request headers, yields nothing ##
+        topnav_links = response.css('div.topnav > a').extract()
+        print "\nresponse.body"
+        print response.body
+        print "\nresponse.request.headers"
+        print response.request.headers
+        print "\n"
+    def after_login_org(self, response):
+        """
+        Check that the login succeeded and, if so, request the base URL.
+        The login counts as successful when the last 'div.topnav > a' link contains "Cerrar".
+        """
+        print response.headers
+        print "\n"
+        print response.real_url
+        print "\n"
+        print response.request.headers
+        print "\n"
+        ## Rebuild the cookies sent with the request from its first 'Cookie' header.
+        ## Splitting on the first '=' only keeps values that contain '=' themselves intact. ##
+        cookie_list = response.request.headers.getlist('Cookie')
+        cookies = dict(pair.strip().split('=', 1) for pair in cookie_list[0].split(';'))
+        topnav_links = response.css('div.topnav > a').extract()
+        print response.css('h1.entry-title').extract_first()
+        print "\n"
+        ## No navigation link at all is treated as a failed login instead of raising IndexError ##
+        if not topnav_links:
+            print "Login failed."
+            return
+        session_legend = remove_tags(topnav_links[-1])
+        if "Cerrar" not in session_legend:
+            print "Login failed."
+            return
+        print session_legend
+        print "\n"
+        token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
+        print token
+        print "\n"
+        if self.stop_date is None:
+            ## NOTE(review): no 'parse' method is defined in this class in the visible code ##
+            yield scrapy.Request(url=self.baseURL, callback=self.parse)
+        else:
+            yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
+                cookies=cookies,
+                endpoint='execute',
+                cache_args=['lua_source'],
+                args={'lua_source': script},
+                headers={'User-Agent': USER_AGENT}
+            )
+    def parse_with_stop_date(self, response):
+        """
+        Walk the news listing page by page until an entry older than self.stop_date is found.
+        Entries dated on or after the stop date currently only get their link printed;
+        the next listing page is requested while no older entry has been seen.
+        """
+        print "parse_with_stop_date"
+        print "\n"
+        print response.css('h1.entry-title').extract_first()
+        print "\n"
+        print response.cookiejar
+        print "\n"
+        print response.headers
+        print "\n"
+        ## extract() may return an empty list: check it before taking the last link
+        ## (indexing [-1] directly raised IndexError and made the else branch unreachable). ##
+        topnav_links = response.css('div.topnav > a').extract()
+        if topnav_links :
+            print remove_tags(topnav_links[-1])
+            print "\n"
+        else :
+            print "No log in."
+        TO_NEXT_PAGE = True
+        for item in response.css('div.catpor-box > div'):
+            item_date = item.css('span.catpor-published').extract_first()
+            if item_date is None:
+                continue
+            ## The date text is split into day, Spanish month name and year ##
+            date_parts    = remove_tags(item_date).replace(",", '').split(' ')
+            date_parts[1] = self.month_parser[date_parts[1]]
+            ## A real list (not a lazy map object) so that it can be indexed ##
+            date_parts    = [int(part) for part in date_parts]
+            item_date     = date(date_parts[2], date_parts[1], date_parts[0])
+            if item_date < self.stop_date:
+                ## First entry older than the stop date: stop here and do not paginate ##
+                TO_NEXT_PAGE = False
+                break
+            item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
+            print item_link
+        if TO_NEXT_PAGE:
+            next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
+            if next_page is not None:
+                yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
+                    endpoint='execute',
+                    cache_args=['lua_source'],
+                    args={'lua_source': script},
+                    headers={'User-Agent': USER_AGENT}
+                )
+    def parse_links_org(self, response):
+        """
+        Request every article link found in the page ('div.post-container > h2 > a')
+        through the Splash 'execute' endpoint, with parse_item as the callback.
+        """
+        print "\n\n"
+        print response.headers
+        article_links = response.css('div.post-container > h2 > a::attr(href)').extract()
+        for article_url in article_links:
+            yield SplashRequest(
+                url=article_url,
+                callback=self.parse_item,
+                endpoint='execute',
+                cache_args=['lua_source'],
+                args={'lua_source': script},
+                headers={'User-Agent': USER_AGENT}
+            )
+    def parse_item_org(self, response):
+        """
+        Build a NoticiasItem (date, title, topic, text, url) from an article page and yield it.
+        """
+        item = NoticiasItem()
+        primary = response.xpath('//div[@id="primary"]')
+        news_date = primary.css('span.published').extract_first()
+        if news_date is not None:
+            news_date = remove_tags(news_date)
+            print news_date
+            ## Expected form "YYYY-MM-DD HH:MM:SS", turned into an ISO 8601 string with the spider time zone ##
+            day_part, time_part = news_date.split(' ')
+            ymd = [int(number) for number in day_part.split("-")]
+            hms = [int(number) for number in time_part.split(":")]
+            published = datetime(ymd[0], ymd[1], ymd[2], hms[0], hms[1], hms[2], tzinfo=self.tz)
+            news_date = published.isoformat('T')
+        title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
+        title = remove_tags(title) if title is not None else title
+        topic = response.css('span.entry-categories').extract_first()
+        topic = remove_tags(topic) if topic is not None else topic
+        ## One line per paragraph of the article body ##
+        paragraphs = response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract()
+        text = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['text']  = text.strip()
+        item['url']   = response.url
+        yield item
--- a/descarga_por_mes/proceso_prueba/scrapy.cfg
+++ b/descarga_por_mes/proceso_prueba/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = proceso_prueba.settings
+[deploy]
+#url = http://localhost:6800/
+project = proceso_prueba
--- a/descarga_por_rss/elFinanciero/elFinanciero/spiders/noticias.py
+++ b/descarga_por_rss/elFinanciero/elFinanciero/spiders/noticias.py
 # -*- coding: utf-8 -*-
+"""
+MEDIA:
+    El Financiero, CDMX
+USAGE:
+    ## Get the news from RSS. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd elFinanciero/
+    $ scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""
 import scrapy, re, json
 from elFinanciero.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo
-"""
-MEDIO:
-El Financiero, CDMX
-USO:
-scrapy crawl noticias --nolog -s filename=2017-12-20.json
-"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -16,65 +22,73 @@ def remove_tags(text):
 class UTC(tzinfo):
-	"""clase para el 'time zone' (zona horaria)"""
+    """
+    Class for Time Zone
+    """
    def utcoffset(self, dt):
-		# zona horaria para hidalgo (centro de mexico): utc-6
+        ## Time zone for CDMX: UTC-6 ##
        return timedelta(hours=-6)
    def tzname(self, dt):
-		# nombre de la zona horaria
+        ## Time zone name ##
        return 'UTC-6'
 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
    name = "noticias"
    def start_requests(self):
        self.tz = UTC()
-        # self.date_parser = {'enero': 1,      'febrero': 2,  'marzo': 3,      'abril': 4,
-        #                     'mayo': 5,       'junio': 6,    'julio': 7,      'agosto': 8,
-        #                     'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        self.baseURL = "http://www.elfinanciero.com.mx/rss"
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
        for link in response.xpath('//link/text()').extract()[1:]:
            yield scrapy.Request(url=link, callback=self.parse_item)
    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
-        res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
+        res = response.xpath('//script[@data-schema="NewsArticle"]').extract_first()
+        if res is not None : res = remove_tags(res)
        resDict = json.loads(res)
        dt = resDict['datePublished']
        d,t = dt.split()
        d = map(int, d.split("-"))
        t = map(int, t.split(":"))
-        item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
+        news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
-        item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
+        title = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
        topic = response.xpath('//div[@class="section-line"]').extract_first()
        if topic is not None:
-            item['topic'] = remove_tags(topic)
+            topic = remove_tags(topic)
-        else:
-            item['topic'] = None
        author = response.xpath('//div[@class="note-author"]/a').extract_first()
        if author is not None:
-            item['author'] = remove_tags(author)
+            author = remove_tags(author)
        for p in response.css('div.content').css('p').extract():
            text += remove_tags(p) + '\n'
-        item['text'] = text.strip()
+        ## News item info ##
+        item['date']  = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['author'] = author
+        item['text']  = text.strip()
        item['url']  = response.url
-        # print item['title']
        yield item