Commit 093e0e82 authored by Renán Sosa Guillen

merge with dev

parents fc91c136 15cbb498
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariodechiapasSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DiariodechiapasDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Keep only the fields actually present on the item, in this fixed order.
for field in ("date", "topic", "title", "author", "location", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
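# Usage sketch (a note added for clarity, not part of the original file): this
# pipeline expects the output file name to be passed as a Scrapy setting at
# crawl time, as the spiders' docstrings show, e.g.
#
#   $ scrapy crawl noticias --nolog -s filename=noticias.json
#
# open_spider() opens that file and writes "[", process_item() streams one JSON
# object per item separated by commas, and close_spider() writes the closing
# "]", so the result is a single JSON array.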
# -*- coding: utf-8 -*-
# Scrapy settings for diarioDeChiapas project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioDeChiapas'
SPIDER_MODULES = ['diarioDeChiapas.spiders']
NEWSPIDER_MODULE = 'diarioDeChiapas.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioDeChiapas (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioDeChiapas.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario de Chiapas, Chiapas
USAGE:
$ cd diarioDeChiapas
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. Use the parse_date_files.py file to split
the news collected in noticias.json into separate files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re, json
from datetime import datetime, date
from diarioDeChiapas.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
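# Example (illustrative, not part of the original file): remove_tags strips any
# HTML tags and keeps only the text content, e.g.
#   remove_tags('<p>Diario de <b>Chiapas</b></p>')  ->  'Diario de Chiapas'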
class ImportantData(scrapy.Item):
"""
Useful data for the flow of the implementation
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
news_section = scrapy.Field()
return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
baseURL = "http://www.diariodechiapas.com/landing/"
section_list = ["editorial", "portada", "metropoli", "region", "la-roja",
"deportes", "boga", "ae", "trascendio"]
# section_list = ["editorial"]
if self.stopDate is None:
for s in section_list:
yield scrapy.Request(url=baseURL + s, callback=self.parse)
else:
for s in section_list:
flow_info = ImportantData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse(self, response):
link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()
section = response.xpath('//section[@class="wrapper"]/h1').extract_first()
if section is not None : section = remove_tags(section)
for link in link_list:
flow_info = ImportantData()
flow_info['news_section'] = section
request = scrapy.Request(url=link, callback=self.parse_item)
request.meta['item'] = flow_info
yield request
next_page = response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_with_stop_date(self, response):
flow_info = response.meta['item']
if not flow_info['to_next_page']:
link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()
section = response.xpath('//section[@class="wrapper"]/h1').extract_first()
if section is not None : section = remove_tags(section)
for link in link_list:
flow_info = ImportantData()
flow_info['news_section'] = section
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
next_page = response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
if next_page is not None:
flow_info['to_next_page'] = False
request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
title = response.xpath('//section[@class="single__content"]/h1').extract_first()
if title is not None : title = remove_tags(title)
for p in response.xpath('//section[@class="single__content"]').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = flow_info['news_section']
item['title'] = title
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
news_date = news_date[:news_date.find('T')]
news_date = datetime.strptime(news_date, '%Y-%m-%d').date()
if news_date >= self.stopDate:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
title = response.xpath('//section[@class="single__content"]/h1').extract_first()
if title is not None : title = remove_tags(title)
for p in response.xpath('//section[@class="single__content"]').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = flow_info['news_section']
item['title'] = title
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioDeChiapas.settings
[deploy]
#url = http://localhost:6800/
project = diarioDeChiapas
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiarioindependienteSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DiarioindependienteDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Keep only the fields actually present on the item, in this fixed order.
for field in ("date", "topic", "title", "author", "location", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioIndependiente project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioIndependiente'
SPIDER_MODULES = ['diarioIndependiente.spiders']
NEWSPIDER_MODULE = 'diarioIndependiente.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioIndependiente (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioIndependiente.middlewares.DiarioindependienteSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioIndependiente.middlewares.DiarioindependienteDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioIndependiente.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario El Independiente, Baja California Sur
USAGE:
$ cd elIndependiente/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. Use the parse_date_files.py file to split
the news collected in noticias.json into separate files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
import scrapy, re, json
from datetime import datetime, date
from diarioIndependiente.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class ImportantFlowData(scrapy.Item):
"""
Useful data for the flow of the implementation
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stop_date = date(int(year), int(month), int(day))
base_url = "https://www.diarioelindependiente.mx/" + year + "/" + month + "/"
else:
self.stop_date = None
section_list = ["la-paz", "los-cabos", "policiaca", "deportes", "cultura", "nacional",
"internacional", "opinion", "espectaculos", "tecnologia"]
base_url = "https://www.diarioelindependiente.mx/"
if self.stop_date is None:
for s in section_list:
yield scrapy.Request(url=base_url + s, callback=self.parse)
else:
flow_info = ImportantFlowData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=base_url, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
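# Note (inferred from start_requests above): without date arguments the spider
# crawls each section listing; with -a year / -a month / -a day it starts from
# the monthly archive URL (https://www.diarioelindependiente.mx/<year>/<month>/)
# and pages through it via the rel="next" pagination link until articles older
# than stop_date are reached.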
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.paginacion').xpath('./ul/li/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-2]
pages = int(pagination[pagination.rfind('=') + 1:])
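# e.g. if the second-to-last pagination href ends in "?page=34" (the query-string
# format is assumed from the parsing above), pages == 34 and the loop below
# requests pages 2 through 34 of this section.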
for page in range(1, pages):
yield scrapy.Request(url=response.url + "?page=" + str(page + 1), callback=self.parse_page)
def parse_page(self, response):
link_list = response.xpath('//div[@id="colNoticias"]').css('article.card__article').xpath('./h2/a/@href').extract()
for link in link_list:
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_with_stop_date(self, response):
flow_info = response.meta['item']
if not flow_info['to_next_page']:
link_list = response.xpath('//div[@id="colNoticias"]').css('article.card__article').xpath('./h2/a/@href').extract()
for link in link_list:
flow_info = ImportantFlowData()
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
next_page = response.css('div.paginacion').xpath('./ul/li/a[@rel="next"]/@href').extract_first()
if next_page is not None:
flow_info['to_next_page'] = False
request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = response.xpath('//meta[@name="date"]/@content').extract_first()
topic = response.xpath('//span[@class="badge"]').extract_first()
for p in response.css('div.cuerpo_noticia').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = datetime.strptime(news_date, '%Y-%m-%d').isoformat("T")
item['title'] = remove_tags(response.css('h1.colorRojo').extract_first())
item['topic'] = remove_tags(topic) if topic is not None else None
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
news_date = response.xpath('//meta[@name="date"]/@content').extract_first()
news_date = datetime.strptime(news_date, '%Y-%m-%d').date()
if news_date >= self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
news_date = datetime.strptime(news_date.isoformat(), '%Y-%m-%d').isoformat("T")
title = response.css('h1.colorRojo').extract_first()
if title is not None : title = remove_tags(title)
topic = response.xpath('//span[@class="badge"]').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.cuerpo_noticia').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioIndependiente.settings
[deploy]
#url = http://localhost:6800/
project = diarioIndependiente
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class HeraldochihuahuaSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class HeraldochihuahuaDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Keep only the fields actually present on the item, in this fixed order.
for field in ("date", "topic", "title", "author", "location", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for heraldoChihuahua project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'heraldoChihuahua'
SPIDER_MODULES = ['heraldoChihuahua.spiders']
NEWSPIDER_MODULE = 'heraldoChihuahua.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'heraldoChihuahua (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'heraldoChihuahua.middlewares.HeraldochihuahuaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'heraldoChihuahua.middlewares.HeraldochihuahuaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'heraldoChihuahua.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = heraldoChihuahua.settings
[deploy]
#url = http://localhost:6800/
project = heraldoChihuahua
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LajornadamayaItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class LajornadamayaSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class LajornadamayaPipeline(object):
def process_item(self, item, spider):
return item
# -*- coding: utf-8 -*-
# Scrapy settings for laJornadaMaya project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornadaMaya'
SPIDER_MODULES = ['laJornadaMaya.spiders']
NEWSPIDER_MODULE = 'laJornadaMaya.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornadaMaya (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornadaMaya.middlewares.LajornadamayaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornadaMaya.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaMaya.pipelines.LajornadamayaPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
import scrapy, json, re
from datetime import datetime, date, timedelta, tzinfo
"""
This version downloads the news published on a given date.
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
It is not recommended for dates more than a month old.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# Time zone for Yucatan (central Mexico): UTC-6
return timedelta(hours=-6)
def tzname(self, dt):
# Time zone name
return 'UTC-6'
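# Example (illustrative, not part of the original file):
#   datetime(2017, 3, 22, tzinfo=UTC()).isoformat('T')
#   -> '2017-03-22T00:00:00-06:00', the format stored in item['date'] below.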
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
# self.found = False
# self.flag = False
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.req_date = date(int(self.year), int(self.month), int(self.day))
self.date_format = "%Y-%m-%d"
self.baseURL = 'https://www.lajornadamaya.mx'
section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional',
'internacional', 'opinion']
# section_list = ['deportes']
for section in section_list:
self.section = section
for count in range(0,2):
if ( count == 0 ):
yield scrapy.Request(url=self.baseURL+'/'+section, callback=self.parse_2)
elif (count == 1):
# self.section = section
self.page = 0
self.flag = False
self.found = False
page = -1
if not ( section == 'opinion' ):
while True:
if ( self.flag ):
self.flag = False
break
page+=1
yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(page), callback=self.parse)
if ( self.found ):
self.found = False
self.page -= 1
if ( self.page > 0 ):
self.page -= 1
for pag in range(self.page, self.page+6):
yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(pag), callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/notas?opinion', callback=self.parse_page)
def parse_2(self, response): # for the most recent news
path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
link_list = []
for path in path_list:
link_list += response.xpath(path).extract()
for link in link_list:
if ( link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) ):
item = NoticiasItem()
d = link[:link.rfind('/')]
if len(d) == 10:
d = list(map(int, d.split('-')))
d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
elif len(d) == 19:
d, t = d.split(' ')
d = list(map(int, d.split('-')))
t = list(map(int, t.split(':')))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
item['date'] = d
item['topic'] = response.url[response.url.rfind('/')+1:].title()
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
request.meta['item'] = item
yield request
def parse(self, response): # for the JSON listings
json_response = json.loads(response.text)
if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
json_list = json_response
else:
json_list = json_response['articles']
for line in json_list:
this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
this_date = this_date.date()
if ( this_date == self.req_date ):
self.page = int(response.url[response.url.rfind('=')+1:])
self.found = True
self.flag = True
break
elif ( this_date < self.req_date ):
self.flag = True
break
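# Shape of the JSON listings consumed by parse() and parse_page() (inferred from
# the fields read in this file; the exact schema is an assumption):
# - section listings return a list of article objects, while /notas?opinion
#   wraps the list in an "articles" key;
# - each object carries at least 'publishDate' ("YYYY-MM-DD HH:MM:SS"),
#   'name' (the title), 'url' and 'uriComponent'.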
def parse_item_2(self, response): # for the most recent news
item = response.meta['item']
# item = NoticiasItem()
text = ''
# item['date'] = response.url[:response.url.rfind('/')][response.url[:response.url.rfind('/')].rfind('/')+1:]
# item['topic'] = self.section.title()
item['title'] = response.xpath('//article/h1/text()').extract_first()
for paragraph in response.xpath('//*[@class="txt"]').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
print(item['title'])
yield item
def parse_page(self, response): # for the JSON listings
json_response = json.loads(response.text)
if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
topic = response.url[response.url.rfind('/')+1:response.url.rfind('=')-2].title()
json_list = json_response
else:
json_list = json_response['articles']
topic = 'Opinion'
for line in json_list:
this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
this_date = this_date.date()
if ( this_date == self.req_date ):
item = NoticiasItem()
# item['date'] = line['publishDate']
d = line['publishDate']
if len(d) == 10:
d = list(map(int, d.split('-')))
d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
elif len(d) == 19:
d, t = d.split(' ')
d = list(map(int, d.split('-')))
t = list(map(int, t.split(':')))
d = datetime(d[0],d[1],d[2],t[0],t[1],t[2],tzinfo=self.tz).isoformat('T')
item['date'] = d
item['topic'] = topic
item['title'] = line['name']
if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
request = scrapy.Request(url=self.baseURL+line['url'], callback=self.parse_item)
else:
request = scrapy.Request(url=self.baseURL+'/'+line['publishDate'][:line['publishDate'].rfind(' ')]+'/'+line['uriComponent'], callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response): # for the JSON listings
item = response.meta['item']
text = ''
for paragraph in response.xpath('//*[@class="txt"]').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
print(item['title'])
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornadaMaya.settings
[deploy]
#url = http://localhost:6800/
project = laJornadaMaya
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tintaFresca.settings
[deploy]
#url = http://localhost:6800/
project = tintaFresca
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TintafrescaSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class TintafrescaDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Keep only the fields actually present on the item, in this fixed order.
for field in ("date", "topic", "title", "author", "location", "text", "url"):
    if field in item:
        row.append((field, item[field]))
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for tintaFresca project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tintaFresca'
SPIDER_MODULES = ['tintaFresca.spiders']
NEWSPIDER_MODULE = 'tintaFresca.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tintaFresca (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tintaFresca.middlewares.TintafrescaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tintaFresca.middlewares.TintafrescaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tintaFresca.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Tinta Fresca, Chiapas
USAGE:
$ cd tintaFresca
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. Use the parse_date_files.py file to split
the news collected in noticias.json into separate files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re, json
from datetime import datetime, date, tzinfo, timedelta
from tintaFresca.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for Chiapas: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
class ImportantData(scrapy.Item):
"""
Control data passed between requests to drive the crawl flow
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
news_section = scrapy.Field()
return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
baseURL = "http://tintafresca.com.mx/"
# section_list = ["letras_en_su_tinta/page1/", "tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/", "rafaga/page1/"]
section_list = ["tgz/page1/", "patria_chica/page1/", "hecho_en_chiapas/page1/", "show/page1/"]
self.month_parser = dict(Enero='01', Febrero='02', Marzo='03', Abril='04', Mayo='05', Junio='06',
Julio='07', Agosto='08', Septiembre='09', Octubre='10', Noviembre='11', Diciembre='12')
if self.stopDate is None:
for s in section_list:
yield scrapy.Request(url=baseURL + s, callback=self.parse)
else:
for s in section_list:
flow_info = ImportantData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
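# Two crawl modes: without year/month/day the spider follows every page of each section
# (parse); with a stop date it uses parse_with_stop_date and only keeps paginating while
# the articles found are not older than stopDate.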
def parse(self, response):
link_list = response.css('ul.tintas').css('a.leer::attr(href)').extract()
# section = response.xpath('//div[@id="ruta"]').extract_first()
# if section is not None:
# section = remove_tags(section)
# section = section.replace("Inicio &gt; ", '')
for link in link_list:
flow_info = ImportantData()
# flow_info['news_section'] = section
request = scrapy.Request(url=link, callback=self.parse_item)
request.meta['item'] = flow_info
yield request
next_page = response.xpath('//ul[@class="pagination"]/li[3]/a/@href').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_with_stop_date(self, response):
flow_info = response.meta['item']
if not flow_info['to_next_page']:
link_list = response.css('ul.tintas').css('a.leer::attr(href)').extract()
# section = response.xpath('//div[@id="ruta"]').extract_first()
# if section is not None:
# section = remove_tags(section)
# section = section.replace("Inicio &gt; ", '')
for link in link_list:
flow_info = ImportantData()
# flow_info['news_section'] = section
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
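# Only the request spawned from the last link on the page can re-enable pagination: when
# that article is still within the date range, parse_item_with_stop_date returns to
# return_url with to_next_page set, and the next results page is requested from there.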
request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
next_page = response.xpath('//ul[@class="pagination"]/li[3]/a/@href').extract_first()
if next_page is not None:
flow_info['to_next_page'] = False
request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
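# The publication date sits inside div.balazo as 'day/MonthName/year'; the Spanish month
# name is replaced via month_parser before building the timezone-aware ISO-8601 date.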
date_str = response.xpath('//div[@class="balazo"]').extract_first()
if date_str.find('<br>') > -1 : date_str = date_str[date_str.find('<br>'):]
date_str = remove_tags(date_str)
date_lst = date_str.split('/')
date_lst[1] = self.month_parser[date_lst[1]]
date_lst = list(map(int, date_lst))  # list() so the values can be indexed under Python 3 as well
news_date = datetime(date_lst[2], date_lst[1], date_lst[0], tzinfo=self.tz).isoformat("T")
topic = response.css('div.seccion > h3.left > a').extract_first()
if topic is not None : topic = remove_tags(topic)
if topic is not None:
if topic == "Letras en su Tinta":
title = ''
lines = response.css('div.sumario > p').extract()
for idx, line in enumerate(lines):
if idx != len(lines) - 1 : title += remove_tags(line) + ". "
else : title += remove_tags(line) + "."
else:
title = response.css('div.titulo > h1').extract_first()
if title is not None : title = remove_tags(title)
else:
title = None
for p in response.css('div.contenido > p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['topic'] = topic
item['title'] = title
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
date_str = response.xpath('//div[@class="balazo"]').extract_first()
if date_str.find('<br>') > -1 : date_str = date_str[date_str.find('<br>'):]
date_str = remove_tags(date_str)
date_lst = date_str.split('/')
date_lst[1] = self.month_parser[date_lst[1]]
news_date = "-".join(date_lst)
news_date = datetime.strptime(news_date, '%d-%m-%Y').date()
if news_date >= self.stopDate:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
date_lst = list(map(int, date_lst))  # list() so the values can be indexed under Python 3 as well
news_date = datetime(date_lst[2], date_lst[1], date_lst[0], tzinfo=self.tz).isoformat("T")
topic = response.css('div.seccion > h3.left > a').extract_first()
if topic is not None : topic = remove_tags(topic)
if topic is not None:
if topic == "Letras en su Tinta":
title = ''
lines = response.css('div.sumario > p').extract()
for idx, line in enumerate(lines):
if idx != len(lines) - 1 : title += remove_tags(line) + ". "
else : title += remove_tags(line) + "."
else:
title = response.css('div.titulo > h1').extract_first()
if title is not None : title = remove_tags(title)
else:
title = None
for p in response.css('div.contenido > p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['topic'] = topic
item['title'] = title
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class CuartopoderSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class CuartopoderDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
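# e.g. $ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30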
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
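# Append each field only if the item actually carries it; OrderedDict keeps the output
# keys in this fixed order when the row is serialized below.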
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for cuartoPoder project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'cuartoPoder'
SPIDER_MODULES = ['cuartoPoder.spiders']
NEWSPIDER_MODULE = 'cuartoPoder.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cuartoPoder (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'cuartoPoder.middlewares.CuartopoderSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'cuartoPoder.middlewares.CuartopoderDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'cuartoPoder.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Cuarto Poder, Chiapas
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd cuartoPoder/
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re
from cuartoPoder.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for Chiapas: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
class ImportantData(scrapy.Item):
"""
Control data passed between requests to drive the crawl flow
"""
to_next_page = scrapy.Field()
next_page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
self.baseURL = "http://www.cuartopoder.mx"
first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"
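# cargaBloque.aspx serves the following result pages: parse() requests it with an
# increasing 'p' parameter and the same date range while article links keep appearing.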
flow_info = ImportantData()
flow_info['to_next_page'] = False
flow_info['next_page'] = 2
request = scrapy.Request(url=first_URL, callback=self.parse)
request.meta['item'] = flow_info
yield request
def parse(self, response):
flow_info = response.meta['item']
for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract():
flow_info['to_next_page'] = True  # article links found on this page, so request the next one
news_link = self.baseURL + link
yield scrapy.Request(url=news_link, callback=self.parse_item)
if flow_info['to_next_page']:
page = flow_info['next_page']
page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))
flow_info['to_next_page'] = False
flow_info['next_page'] += 1
request = scrapy.Request(url=page_URL, callback=self.parse)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
title = response.css('div.post-title').css('h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.post-content').css('p').extract():
p = remove_tags(p)
text += p + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = cuartoPoder.settings
[deploy]
#url = http://localhost:6800/
project = cuartoPoder
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariopuntualSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DiariopuntualDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioPuntual project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioPuntual'
SPIDER_MODULES = ['diarioPuntual.spiders']
NEWSPIDER_MODULE = 'diarioPuntual.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioPuntual (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioPuntual.middlewares.DiariopuntualSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioPuntual.middlewares.DiariopuntualDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioPuntual.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Puntual, EDOMEX
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd diarioPuntual/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from diarioPuntual.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for EDOMEX: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
baseURL = "http://diario-puntual.com.mx/{0}/{1}/{2}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
for link in response.css('div.post-column').css('h2.posttitle > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.archive-pagination').xpath('./a[@class="next page-numbers"]/@href').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
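# The item date is built from the year/month/day arguments, since the listing URL already
# restricts the crawl to that single day.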
news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
title = response.css('div.post-container').css('h1.post-title').extract_first()
if title is not None : title = remove_tags(title)
topic = None
for p in response.css('div.post-column > article').css('p').extract():
p = remove_tags(p)
text += p + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioPuntual.settings
[deploy]
#url = http://localhost:6800/
project = diarioPuntual
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElcomentarioSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ElcomentarioDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for elComentario project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elComentario'
SPIDER_MODULES = ['elComentario.spiders']
NEWSPIDER_MODULE = 'elComentario.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elComentario (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elComentario.middlewares.ElcomentarioSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elComentario.middlewares.ElcomentarioDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'elComentario.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
El Comentario, Colima
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elComentario/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elComentario.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
for link in response.css('div.articles').xpath('./article/div[@class="cnt"]/h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.post-pagination').xpath('./a[@title="Next page"]/@href').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
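# The article:published_time meta tag already provides an ISO-8601 date with time zone,
# so it is stored as-is.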
title = response.xpath('//header/h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('a.theme').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.pf-content').css('p').extract():
p = remove_tags(p)
text += p + "\n"
text = text.strip()
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elComentario.settings
[deploy]
#url = http://localhost:6800/
project = elComentario
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElsurSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class ElsurDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for elSur project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elSur'
SPIDER_MODULES = ['elSur.spiders']
NEWSPIDER_MODULE = 'elSur.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elSur (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elSur.middlewares.ElsurSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elSur.middlewares.ElsurDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'elSur.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
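# Patterns for the byline that opens the articles ('Texto: ... Foto: ...' or
# 'Texto y foto: ...', optionally followed by a line ending in a full Spanish date);
# parse_item strips whatever they match from the article text.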
HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
for link in response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
pag_lst = response.css('div.dslc-pagination > ul > li')
if len(pag_lst) > 0:
del pag_lst[0]
del pag_lst[0]
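# After dropping the first two pagination entries, the first remaining non-numeric entry
# is assumed to be the 'next page' arrow and its link is followed.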
next_page = None
for li_obj in pag_lst:
li = remove_tags(li_obj.extract())
if not li.isdigit():
next_page = li_obj.xpath('./a/@href').extract_first()
break
if next_page is not None : yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.css('div.dslc-tp-title > h1').extract_first()
if title is not None : title = remove_tags(title)
topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract():
p = remove_tags(p)
text += p + "\n"
dateline = response.css('span.dateline').extract_first()
if dateline is not None:
dateline = remove_tags(dateline)
text = text.replace(dateline, '')
text = text.replace(u'\u00a0', ' ')
text = HEAD_RE_1.sub('', text)
text = HEAD_RE_2.sub('', text)
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elSur.settings
[deploy]
#url = http://localhost:6800/
project = elSur
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario Co Latino, El Salvador
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd diarioCoLatino/
$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
"""
import scrapy, re
from diarioCoLatino.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
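# LOC_RE captures the dateline ('City/AGENCY', with wire agencies such as PL, AFP or DPA)
# and EM_RE captures contact e-mail lines; parse_item peels both off the article text.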
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
# La fecha obtenida ya incluye formato y zona horaria
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
item['topic'] = None
news_topic = None
for p in response.xpath('//div[@class="entry"]/p').extract():
text += remove_tags(p) + "\n"
@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text
""" Obtiene autor """
news_author = None
res = AUTH_RE.match(text)
if res:
m = res.group(0)
news_author = m[m.find('Por')+len('Por'):].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina twitter """
news_twitter = None
res = TW_RE.search(text)
if res:
m = res.group(0)
news_twitter = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Obtiene lugar """
news_loc = None
res = LOC_RE.match(text)
if res:
m = res.group(0)
if m[m.find('/') + 1:].strip().lower() != 'dpa':
news_loc = m[:m.find('/')].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
else:
@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text
""" Elimina correo """
news_email = None
res = EM_RE.search(text)
if res:
m = res.group(0)
news_email = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider):
res = EM_RE.search(text)
if res:
m = res.group(0)
news_email = m.strip()
# text = text[text.find(m) + len(m):].strip()
text = text.replace(m, '').strip()
text = "\n" + text
@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text
text = text.replace("\nCo Latino\n", '').strip()
## News item info ##
item['date'] = news_date
item['title'] = news_title
item['topic'] = news_topic
item['author'] = news_author
item['twitter'] = news_twitter
item['location'] = news_loc
item['email'] = news_email
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = surDeCampeche.settings
[deploy]
#url = http://localhost:6800/
project = surDeCampeche
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class SurdecampecheSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class SurdecampecheDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
        # Retrieve whatever value was passed with the -s filename=<name> command-line setting.
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
        # Keep only the fields that were actually populated, in a fixed column order.
        row = []
        for field in ("date", "topic", "title", "author", "location", "text", "url"):
            try:
                row.append((field, item[field]))
            except KeyError:
                # Field missing for this item; skip it.
                pass
        line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
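# Usage note (illustrative, based on the spider docstring below): the output file name
# is supplied at crawl time as a Scrapy setting, e.g.
#   scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
# The pipeline then streams every scraped item into that file as one growing JSON array.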
# -*- coding: utf-8 -*-
# Scrapy settings for surDeCampeche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'surDeCampeche'
SPIDER_MODULES = ['surDeCampeche.spiders']
NEWSPIDER_MODULE = 'surDeCampeche.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'surDeCampeche (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'surDeCampeche.middlewares.SurdecampecheSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'surDeCampeche.middlewares.SurdecampecheDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'surDeCampeche.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from surDeCampeche.items import NoticiasItem
"""
MEDIO:
El Sur de Campeche, Campeche
USO:
scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
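# Illustrative example (not part of the original code): remove_tags("<p>Texto de la nota</p>")
# returns "Texto de la nota"; the regex only strips markup between angle brackets and does
# not decode HTML entities.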
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "http://www.elsur.mx/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//a[@class="page-numbers"]/@href').extract()
if pagination is not None and len(pagination) > 0:
pages = pagination[-1].rstrip("/")
pages = int(pages[pages.rfind("/") + 1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page + 1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.news_box_inner_content').css('div.news_box_item_content').xpath('./h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
        # The extracted date already comes formatted and includes the timezone.
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.xpath('//div[@class="post_title_wrapper"]/h1').extract_first())
try:
topic = remove_tags(response.css('span.blog_meta_category').css('a').extract_first())
except:
topic = None
item['topic'] = topic
for p in response.css('div.entry-content').css('p').extract():
text += remove_tags(p) + "\n"
item['text'] = text
item['url'] = response.url
yield item
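# A scraped item ends up roughly like this (illustrative values only):
#   {"date": "2018-08-10T12:00:00-05:00", "topic": "...", "title": "...",
#    "text": "...", "url": "http://www.elsur.mx/..."}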
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tribunaCampeche.settings
[deploy]
#url = http://localhost:6800/
project = tribunaCampeche
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunacampecheSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class TribunacampecheDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
        # Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
        # Retrieve whatever value was passed with the -s filename=<name> command-line setting.
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
        # Keep only the fields that were actually populated, in a fixed column order.
        row = []
        for field in ("date", "topic", "title", "author", "location", "text", "url"):
            try:
                row.append((field, item[field]))
            except KeyError:
                # Field missing for this item; skip it.
                pass
        line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaCampeche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaCampeche'
SPIDER_MODULES = ['tribunaCampeche.spiders']
NEWSPIDER_MODULE = 'tribunaCampeche.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaCampeche (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaCampeche.middlewares.TribunacampecheSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaCampeche.middlewares.TribunacampecheDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tribunaCampeche.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from tribunaCampeche.items import NoticiasItem
"""
MEDIO:
Tribuna, Campeche
USO:
scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "http://tribunacampeche.com/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//a[@class="page-numbers"]/@href').extract()
if pagination is not None and len(pagination) > 0:
pages = pagination[-1].rstrip("/")
pages = int(pages[pages.rfind("/") + 1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page + 1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.vw-post-box').css('div.vw-post-box-inner').xpath('./h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
        # The extracted date already comes formatted and includes the timezone.
item['date'] = response.xpath('//time[@itemprop="datePublished"]/@datetime').extract_first()
item['title'] = remove_tags(response.xpath('//h1[@class="entry-title"]').extract_first())
try:
topic = remove_tags(response.css('article.vw-main-post').xpath('./div[@class="vw-post-categories"]/div/a').extract_first())
except:
topic = None
item['topic'] = topic
for p in response.css('div.vw-post-content').css('p').extract():
text += remove_tags(p) + "\n"
item['text'] = text
item['url'] = response.url
yield item
@@ -3,12 +3,18 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://doc.scrapy.org/en/latest/topics/items.html
 
 import scrapy
 
 
-class ProcesoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()