update

91655377 · Mario Chirinos Colunga · 9d89a5cc · 91655377 · 91655377 · 91655377
Commit 91655377 authored May 13, 2022 by Mario Chirinos Colunga
153 changed files
--- a/README.md
+++ b/README.md
 # M³ - Descarga de Noticias e Texto
 Rastreadore (Crawlers) para medios escritos de información en linea basados en [Scrapy](http://scrapy.org/).
-Los ratreadores estan divididos en tres clases.
+Los rastreadores están divididos en tres clases:
 * spiders/daily: Sitios que su verion impresa es publicada diariamente.
 * spiders/monthly: SItios de publcacion mensual.

--- a/scripts/crawlAll.py
+++ b/scripts/crawlAll.py
+#!/usr/bin/python3
+import sys
+import datetime
+import glob
+import json
+import os
+#===============================================================================
+#===============================================================================
+def main(argv):
+	'''
+	Run siteCrawler.py once for every site directory found under argv[1].
+	Each sub-directory must hold a settings.json whose "crawler" entry names
+	a spider directory below <current dir>/spiders/. Progress is appended to
+	<argv[1]>/<today>.log.
+	argv: [program, directory] or [program, directory, endDate]; endDate is
+	accepted by the usage check but is not read by this function.
+	'''
+	import subprocess	# local import: only this function needs it
+	cwd = os.getcwd()
+	if len(argv) not in (2, 3):
+		print ("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
+		return
+	rootdir = argv[1]
+	with open(rootdir+"/"+datetime.datetime.today().strftime('%Y-%m-%d')+".log", "a") as logfile:
+		for path in glob.glob(f'{rootdir}/*/'):
+			# The child runs from the spider directory, so a relative
+			# <directory> would not resolve there: pass an absolute path and
+			# keep the trailing separator that glob produced.
+			site_dir = os.path.join(os.path.abspath(path), "")
+			try:
+				with open(site_dir+'settings.json') as json_file:
+					crawler = json.load(json_file)["crawler"]
+			except (OSError, ValueError, KeyError, TypeError) as err:
+				# One broken site directory must not abort the remaining crawls.
+				logfile.write("Skipping "+site_dir+": "+repr(err)+"\n")
+				continue
+			logfile.write("Crawler "+crawler+" started at: " +datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S")+"\n")
+			# Make the entry visible while the (possibly long) crawl runs.
+			logfile.flush()
+			new_cwd = cwd+"/spiders/"+crawler
+			try:
+				if os.path.exists(new_cwd):
+					os.chdir(new_cwd)
+					try:
+						# Argument list instead of a shell string, so paths with
+						# spaces or shell metacharacters reach the child intact.
+						subprocess.run(["python3", "../../../scripts/siteCrawler.py", site_dir])
+					except OSError as err:
+						logfile.write("\t could not start crawler: "+repr(err)+"\n")
+				else:
+					logfile.write("\t spider not found.\n")
+				print(os.getcwd())
+			finally:
+				# Always return to the starting directory, even after an error.
+				os.chdir(cwd)
+#-------------------------------------------------------------------------------
+if __name__ == "__main__":
+	main(sys.argv)
--- a/scripts/spidersTools.py
+++ b/scripts/spidersTools.py
 #!/usr/bin/python3
-#Author: Mario Chirinos Colunga
+# File: siteCrawler.py
+# Author: Mario Chirinos Colunga
+# Daily News Site Crawler
 #===============================================================================
 import sys
 import os

--- a/spiders/daily/BROKEN/alChile/alChile/__init__.py
+++ b/spiders/daily/BROKEN/alChile/alChile/__init__.py
--- a/spiders/daily/BROKEN/alChile/alChile/items.py
+++ b/spiders/daily/BROKEN/alChile/alChile/items.py
--- a/spiders/daily/BROKEN/alChile/alChile/middlewares.py
+++ b/spiders/daily/BROKEN/alChile/alChile/middlewares.py
--- a/spiders/daily/BROKEN/alChile/alChile/pipelines.py
+++ b/spiders/daily/BROKEN/alChile/alChile/pipelines.py
--- a/spiders/daily/BROKEN/alChile/alChile/settings.py
+++ b/spiders/daily/BROKEN/alChile/alChile/settings.py
--- a/spiders/daily/BROKEN/alChile/alChile/spiders/__init__.py
+++ b/spiders/daily/BROKEN/alChile/alChile/spiders/__init__.py
--- a/spiders/daily/BROKEN/alChile/alChile/spiders/noticias.py
+++ b/spiders/daily/BROKEN/alChile/alChile/spiders/noticias.py
--- a/spiders/daily/BROKEN/alChile/scrapy.cfg
+++ b/spiders/daily/BROKEN/alChile/scrapy.cfg
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/__init__.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/__init__.py
--- a/spiders/daily/diarioYaqui/diarioYaqui/__init__.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/__init__.py
--- a/spiders/daily/elFinanciero/elFinanciero/items.py
+++ b/spiders/daily/elFinanciero/elFinanciero/items.py
--- a/spiders/daily/elFinanciero/elFinanciero/middlewares.py
+++ b/spiders/daily/elFinanciero/elFinanciero/middlewares.py
--- a/spiders/daily/elFinanciero/elFinanciero/pipelines.py
+++ b/spiders/daily/elFinanciero/elFinanciero/pipelines.py
--- a/spiders/daily/elFinanciero/elFinanciero/settings.py
+++ b/spiders/daily/elFinanciero/elFinanciero/settings.py
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/spiders/__init__.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/spiders/__init__.py
--- a/spiders/daily/elFinanciero/elFinanciero/spiders/noticias.py
+++ b/spiders/daily/elFinanciero/elFinanciero/spiders/noticias.py
@@ -16,8 +16,9 @@ def remove_tags(text):
 #-------------------------------------------------------------------------------
 class NoticiasSpider(scrapy.Spider):
 	name = 'noticias'
-	allowed_domains = ['elfinanciero.com']
+	allowed_domains = ['elfinanciero.com.mx']
-	start_urls = ['http://elfinanciero.com/']
+	start_urls = ['https://elfinanciero.com.mx/']
 	urllist=[]
 	def start_requests(self):
 		print("start_urls")

--- a/spiders/daily/elFinanciero/scrapy.cfg
+++ b/spiders/daily/elFinanciero/scrapy.cfg
--- a/spiders/daily/elFinanciero/elFinanciero/__init__.py
+++ b/spiders/daily/elFinanciero/elFinanciero/__init__.py
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/items.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/items.py
--- a/spiders/daily/DEPRECIATED/desdeElBalcon_/desdeElBalcon/middlewares.py
+++ b/spiders/daily/DEPRECIATED/desdeElBalcon_/desdeElBalcon/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class DesdeelbalconSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook returns its input unchanged."""
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to the spider_opened signal."""
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    # The four hooks below are instance methods: without ``self`` the bound
+    # call would shift every argument by one and fail with a TypeError.
+    def process_spider_input(self, response, spider):
+        """Accept every response; always returns None."""
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+        # Should return None or raise an exception.
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Yield each element of ``result`` unchanged."""
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+    def process_spider_exception(self, response, exception, spider):
+        """Do not handle the exception; implicitly returns None."""
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+    def process_start_requests(self, start_requests, spider):
+        """Yield each start request unchanged."""
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+    def spider_opened(self, spider):
+        """Log the name of the spider that was opened."""
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/pipelines.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/pipelines.py
--- a/spiders/daily/DEPRECIATED/desdeElBalcon_/desdeElBalcon/settings.py
+++ b/spiders/daily/DEPRECIATED/desdeElBalcon_/desdeElBalcon/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for desdeElBalcon project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'desdeElBalcon'
+SPIDER_MODULES = ['desdeElBalcon.spiders']
+NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'desdeElBalcon.middlewares.DesdeelbalconSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'desdeElBalcon.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+   'desdeElBalcon.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/spiders/daily/diarioYaqui/diarioYaqui/spiders/__init__.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/spiders/__init__.py
--- a/spiders/daily/DEPRECIATED/desdeElBalcon_/desdeElBalcon/spiders/noticias.py
+++ b/spiders/daily/DEPRECIATED/desdeElBalcon_/desdeElBalcon/spiders/noticias.py
+# -*- coding: utf-8 -*-
+import scrapy, re
+from datetime import datetime, timedelta, tzinfo
+from desdeElBalcon.items import NoticiasItem
+"""
+MEDIO:
+Desde el Balcon, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` tag removed."""
+    stripped = re.sub(TAG_RE, '', text)
+    return stripped
+class UTC(tzinfo):
+    """Fixed-offset time zone six hours behind UTC, with no daylight saving."""
+    def utcoffset(self, dt):
+        # time zone for Yucatan (central Mexico): utc-6
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        # name of the time zone
+        return 'UTC-6'
+    def dst(self, dt):
+        # The base tzinfo.dst() raises NotImplementedError, which breaks
+        # datetime.timetuple() and datetime.dst(); a fixed offset has no DST.
+        return timedelta(0)
+class QuotesSpider(scrapy.Spider):
+    """Crawl the desdeelbalcon.com archive page of a single date.
+
+    The date comes from the ``year``, ``month`` and ``day`` spider arguments
+    (``-a year=2017 -a month=3 -a day=22``).
+    """
+    name = "noticias"
+    def start_requests(self):
+        """Build the archive URL for the requested date and request it."""
+        self.tz = UTC()
+        self.year = getattr(self, 'year', None)
+        self.month = getattr(self, 'month', None)
+        self.day = getattr(self, 'day', None)
+        if None in (self.year, self.month, self.day):
+            # Fail with a clear message instead of a str + None TypeError.
+            raise ValueError("spider arguments year, month and day are required")
+        self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        """Schedule the first archive page again for parse_page, plus every further page."""
+        print(response.url)
+        # Same URL as the one just fetched, hence dont_filter.
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
+        if len(pagination) > 0:
+            # The last pagination link ends in the highest page number.
+            pagination = pagination[-1].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+            # Page 1 is handled above; request pages 2..pages.
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+    def parse_page(self, response):
+        """Request every article listed on an archive page."""
+        for post in response.xpath('//ul[@class="archivepost"]/li'):
+            link = post.xpath('./h2/a/@href').extract_first()
+            if link is None:
+                # No article link in this entry: nothing to request.
+                continue
+            # One item per post. A single shared item would be mutated by
+            # later iterations before the earlier requests are processed.
+            item = NoticiasItem()
+            item['date'] = datetime(int(self.year),int(self.month),int(self.day),tzinfo=self.tz).isoformat('T')
+            item['topic'] = post.xpath('./p/a/text()').extract()
+            request = scrapy.Request(url=link, callback=self.parse_item)
+            request.meta['item'] = item
+            yield request
+    def parse_item(self, response):
+        """Complete the item carried in ``response.meta`` with title, text and url."""
+        item = response.meta['item']
+        item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
+        # Each paragraph is stripped of tags and followed by a newline.
+        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in response.xpath('//div[@itemprop="text"]/p').extract())
+        item['url'] = response.url
+        print (item['title'])
+        yield item
--- a/spiders/daily/DEPRECIATED/desdeElBalcon_/scrapy.cfg
+++ b/spiders/daily/DEPRECIATED/desdeElBalcon_/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+[settings]
+default = desdeElBalcon.settings
+[deploy]
+#url = http://localhost:6800/
+project = desdeElBalcon
--- a/spiders/daily/elValle/elValle/__init__.py
+++ b/spiders/daily/elValle/elValle/__init__.py
--- a/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/items.py
+++ b/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/items.py
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+import scrapy
+class DiariocolatinoItem(scrapy.Item):
+	"""Scrapy item holding the fields of one scraped article."""
+	# NoticiasSpider.parseItem fills date, title, topic, text and url;
+	# location and author are declared but not set by that spider.
+	date = scrapy.Field()
+	title = scrapy.Field()
+	text = scrapy.Field()
+	location = scrapy.Field()
+	author = scrapy.Field()
+	topic = scrapy.Field()
+	url = scrapy.Field()
--- a/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/middlewares.py
+++ b/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/middlewares.py
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+class DiariocolatinoSpiderMiddleware:
+    """Pass-through spider middleware: every hook hands its input on unchanged."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Instantiate the middleware and hook it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Let every response through to the spider (returns None)."""
+        return
+    def process_spider_output(self, response, result, spider):
+        """Relay whatever the spider produced for this response."""
+        yield from result
+    def process_spider_exception(self, response, exception, spider):
+        """Leave the exception to other handlers (returns None)."""
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Relay the spider's start requests untouched."""
+        yield from start_requests
+    def spider_opened(self, spider):
+        """Log which spider was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class DiariocolatinoDownloaderMiddleware:
+    """Pass-through downloader middleware: requests and responses are not altered."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Instantiate the middleware and hook it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        """Continue processing the request normally (returns None)."""
+        return
+    def process_response(self, request, response, spider):
+        """Hand back the downloaded response as received."""
+        return response
+    def process_exception(self, request, exception, spider):
+        """Leave the exception to other handlers (returns None)."""
+        return None
+    def spider_opened(self, spider):
+        """Log which spider was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/pipelines.py
+++ b/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/pipelines.py
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+class DiariocolatinoPipeline:
+    """Item pipeline that passes every item through unchanged."""
+    def process_item(self, item, spider):
+        """Return ``item`` as received; ``spider`` is not used."""
+        return item
--- a/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/settings.py
+++ b/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/settings.py
+# Scrapy settings for diarioCoLatino project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+BOT_NAME = 'diarioCoLatino'
+SPIDER_MODULES = ['diarioCoLatino.spiders']
+NEWSPIDER_MODULE = 'diarioCoLatino.spiders'
+FEED_EXPORT_ENCODING="utf-8"
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'diarioCoLatino.middlewares.DiariocolatinoSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'diarioCoLatino.middlewares.DiariocolatinoDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'diarioCoLatino.pipelines.DiariocolatinoPipeline': 300,
+#}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/spiders/daily/elFinanciero/elFinanciero/spiders/__init__.py
+++ b/spiders/daily/elFinanciero/elFinanciero/spiders/__init__.py
--- a/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/spiders/noticias.py
+++ b/spiders/daily/DEPRECIATED/diarioCoLatino_/diarioCoLatino/spiders/noticias.py
+import scrapy
+from diarioCoLatino.items import DiariocolatinoItem
+import re
+#-------------------------------------------------------------------------------
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+	"""Return ``text`` with every ``<...>`` tag removed."""
+	stripped = re.sub(TAG_RE, '', text)
+	return stripped
+#-----------------------------------
+#-------------------------------------------------------------------------------
+class NoticiasSpider(scrapy.Spider):
+	"""Crawl the diariocolatino.com archive of a single date.
+
+	The date comes from the ``year``, ``month`` and ``day`` spider arguments.
+	"""
+	name = 'noticias'
+	allowed_domains = ['diariocolatino.com']
+	start_urls = ['http://diariocolatino.com/']
+	def start_requests(self):
+		"""Build the zero-padded archive URL for the requested date and request it."""
+		year = getattr(self, "year", None)
+		month = getattr(self, "month", None)
+		day = getattr(self, "day", None)
+		if None in (year, month, day):
+			# Fail with a clear message instead of an error on None.
+			raise ValueError("spider arguments year, month and day are required")
+		self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
+		yield scrapy.Request(url=self.baseURL, callback=self.parseDate)
+	#-----------------------------------------------------------------------
+	def parseDate(self, response):
+		"""Follow the next-page link(s) and request every article listed on the page."""
+		print(response.url)
+		for page in response.css('span#tie-next-page').css("a::attr(href)").extract():
+			yield scrapy.Request(url=page, callback=self.parseDate)
+		for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
+			yield scrapy.Request(url=link, callback=self.parseItem)
+	#-----------------------------------------------------------------------
+	def parseItem(self, response):
+		"""Build one DiariocolatinoItem from an article page."""
+		print(response.url)
+		item = DiariocolatinoItem()
+		item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
+		# extract_first() returns None when the node is absent. Guard before
+		# calling str methods so a missing tag leaves the field as None
+		# instead of discarding the whole article with an AttributeError.
+		title = response.xpath('//meta[@property="og:title"]/@content').extract_first()
+		item["title"] = title.replace("- Diario Co Latino","") if title is not None else None
+		topic = response.css("span.post-cats").css("a::text").extract_first()
+		item["topic"] = topic.lower() if topic is not None else None
+		# Each paragraph is stripped of tags and followed by a newline.
+		item["text"] = "".join(remove_tags(p) + "\n" for p in response.xpath('//div[@class="entry"]/p').extract())
+		item["url"]=response.url
+		print(item)
+		yield item
--- a/spiders/daily/DEPRECIATED/diarioCoLatino_/scrapy.cfg
+++ b/spiders/daily/DEPRECIATED/diarioCoLatino_/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = diarioCoLatino.settings
+[deploy]
+#url = http://localhost:6800/
+project = diarioCoLatino
--- a/spiders/daily/expresoChiapas/expresoChiapas/__init__.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/__init__.py
--- a/spiders/daily/diarioYaqui/diarioYaqui/items.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/items.py
--- a/spiders/daily/diarioYaqui/diarioYaqui/middlewares.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/middlewares.py
--- a/spiders/daily/diarioYaqui/diarioYaqui/pipelines.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/pipelines.py
--- a/spiders/daily/diarioYaqui/diarioYaqui/settings.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/settings.py
--- a/spiders/daily/elValle/elValle/spiders/__init__.py
+++ b/spiders/daily/elValle/elValle/spiders/__init__.py
--- a/spiders/daily/diarioYaqui/diarioYaqui/spiders/noticias.py
+++ b/spiders/daily/diarioYaqui/diarioYaqui/spiders/noticias.py
--- a/spiders/daily/diarioYaqui/scrapy.cfg
+++ b/spiders/daily/diarioYaqui/scrapy.cfg
--- a/spiders/daily/DEPRECIATED/edoMexDia_/2021-03-22.json
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/2021-03-22.json
-[]
\ No newline at end of file
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/spiders/noticias.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/spiders/noticias.py
-# -*- coding: utf-8 -*-
-import scrapy, re
-from edoMexDia.items import NoticiasItem
-from datetime import datetime, timedelta, tzinfo
-"""
-MEDIO:
-EDOMEX al Día, Estado de México
-USO:
-scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
-"""
-TAG_RE = re.compile(r'<[^>]+>')
-def remove_tags(text):
-	return TAG_RE.sub('', text)
-class UTC(tzinfo):
-	"""clase para el 'time zone' (zona horaria)"""
-	#-----------------------------------------------------------------------
-	def utcoffset(self, dt):
-		# zona horaria para estado de méxico: utc-6
-		return timedelta(hours=-6)
-	#-----------------------------------------------------------------------
-	def tzname(self, dt):
-		# nombre de la zona horaria
-		return 'UTC-6'
-class QuotesSpider(scrapy.Spider):
-	name = "noticias"
-	def start_requests(self):
-		self.tz = UTC()
-		self.year = getattr(self, "year", None)
-		self.month = getattr(self, "month", None)
-		self.day = getattr(self, "day", None)
-		self.baseURL = "http://edomexaldia.com/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2)
-		yield scrapy.Request(url=self.baseURL, callback=self.parse_page)
-	#-----------------------------------------------------------------------
-	def parse(self, response):
-		yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-#		lastPage = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-last"]/@href').extract_first()
-#		if lastPage is None:
-#			lastPage = response.xpath('//div[@class="numbered-pagination"]/a/@href').extract()[-1]
-#		if lastPage is not None and lastPage != '':
-#			lastPage = lastPage.strip('/')
-#			lastPage = int(lastPage[lastPage.rfind('/')+1:])
-#		for page in range(1, lastPage):
-#			yield scrapy.Request(url=self.baseURL + "/page/" + str(page+1), callback=self.parse_page)
-	#-----------------------------------------------------------------------
-	def parse_page(self, response):
-		print("parse page", response.url)
-		for link in response.xpath('//main[@id="main"]/article/header/h2[@class="entry-title"]/a/@href').extract():
-			yield scrapy.Request(url=link, callback=self.parse_item)
-#		nextPage = response.xpath('//div[@class="numbered-pagination"]/a[@class="pagi-next"]/@href').extract_first()
-#		if nextPage is not None and nextPage != '':
-#			yield scrapy.Request(url=nextPage, callback=self.parse)
-	#-----------------------------------------------------------------------
-	def parse_item(self, response):
-		print("print item", response.url)
-		item = NoticiasItem()
-####		text = ''
-#####		print(response.xpath("//meta[@property='article:published_time']/@content").extract_first())
-#####		try:
-#####			d = remove_tags(response.xpath('//span[@class="post_author_create"]').extract_first())
-#####			d = d.replace("el ", '').replace(",", '').replace(".", '').split()
-#####			dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
-#####		except:
-#####			dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
-		item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
-		item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first().replace(" - Edomex Al Día","")
-		item['text'] = "\n".join([remove_tags(p) for p in response.xpath('//div[@class="entry-content"]/p/text()').extract() ])
-		item['topic'] = None
-#####		author = response.xpath('//span[@class="post_author_author"]').extract_first()
-#####		if author is not None and author != '':
-#####			author = remove_tags(author).strip()
-#####			author = author.replace(" Publicado:", '')
-#####			item['author'] = author
-#####		for p in response.xpath('//div[@id="main"]/div/p').extract():
-#####			text += remove_tags(p) + "\n"
-#####		item['text'] = text.strip()
-		item['url'] = response.url
-		print(item)
-#		yield item
--- a/spiders/daily/DEPRECIATED/elComentario_/elComentario/__init__.py
+++ b/spiders/daily/DEPRECIATED/elComentario_/elComentario/__init__.py
--- a/spiders/daily/DEPRECIATED/elComentario_/elComentario/items.py
+++ b/spiders/daily/DEPRECIATED/elComentario_/elComentario/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Scrapy item declaring the fields title, text, date, location, author, topic and url."""
+    # Each attribute below is a scrapy.Field() declaration; the spider that
+    # fills them is not part of this file.
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
--- a/spiders/daily/DEPRECIATED/elComentario_/elComentario/middlewares.py
+++ b/spiders/daily/DEPRECIATED/elComentario_/elComentario/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class ElcomentarioSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook hands its input on unchanged."""
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Instantiate the middleware and hook it to the spider_opened signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_spider_input(self, response, spider):
+        """Let every response through to the spider (returns None)."""
+        return
+    def process_spider_output(self, response, result, spider):
+        """Relay whatever the spider produced for this response."""
+        yield from result
+    def process_spider_exception(self, response, exception, spider):
+        """Leave the exception to other handlers (returns None)."""
+        return None
+    def process_start_requests(self, start_requests, spider):
+        """Relay the spider's start requests untouched."""
+        yield from start_requests
+    def spider_opened(self, spider):
+        """Log which spider was opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+class ElcomentarioDownloaderMiddleware:
+    """No-op downloader middleware: requests and responses pass through unchanged."""
+    # Hooks that are not defined are simply skipped by Scrapy, so each one
+    # below is optional.
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Instantiate the middleware and hook ``spider_opened`` to its signal."""
+        instance = cls()
+        crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
+        return instance
+    def process_request(self, request, spider):
+        """Return None for every request, so normal processing continues."""
+        return None
+    def process_response(self, request, response, spider):
+        """Hand back the very response object that was received."""
+        return response
+    def process_exception(self, request, exception, spider):
+        """Do nothing with a download exception (returns None)."""
+        return None
+    def spider_opened(self, spider):
+        """Write the opened spider's name to that spider's logger."""
+        opened = 'Spider opened: %s' % spider.name
+        spider.logger.info(opened)
--- a/spiders/daily/DEPRECIATED/elComentario_/elComentario/pipelines.py
+++ b/spiders/daily/DEPRECIATED/elComentario_/elComentario/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """Item pipeline that stores every scraped item in a single JSON array file.
+    The output path is read from the ``filename`` Scrapy setting, e.g.
+    ``-s filename=2018-09-05.json`` on the command line.
+    """
+    # Order in which fields are written; a field missing from an item is skipped.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the crawler's ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and write the opening bracket of the array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket and close the file."""
+        try:
+            self.file.write("]")
+        finally:
+            # Release the handle even when the final write fails.
+            self.file.close()
+    def process_item(self, item, spider):
+        """Append ``item`` to the output file and return it unchanged.
+        Only the fields named in ``FIELD_ORDER`` are written, in that order.
+        """
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # The spider never set this field: leave it out of the record.
+                # Any other error is a real bug and must not be hidden.
+                continue
+        line = OrderedDict(row)
+        self.counter += 1
+        # Records after the first need a separator to keep the array valid JSON.
+        separator = "" if self.counter == 1 else ",\n"
+        self.file.write(separator + json.dumps(line))
+        return item
--- a/spiders/daily/DEPRECIATED/elComentario_/elComentario/settings.py
+++ b/spiders/daily/DEPRECIATED/elComentario_/elComentario/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for elComentario project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# Project identity: the bot name and the package that holds this project's spiders.
+BOT_NAME = 'elComentario'
+SPIDER_MODULES = ['elComentario.spiders']
+NEWSPIDER_MODULE = 'elComentario.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'elComentario (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# NOTE(review): left commented out, so Scrapy's built-in default applies - confirm this is intended.
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# Throttle consecutive requests to the same site with a 0.5 delay.
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'elComentario.middlewares.ElcomentarioSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'elComentario.middlewares.ElcomentarioDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# Send every scraped item through the project's JSON writer pipeline;
+# 300 is the order value assigned to it.
+ITEM_PIPELINES = {
+   'elComentario.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/spiders/daily/expresoChiapas/expresoChiapas/spiders/__init__.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/spiders/__init__.py
--- a/spiders/daily/DEPRECIATED/elComentario_/elComentario/spiders/noticias.py
+++ b/spiders/daily/DEPRECIATED/elComentario_/elComentario/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    El Comentario, Colima
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd elComentario/
+    $ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
+"""
+import scrapy, re
+from elComentario.items import NoticiasItem
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` markup tag deleted."""
+    # Splitting on the tag pattern and gluing the pieces back drops each tag.
+    return ''.join(TAG_RE.split(text))
+class QuotesSpider(scrapy.Spider):
+	"""
+	Spider for El Comentario (Colima): collects the articles listed in the
+	site's archive for one date, given by the ``year``, ``month`` and ``day``
+	spider arguments (``-a year=2018 -a month=9 -a day=5``).
+	"""
+	name = "noticias"
+	def start_requests(self):
+		"""Request the archive page of the date passed with ``-a``."""
+		year  = getattr(self, "year", None)
+		month = getattr(self, "month", None)
+		day   = getattr(self, "day", None)
+		baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
+		yield scrapy.Request(url=baseURL, callback=self.parse)
+	def parse(self, response):
+		"""Schedule every listing page of the archive.
+		The page count is read from the "last page" pagination link. When that
+		link is absent, the response itself is treated as the only listing page
+		instead of failing on a missing href.
+		"""
+		print("parse", response.url)
+		last_href = response.css("li.mkd-pagination-last-page").css("a::attr(href)").extract_first()
+		if last_href is None:
+			# No pagination block: the articles are on the page already downloaded.
+			yield from self.parse_page(response)
+			return
+		# The href is expected to end in "page/<n>/"; <n> is the number of pages.
+		pages = int(last_href[last_href.find("page/")+5:-1])
+		# NOTE(review): the start URL ends in "/", so this builds ".../DD//page/N" - confirm the site accepts it.
+		for p in range (pages):
+			next_page = response.url+"/page/"+str(p+1)
+			yield scrapy.Request(url=next_page, callback=self.parse_page)
+	def parse_page(self, response):
+		"""Follow every article link (anchors inside ``h5``) on a listing page."""
+		print("parse_page", response.url)
+		for link in response.css("h5").css("a::attr(href)").extract():
+			yield scrapy.Request(url=link, callback=self.parse_item)
+	def parse_item(self, response):
+		"""Build one ``NoticiasItem`` from an article page."""
+		print("parse_item", response.url)
+		item = NoticiasItem()
+		item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
+		item["topic"] = response.css("div.mkd-post-info-category").css("a::text").extract_first()
+		content = response.css("div.pf-content").css("p").extract()
+		# One line per paragraph, tags removed, outer whitespace trimmed.
+		item['text']  = "".join(remove_tags(p)+"\n" for p in content).strip()
+		item['url']   = response.url
+		print(item)
+		yield item
--- a/spiders/daily/DEPRECIATED/elComentario_/scrapy.cfg
+++ b/spiders/daily/DEPRECIATED/elComentario_/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+[settings]
+default = elComentario.settings
+[deploy]
+#url = http://localhost:6800/
+project = elComentario
--- a/spiders/daily/DEPRECIATED/elValle/elValle/__init__.py
+++ b/spiders/daily/DEPRECIATED/elValle/elValle/__init__.py
--- a/spiders/daily/elValle/elValle/items.py
+++ b/spiders/daily/elValle/elValle/items.py
--- a/spiders/daily/elValle/elValle/middlewares.py
+++ b/spiders/daily/elValle/elValle/middlewares.py
--- a/spiders/daily/elValle/elValle/pipelines.py
+++ b/spiders/daily/elValle/elValle/pipelines.py
--- a/spiders/daily/elValle/elValle/settings.py
+++ b/spiders/daily/elValle/elValle/settings.py
--- a/spiders/daily/DEPRECIATED/elValle/elValle/spiders/__init__.py
+++ b/spiders/daily/DEPRECIATED/elValle/elValle/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/spiders/daily/elValle/elValle/spiders/noticias.py
+++ b/spiders/daily/elValle/elValle/spiders/noticias.py
--- a/spiders/daily/elValle/scrapy.cfg
+++ b/spiders/daily/elValle/scrapy.cfg
--- a/spiders/daily/DEPRECIATED/expresoChiapas/expresoChiapas/__init__.py
+++ b/spiders/daily/DEPRECIATED/expresoChiapas/expresoChiapas/__init__.py
--- a/spiders/daily/expresoChiapas/expresoChiapas/items.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/items.py
--- a/spiders/daily/expresoChiapas/expresoChiapas/middlewares.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/middlewares.py
--- a/spiders/daily/expresoChiapas/expresoChiapas/pipelines.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/pipelines.py
--- a/spiders/daily/expresoChiapas/expresoChiapas/settings.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/settings.py
--- a/spiders/daily/DEPRECIATED/expresoChiapas/expresoChiapas/spiders/__init__.py
+++ b/spiders/daily/DEPRECIATED/expresoChiapas/expresoChiapas/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/spiders/daily/expresoChiapas/expresoChiapas/spiders/noticias.py
+++ b/spiders/daily/expresoChiapas/expresoChiapas/spiders/noticias.py
--- a/spiders/daily/expresoChiapas/scrapy.cfg
+++ b/spiders/daily/expresoChiapas/scrapy.cfg
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/__init__.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/__init__.py
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/items.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Scrapy item describing one news article scraped by this project."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()  # headline of the article
+    text = scrapy.Field()  # body text with the markup removed
+    date = scrapy.Field()  # publication date string
+    location = scrapy.Field()  # declared, but this project's spider does not set it
+    author = scrapy.Field()  # author name, or None when the page has none
+    topic = scrapy.Field()  # category label, or None when the page has none
+    url = scrapy.Field()  # address the article was downloaded from
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/middlewares.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/middlewares.py
+# -*- coding: utf-8 -*-
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+from scrapy import signals
+class LajornadaagsSpiderMiddleware(object):
+    """Pass-through spider middleware: every hook forwards its input unchanged."""
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+    # The four hooks below are instance methods, so they take ``self`` first;
+    # without it, calling them on the middleware instance raises TypeError.
+    def process_spider_input(self, response, spider):
+        """Let every response through to the spider.
+        Should return None or raise an exception.
+        """
+        return None
+    def process_spider_output(self, response, result, spider):
+        """Yield what the spider produced for ``response``, unmodified.
+        Must return an iterable of Request, dict or Item objects.
+        """
+        for i in result:
+            yield i
+    def process_spider_exception(self, response, exception, spider):
+        """Take no action on a spider exception (returns None)."""
+        pass
+    def process_start_requests(self, start_requests, spider):
+        """Yield the spider's start requests as they are.
+        Must return only requests (not items).
+        """
+        for r in start_requests:
+            yield r
+    def spider_opened(self, spider):
+        """Write the opened spider's name to that spider's logger."""
+        spider.logger.info('Spider opened: %s' % spider.name)
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/pipelines.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """Item pipeline that stores every scraped item in a single JSON array file.
+    The output path is read from the ``filename`` Scrapy setting, e.g.
+    ``-s filename=2017-03-22.json`` on the command line.
+    """
+    # Order in which fields are written; a field missing from an item is skipped.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the crawler's ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and write the opening bracket of the array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket and close the file."""
+        try:
+            self.file.write("]")
+        finally:
+            # Release the handle even when the final write fails.
+            self.file.close()
+    def process_item(self, item, spider):
+        """Append ``item`` to the output file and return it unchanged.
+        Only the fields named in ``FIELD_ORDER`` are written, in that order.
+        """
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # The spider never set this field: leave it out of the record.
+                # Any other error is a real bug and must not be hidden.
+                continue
+        line = OrderedDict(row)
+        self.counter += 1
+        # Records after the first need a separator to keep the array valid JSON.
+        separator = "" if self.counter == 1 else ",\n"
+        self.file.write(separator + json.dumps(line))
+        return item
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/settings.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/settings.py
+# -*- coding: utf-8 -*-
+# Scrapy settings for laJornadaAgs project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# Project identity: the bot name and the package that holds this project's spiders.
+BOT_NAME = 'laJornadaAgs'
+SPIDER_MODULES = ['laJornadaAgs.spiders']
+NEWSPIDER_MODULE = 'laJornadaAgs.spiders'
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'laJornadaAgs (+http://www.yourdomain.com)'
+# Obey robots.txt rules
+# NOTE(review): left commented out, so Scrapy's built-in default applies - confirm this is intended.
+# ROBOTSTXT_OBEY = True
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+# Throttle consecutive requests to the same site with a 0.5 delay.
+DOWNLOAD_DELAY = 0.5
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'laJornadaAgs.middlewares.LajornadaagsSpiderMiddleware': 543,
+#}
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'laJornadaAgs.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+# Send every scraped item through the project's JSON writer pipeline;
+# 300 is the order value assigned to it.
+ITEM_PIPELINES = {
+    'laJornadaAgs.pipelines.JsonWriterPipeline': 300,
+}
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/spiders/__init__.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/spiders/noticias.py
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/laJornadaAgs/spiders/noticias.py
+# -*- coding: utf-8 -*-
+import scrapy, re
+from laJornadaAgs.items import NoticiasItem
+"""
+MEDIA:
+La Jornada Aguascalientes, Ags.
+USAGE:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` markup tag deleted."""
+    # Splitting on the tag pattern and gluing the pieces back drops each tag.
+    return ''.join(TAG_RE.split(text))
+class QuotesSpider(scrapy.Spider):
+    """Spider for La Jornada Aguascalientes: collects the articles in the
+    site's archive for one date, given by the ``year``, ``month`` and ``day``
+    spider arguments.
+    """
+    name = "noticias"
+    @staticmethod
+    def _strip_or_none(fragment):
+        """Remove tags from an extracted HTML fragment; pass ``None`` through."""
+        return remove_tags(fragment) if fragment is not None else None
+    def start_requests(self):
+        """Request the archive page of the date passed with ``-a``."""
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        day = getattr(self, 'day', None)
+        self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        """Schedule the first listing page and any further pages of the archive."""
+        # The first page is this same URL, so it must bypass the duplicate filter.
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.css('div.vw-page-navigation-pagination').css('a::attr(href)').extract()
+        # The page count is read from the second-to-last link, so at least two
+        # links are required; a lone link used to raise IndexError here.
+        if len(pagination) >= 2:
+            pagination = pagination[-2].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+    def parse_page(self, response):
+        """Follow every article link found on a listing page."""
+        for link in response.css('div.vw-post-loop-inner').css('div.vw-post-box-inner').xpath('./h3/a/@href').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+    def parse_item(self, response):
+        """Build one ``NoticiasItem`` from an article page.
+        Title, topic and author are stored as ``None`` when their element is
+        missing, instead of failing on the tag removal.
+        """
+        item = NoticiasItem()
+        ## the article's date already includes the time zone
+        item['date'] = response.xpath('//time[@itemprop="datePublished"]/@datetime').extract_first()
+        item['title'] = self._strip_or_none(response.xpath('//div[@class="vw-page-content"]/article/h1[@class="entry-title"]').extract_first())
+        item['topic'] = self._strip_or_none(response.xpath('//div[@class="vw-page-content"]/article/div[@class="vw-post-categories"]/a').extract_first())
+        item['author'] = self._strip_or_none(response.xpath('//span[@itemprop="author"]/a[@class="author-name"]').extract_first())
+        paragraphs = response.xpath('//div[@itemprop="articleBody"]/p').extract()
+        # One line per paragraph, each terminated by a newline.
+        item['text'] = ''.join(remove_tags(paragraph) + '\n' for paragraph in paragraphs)
+        item['url'] = response.url
+        yield item
--- a/spiders/daily/DEPRECIATED/laJornadaAgs_/scrapy.cfg
+++ b/spiders/daily/DEPRECIATED/laJornadaAgs_/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+[settings]
+default = laJornadaAgs.settings
+[deploy]
+#url = http://localhost:6800/
+project = laJornadaAgs
--- a/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/__init__.py
+++ b/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/__init__.py
--- a/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/items.py
+++ b/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/items.py
+# -*- coding: utf-8 -*-
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+import scrapy
+class NoticiasItem(scrapy.Item):
+    """Scrapy item describing one news article scraped by this project."""
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()  # headline, or None when the page has none
+    text = scrapy.Field()  # body text with the markup removed
+    date = scrapy.Field()  # ISO-format date string of the crawled day
+    location = scrapy.Field()  # declared, but this project's spider does not set it
+    author = scrapy.Field()  # declared, but this project's spider does not set it
+    topic = scrapy.Field()  # tag label, or None when the page has none
+    url = scrapy.Field()  # address the article was downloaded from
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/middlewares.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/middlewares.py
@@ -8,7 +8,7 @@
 from scrapy import signals
-class EdomexdiaSpiderMiddleware(object):
+class LectormxSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

--- a/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/pipelines.py
+++ b/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/pipelines.py
+# -*- coding: utf-8 -*-
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    """Item pipeline that stores every scraped item in a single JSON array file.
+    The output path is read from the ``filename`` Scrapy setting, e.g.
+    ``-s filename=2017-03-30.json`` on the command line.
+    """
+    # Order in which fields are written; a field missing from an item is skipped.
+    FIELD_ORDER = ("date", "topic", "title", "author", "location", "text", "url")
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from the crawler's ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+    def open_spider(self, spider):
+        """Open the output file and write the opening bracket of the array."""
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        """Write the closing bracket and close the file."""
+        try:
+            self.file.write("]")
+        finally:
+            # Release the handle even when the final write fails.
+            self.file.close()
+    def process_item(self, item, spider):
+        """Append ``item`` to the output file and return it unchanged.
+        Only the fields named in ``FIELD_ORDER`` are written, in that order.
+        """
+        row = []
+        for key in self.FIELD_ORDER:
+            try:
+                row.append((key, item[key]))
+            except KeyError:
+                # The spider never set this field: leave it out of the record.
+                # Any other error is a real bug and must not be hidden.
+                continue
+        line = OrderedDict(row)
+        self.counter += 1
+        # Records after the first need a separator to keep the array valid JSON.
+        separator = "" if self.counter == 1 else ",\n"
+        self.file.write(separator + json.dumps(line))
+        return item
--- a/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/settings.py
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/edoMexDia/settings.py
 # -*- coding: utf-8 -*-
-# Scrapy settings for edoMexDia project
+# Scrapy settings for lectorMX project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
@@ -9,14 +9,14 @@
 #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-BOT_NAME = 'edoMexDia'
+BOT_NAME = 'lectorMX'
-SPIDER_MODULES = ['edoMexDia.spiders']
+SPIDER_MODULES = ['lectorMX.spiders']
-NEWSPIDER_MODULE = 'edoMexDia.spiders'
+NEWSPIDER_MODULE = 'lectorMX.spiders'
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'edoMexDia (+http://www.yourdomain.com)'
+#USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'
 # Obey robots.txt rules
 # ROBOTSTXT_OBEY = True
@@ -47,13 +47,13 @@ COOKIES_ENABLED = False
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
-#    'edoMexDia.middlewares.EdomexdiaSpiderMiddleware': 543,
+#    'lectorMX.middlewares.LectormxSpiderMiddleware': 543,
 #}
 # Enable or disable downloader middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
 #DOWNLOADER_MIDDLEWARES = {
-#    'edoMexDia.middlewares.MyCustomDownloaderMiddleware': 543,
+#    'lectorMX.middlewares.MyCustomDownloaderMiddleware': 543,
 #}
 # Enable or disable extensions
@@ -65,7 +65,7 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-   'edoMexDia.pipelines.JsonWriterPipeline': 300,
+   'lectorMX.pipelines.JsonWriterPipeline': 300,
 }
 # Enable and configure the AutoThrottle extension (disabled by default)

--- a/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/spiders/__init__.py
+++ b/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/spiders/noticias.py
+++ b/spiders/daily/DEPRECIATED/lectorMX_/lectorMX/spiders/noticias.py
+# -*- coding: utf-8 -*-
+"""
+MEDIA:
+    Lector MX, Yucatán
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd lectorMX/
+    $ scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
+"""
+import scrapy, re
+from lectorMX.items import NoticiasItem
+from datetime import datetime, timedelta, tzinfo
+TAG_RE = re.compile(r'<[^>]+>')
+def remove_tags(text):
+    """Return ``text`` with every ``<...>`` markup tag deleted."""
+    # Splitting on the tag pattern and gluing the pieces back drops each tag.
+    return ''.join(TAG_RE.split(text))
+class UTC(tzinfo):
+    """
+    Fixed-offset time zone, six hours behind UTC, used to stamp scraped dates.
+    """
+    def utcoffset(self, dt):
+        ## Time zone for Yucatán: UTC-6 ##
+        return timedelta(hours=-6)
+    def tzname(self, dt):
+        ## Time zone name ##
+        return 'UTC-6'
+    def dst(self, dt):
+        ## Fixed offset, so no daylight-saving adjustment. Without this override
+        ## the base tzinfo.dst() raises NotImplementedError. ##
+        return timedelta(0)
+class QuotesSpider(scrapy.Spider):
+    """
+    Spider for Lector MX: walks the site's archive for one date, given by the
+    ``year``, ``month`` and ``day`` spider arguments, and emits one item per article.
+    """
+    name = "noticias"
+    def start_requests(self):
+        """Store the crawl date as an ISO string and request that day's archive."""
+        zone = UTC()
+        year, month, day = (getattr(self, part, None) for part in ('year', 'month', 'day'))
+        stamp = datetime(int(year), int(month), int(day), tzinfo=zone)
+        # Every item of this run carries the requested date.
+        self.news_date = stamp.isoformat('T')
+        archive_url = 'http://lectormx.com/' + '/'.join((year, month, day))
+        yield scrapy.Request(url=archive_url, callback=self.parse)
+    def parse(self, response):
+        """Follow each article on a listing page, then the "next" page if any."""
+        article_links = response.css('div.paginated_content').css('h2.entry-title > a::attr(href)').extract()
+        for href in article_links:
+            yield scrapy.Request(url=href, callback=self.parse_item)
+        following = response.css('div.archive-pagination').css('a.next::attr(href)').extract_first()
+        if following is None:
+            return
+        yield scrapy.Request(url=following, callback=self.parse)
+    def parse_item(self, response):
+        """Build one ``NoticiasItem`` from an article page."""
+        raw_title = response.css('h1.entry-title').extract_first()
+        raw_topic = response.xpath('//a[@rel="tag"]').extract_first()
+        paragraphs = response.css('div.entry-content > p').extract()
+        body = ''.join(remove_tags(fragment) + '\n' for fragment in paragraphs)
+        item = NoticiasItem()
+        ## News item info ##
+        item['date']  = self.news_date
+        item['title'] = None if raw_title is None else remove_tags(raw_title)
+        item['topic'] = None if raw_topic is None else remove_tags(raw_topic)
+        item['text']  = body.strip()
+        item['url']   = response.url
+        yield item
--- a/spiders/daily/DEPRECIATED/edoMexDia_/scrapy.cfg
+++ b/spiders/daily/DEPRECIATED/edoMexDia_/scrapy.cfg
@@ -4,8 +4,8 @@
 # https://scrapyd.readthedocs.org/en/latest/deploy.html
 [settings]
-default = edoMexDia.settings
+default = lectorMX.settings
 [deploy]
 #url = http://localhost:6800/
-project = edoMexDia
+project = lectorMX
--- a/spiders/daily/desdeElBalcon/2017-03-22.json
+++ b/spiders/daily/desdeElBalcon/2017-03-22.json
-[]
\ No newline at end of file
--- a/spiders/daily/desdeElBalcon/desdeElBalcon/items.py
+++ b/spiders/daily/desdeElBalcon/desdeElBalcon/items.py
-# -*- coding: utf-8 -*-
 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://docs.scrapy.org/en/latest/topics/items.html
 import scrapy
-class NoticiasItem(scrapy.Item):
+class DesdeelbalconItem(scrapy.Item):
 	# define the fields for your item here like:
 	# name = scrapy.Field()
+	date = scrapy.Field()
 	title = scrapy.Field()
 	text = scrapy.Field()
-    date = scrapy.Field()
 	location = scrapy.Field()
 	author = scrapy.Field()
 	topic = scrapy.Field()

--- a/spiders/daily/desdeElBalcon/desdeElBalcon/middlewares.py
+++ b/spiders/daily/desdeElBalcon/desdeElBalcon/middlewares.py
-# -*- coding: utf-8 -*-
 # Define here the models for your spider middleware
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from scrapy import signals
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
-class DesdeelbalconSpiderMiddleware(object):
+class DesdeelbalconSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
@@ -20,30 +21,29 @@ class DesdeelbalconSpiderMiddleware(object):
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s
-    def process_spider_input(response, spider):
+    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None
-    def process_spider_output(response, result, spider):
+    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
-        # Must return an iterable of Request, dict or Item objects.
+        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i
-    def process_spider_exception(response, exception, spider):
+    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
-        # Should return either None or an iterable of Response, dict
+        # Should return either None or an iterable of Request or item objects.
-        # or Item objects.
        pass
-    def process_start_requests(start_requests, spider):
+    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
@@ -54,3 +54,50 @@ class DesdeelbalconSpiderMiddleware(object):
    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
+class DesdeelbalconDownloaderMiddleware:
+    # Pass-through downloader middleware. Every hook is optional: when one
+    # is missing, scrapy acts as if the downloader middleware does not
+    # modify the passed objects.
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Build the middleware and subscribe it to the spider_opened signal.
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
+        return middleware
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware. Allowed outcomes:
+        # - return None: continue processing this request
+        # - return a Response object
+        # - return a Request object
+        # - raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+        # Allowed outcomes:
+        # - return a Response object
+        # - return a Request object
+        # - raise IgnoreRequest
+        # Here the response is handed back unchanged.
+        return response
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+        # Allowed outcomes:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        return None
+    def spider_opened(self, spider):
+        # Log the name of the spider that has just been opened.
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/spiders/daily/desdeElBalcon/desdeElBalcon/pipelines.py
+++ b/spiders/daily/desdeElBalcon/desdeElBalcon/pipelines.py
-# -*- coding: utf-8 -*-
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-import json
-from collections import OrderedDict
-class JsonWriterPipeline(object):
-    def __init__(self, filename):
-        self.filename = filename
-    @classmethod
-    def from_crawler(cls, crawler):
-        # Here you get whatever value was passed through the "filename" command line parameter
-        settings = crawler.settings
-        filename = settings.get('filename')
-        # Instantiate the pipeline with the file name
+# useful for handling different item types with a single interface
-        return cls(filename)
+from itemadapter import ItemAdapter
-    def open_spider(self, spider):
-        self.counter = 0
-        self.file = open(self.filename, 'w')
-        self.file.write("[")
-    def close_spider(self, spider):
-        self.file.write("]")
-        self.file.close()
+class DesdeelbalconPipeline:
+    # Item pipeline stage that performs no processing of its own.
    def process_item(self, item, spider):
-        # print("this is my item", item)
-        row = []
-        try:
-            row.append(("date", item['date']))
-        except:
-            pass
-        try:
-            row.append(("topic", item['topic']))
-        except:
-            pass
-        try:
-            row.append(("title", item['title']))
-        except:
-            pass
-        try:
-            row.append(("author", item['author']))
-        except:
-            pass
-        try:
-            row.append(("location", item['location']))
-        except:
-            pass
-        try:
-            row.append(("text", item['text']))
-        except:
-            pass
-        try:
-            row.append(("url", item['url']))
-        except:
-            pass
-        line = OrderedDict(row)
-        self.counter += 1
-        if self.counter == 1:
-            self.file.write(json.dumps(line))
-        elif self.counter > 1:
-            self.file.write(",\n" + json.dumps(line))
+        # Hand the item back unchanged.
        return item
--- a/spiders/daily/desdeElBalcon/desdeElBalcon/settings.py
+++ b/spiders/daily/desdeElBalcon/desdeElBalcon/settings.py
-# -*- coding: utf-8 -*-
 # Scrapy settings for desdeElBalcon project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 BOT_NAME = 'desdeElBalcon'
 SPIDER_MODULES = ['desdeElBalcon.spiders']
 NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
+FEED_EXPORT_ENCODING="utf-8"
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-# ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
 # Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 0.5
+#DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-COOKIES_ENABLED = False
+#COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
 #}
 # Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
 #    'desdeElBalcon.middlewares.DesdeelbalconSpiderMiddleware': 543,
 #}
 # Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #DOWNLOADER_MIDDLEWARES = {
-#    'desdeElBalcon.middlewares.MyCustomDownloaderMiddleware': 543,
+#    'desdeElBalcon.middlewares.DesdeelbalconDownloaderMiddleware': 543,
 #}
 # Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
 #EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
 #}
 # Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
+#ITEM_PIPELINES = {
-   'desdeElBalcon.pipelines.JsonWriterPipeline': 300,
+#    'desdeElBalcon.pipelines.DesdeelbalconPipeline': 300,
-}
+#}
 # Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 #AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 #AUTOTHROTTLE_START_DELAY = 5
@@ -82,7 +80,7 @@ ITEM_PIPELINES = {
 #AUTOTHROTTLE_DEBUG = False
 # Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 #HTTPCACHE_ENABLED = True
 #HTTPCACHE_EXPIRATION_SECS = 0
 #HTTPCACHE_DIR = 'httpcache'

--- a/spiders/daily/desdeElBalcon/desdeElBalcon/spiders/noticias.py
+++ b/spiders/daily/desdeElBalcon/desdeElBalcon/spiders/noticias.py
-# -*- coding: utf-8 -*-
+import scrapy
-import scrapy, re
-from datetime import datetime, timedelta, tzinfo
-from desdeElBalcon.items import NoticiasItem
-"""
-MEDIO:
-Desde el Balcon, Yucatan
-USO:
-scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
-"""
+from desdeElBalcon.items import DesdeelbalconItem
+import re
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
    return TAG_RE.sub('', text)
+class NoticiasSpider(scrapy.Spider):
+	"""
+	Spider for desdeelbalcon.com: crawls the archive listing of one date and
+	yields a DesdeelbalconItem per article. The date is read from the
+	``year``, ``month`` and ``day`` spider attributes in start_requests.
+	"""
+	name = 'noticias'
-class UTC(tzinfo):
+	allowed_domains = ['desdeelbalcon.com']
-    """clase para el 'time zone' (zona horaria)"""
+	start_urls = ['http://desdeelbalcon.com/']
-    def utcoffset(self, dt):
-        # zona horaria para yucatan (centro de mexico): utc-6
-        return timedelta(hours=-6)
-    def tzname(self, dt):
-        # nombre de la zona horaria
-        return 'UTC-6'
-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
 	def start_requests(self):
+		"""Request the archive page of the configured date (month and day zero-padded)."""
-        self.tz = UTC()
+		year = getattr(self, "year", None)
-        self.year = getattr(self, 'year', None)
+		month = getattr(self, "month", None)
-        self.month = getattr(self, 'month', None)
+		day = getattr(self, "day", None)
-        self.day = getattr(self, 'day', None)
-        self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day
+		# Use the local day consistently with year and month.
+		self.date = year + "-" + month.zfill(2) + "-" + day.zfill(2)
+		self.baseURL = "http://desdeelbalcon.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2) + "/"
 		yield scrapy.Request(url=self.baseURL, callback=self.parse)
 	def parse(self, response):
+		"""Yield one request per listed article, then follow the "next" pagination link."""
-        print(response.url)
+		for link in response.xpath('//article//h3/a/@href').extract():
-        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+			yield scrapy.Request(url=link, callback=self.parse_item)
+		nextPage = response.xpath('//div[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
-        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
+		if nextPage is not None:
-        if len(pagination) > 0:
+			# Listing pages are handled by parse itself; this class defines no parsePage callback.
+			yield scrapy.Request(url=nextPage, callback=self.parse)
-            pagination = pagination[-1].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(1, pages):
-                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-    def parse_page(self, response):
-        item = NoticiasItem()
-        for post in response.xpath('//ul[@class="archivepost"]/li'):
-            # item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
-            item['date'] = datetime(int(self.year),int(self.month),int(self.day),tzinfo=self.tz).isoformat('T')
-            item['topic'] = post.xpath('./p/a/text()').extract()
-            request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
-            request.meta['item'] = item
-            yield request
 	def parse_item(self, response):
+		"""Build a DesdeelbalconItem from one article page; date and topic come from its meta tags."""
-        text = ''
+		item = DesdeelbalconItem()
-        item = response.meta['item']
+		item['date']  = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+		item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
+		item['topic'] = response.xpath('//meta[@property="article:section"]/@content').extract_first()
-        item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
+		paragraphs = response.xpath('//div[contains(@class,"entry-content")]/p/text()').extract()
+		text=""
+		for p in paragraphs:
+			# NOTE(review): these are text() nodes, so markup is presumably already gone and the two clean-ups below are a safeguard.
+			p = p.replace("<br>", "\n")
+			text += remove_tags(p) + "\n"
-        for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
-            text += remove_tags(paragraph) + '\n'
 		item['text']  = text
 		item['url']   = response.url
+		print(item["title"])
-        print (item['title'])
+		yield(item)
-        yield item
--- a/spiders/daily/desdeElBalcon/scrapy.cfg
+++ b/spiders/daily/desdeElBalcon/scrapy.cfg
 # Automatically created by: scrapy startproject
 #
 # For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
 [settings]
 default = desdeElBalcon.settings

--- a/spiders/daily/diarioCoLatino/diarioCoLatino/settings.py
+++ b/spiders/daily/diarioCoLatino/diarioCoLatino/settings.py
@@ -11,8 +11,8 @@ BOT_NAME = 'diarioCoLatino'
 SPIDER_MODULES = ['diarioCoLatino.spiders']
 NEWSPIDER_MODULE = 'diarioCoLatino.spiders'
-FEED_EXPORT_ENCODING="utf-8"
+FEED_EXPORT_ENCODING="utf-8"
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'diarioCoLatino (+http://www.yourdomain.com)'

--- a/spiders/daily/diarioCoLatino/diarioCoLatino/spiders/noticias.py
+++ b/spiders/daily/diarioCoLatino/diarioCoLatino/spiders/noticias.py
@@ -2,11 +2,10 @@ import scrapy
 from diarioCoLatino.items import DiariocolatinoItem
 import re
-#-------------------------------------------------------------------------------
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
 	return TAG_RE.sub('', text)
-#-----------------------------------
 #-------------------------------------------------------------------------------
 class NoticiasSpider(scrapy.Spider):
 	name = 'noticias'
@@ -20,28 +19,27 @@ class NoticiasSpider(scrapy.Spider):
 		self.baseURL = "https://www.diariocolatino.com/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
-		yield scrapy.Request(url=self.baseURL, callback=self.parseDate)
+		yield scrapy.Request(url=self.baseURL, callback=self.parsePage)
 	#-----------------------------------------------------------------------
-	def parseDate(self, response):
+	def parsePage(self, response):
 		print(response.url)
-		for page in response.css('span#tie-next-page').css("a::attr(href)").extract():
+		for link in response.xpath('//article/h2/a/@href').extract():
-			yield scrapy.Request(url=page, callback=self.parseDate)
-		for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
 			yield scrapy.Request(url=link, callback=self.parseItem)
+		nextPage = response.xpath('//span[@id="tie-next-page"]/a/@href').extract_first()
+		if nextPage is not None:
+			yield scrapy.Request(url=nextPage, callback=self.parsePage)
 	#-----------------------------------------------------------------------
 	def parseItem(self, response):
-		print(response.url)
 		item = DiariocolatinoItem()
 		item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
-		item["title"] = response.xpath('//meta[@property="og:title"]/@content').extract_first().replace("- Diario Co Latino","")
+		item["title"] = response.xpath('//meta[@property="og:title"]/@content').extract_first().replace("- Diario Co Latino","").strip()
 		item["topic"] = response.css("span.post-cats").css("a::text").extract_first().lower()
 		text=""
 		for p in response.xpath('//div[@class="entry"]/p').extract():
 			text += remove_tags(p) + "\n"
 		item["text"]=text
 		item["url"]=response.url
+		print(item['title'])
-		print(item)
+		yield(item)
-		yield item
--- a/spiders/daily/diarioPuntual/2018-09-05.json
+++ b/spiders/daily/diarioPuntual/2018-09-05.json
--- a/spiders/daily/diarioPuntual/2020-05-05.json
+++ b/spiders/daily/diarioPuntual/2020-05-05.json
--- a/spiders/daily/diarioPuntual/diarioPuntual/items.py
+++ b/spiders/daily/diarioPuntual/diarioPuntual/items.py
-# -*- coding: utf-8 -*-
 # Define here the models for your scraped items
 #
 # See documentation in:
-# https://doc.scrapy.org/en/latest/topics/items.html
+# https://docs.scrapy.org/en/latest/topics/items.html
 import scrapy
-class NoticiasItem(scrapy.Item):
+class DiariopuntualItem(scrapy.Item):
 	# define the fields for your item here like:
 	# name = scrapy.Field()
+	date = scrapy.Field()
 	title = scrapy.Field()
 	text = scrapy.Field()
-    date = scrapy.Field()
 	location = scrapy.Field()
 	author = scrapy.Field()
 	topic = scrapy.Field()

--- a/spiders/daily/diarioPuntual/diarioPuntual/middlewares.py
+++ b/spiders/daily/diarioPuntual/diarioPuntual/middlewares.py
-# -*- coding: utf-8 -*-
 # Define here the models for your spider middleware
 #
 # See documentation in:
-# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 from scrapy import signals
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
-class DiariopuntualSpiderMiddleware(object):
+class DiariopuntualSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.
@@ -31,7 +32,7 @@ class DiariopuntualSpiderMiddleware(object):
        # Called with the results returned from the Spider, after
        # it has processed the response.
-        # Must return an iterable of Request, dict or Item objects.
+        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i
@@ -39,8 +40,7 @@ class DiariopuntualSpiderMiddleware(object):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
-        # Should return either None or an iterable of Response, dict
+        # Should return either None or an iterable of Request or item objects.
-        # or Item objects.
        pass
    def process_start_requests(self, start_requests, spider):
@@ -56,7 +56,7 @@ class DiariopuntualSpiderMiddleware(object):
        spider.logger.info('Spider opened: %s' % spider.name)
-class DiariopuntualDownloaderMiddleware(object):
+class DiariopuntualDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

--- a/spiders/daily/diarioPuntual/diarioPuntual/pipelines.py
+++ b/spiders/daily/diarioPuntual/diarioPuntual/pipelines.py
-# -*- coding: utf-8 -*-
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-import json
-from collections import OrderedDict
-class JsonWriterPipeline(object):
-    def __init__(self, filename):
-        self.filename = filename
-    @classmethod
-    def from_crawler(cls, crawler):
-        # Here you get whatever value was passed through the "filename" command line parameter
-        settings = crawler.settings
-        filename = settings.get('filename')
-        # Instantiate the pipeline with the file name
+# useful for handling different item types with a single interface
-        return cls(filename)
+from itemadapter import ItemAdapter
-    def open_spider(self, spider):
-        self.counter = 0
-        self.file = open(self.filename, 'w')
-        self.file.write("[")
-    def close_spider(self, spider):
-        self.file.write("]")
-        self.file.close()
+class DiariopuntualPipeline:
+    # Item pipeline stage that performs no processing of its own.
    def process_item(self, item, spider):
-        # print("this is my item", item)
-        row = []
-        try:
-            row.append(("date", item['date']))
-        except:
-            pass
-        try:
-            row.append(("topic", item['topic']))
-        except:
-            pass
-        try:
-            row.append(("title", item['title']))
-        except:
-            pass
-        try:
-            row.append(("author", item['author']))
-        except:
-            pass
-        try:
-            row.append(("location", item['location']))
-        except:
-            pass
-        try:
-            row.append(("text", item['text']))
-        except:
-            pass
-        try:
-            row.append(("url", item['url']))
-        except:
-            pass
-        line = OrderedDict(row)
-        self.counter += 1
-        if self.counter == 1:
-            self.file.write(json.dumps(line))
-        elif self.counter > 1:
-            self.file.write(",\n" + json.dumps(line))
+        # Hand the item back unchanged.
        return item
--- a/spiders/daily/diarioPuntual/diarioPuntual/settings.py
+++ b/spiders/daily/diarioPuntual/diarioPuntual/settings.py
--- a/spiders/daily/diarioPuntual/diarioPuntual/spiders/noticias.py
+++ b/spiders/daily/diarioPuntual/diarioPuntual/spiders/noticias.py
--- a/spiders/daily/diarioYaqui/2020-05-17.json
+++ b/spiders/daily/diarioYaqui/2020-05-17.json
--- a/spiders/daily/edoMexDia/2021-03-22.json
+++ b/spiders/daily/edoMexDia/2021-03-22.json
--- a/spiders/daily/edoMexDia/edoMexDia/spiders/noticias.py
+++ b/spiders/daily/edoMexDia/edoMexDia/spiders/noticias.py
--- a/spiders/daily/elComentario/2020-03-18.json
+++ b/spiders/daily/elComentario/2020-03-18.json
--- a/spiders/daily/elComentario/elComentario/items.py
+++ b/spiders/daily/elComentario/elComentario/items.py
--- a/spiders/daily/elComentario/elComentario/middlewares.py
+++ b/spiders/daily/elComentario/elComentario/middlewares.py
--- a/spiders/daily/elComentario/elComentario/pipelines.py
+++ b/spiders/daily/elComentario/elComentario/pipelines.py
--- a/spiders/daily/elComentario/elComentario/settings.py
+++ b/spiders/daily/elComentario/elComentario/settings.py
--- a/spiders/daily/elComentario/elComentario/spiders/noticias.py
+++ b/spiders/daily/elComentario/elComentario/spiders/noticias.py
--- a/spiders/daily/elIndependiente/2020-03-03.json
+++ b/spiders/daily/elIndependiente/2020-03-03.json
--- a/spiders/daily/elIndependiente/2020-04-16.json
+++ b/spiders/daily/elIndependiente/2020-04-16.json
--- a/spiders/daily/elIndependiente/elIndependiente/items.py
+++ b/spiders/daily/elIndependiente/elIndependiente/items.py
--- a/spiders/daily/elIndependiente/elIndependiente/middlewares.py
+++ b/spiders/daily/elIndependiente/elIndependiente/middlewares.py
--- a/spiders/daily/elIndependiente/elIndependiente/pipelines.py
+++ b/spiders/daily/elIndependiente/elIndependiente/pipelines.py
--- a/spiders/daily/elIndependiente/elIndependiente/settings.py
+++ b/spiders/daily/elIndependiente/elIndependiente/settings.py
--- a/spiders/daily/elIndependiente/elIndependiente/spiders/noticias.py
+++ b/spiders/daily/elIndependiente/elIndependiente/spiders/noticias.py
--- a/spiders/daily/elIndependiente/scrapy.cfg
+++ b/spiders/daily/elIndependiente/scrapy.cfg
--- a/spiders/daily/elSur/2020-03-27.json
+++ b/spiders/daily/elSur/2020-03-27.json
--- a/spiders/daily/elSur/elSur/items.py
+++ b/spiders/daily/elSur/elSur/items.py
--- a/spiders/daily/elSur/elSur/middlewares.py
+++ b/spiders/daily/elSur/elSur/middlewares.py
--- a/spiders/daily/elSur/elSur/pipelines.py
+++ b/spiders/daily/elSur/elSur/pipelines.py
--- a/spiders/daily/elSur/elSur/settings.py
+++ b/spiders/daily/elSur/elSur/settings.py
--- a/spiders/daily/elSur/elSur/spiders/noticias.bk
+++ b/spiders/daily/elSur/elSur/spiders/noticias.bk
--- a/spiders/daily/elSur/elSur/spiders/noticias.py
+++ b/spiders/daily/elSur/elSur/spiders/noticias.py
--- a/spiders/daily/elSur/out_test.json
+++ b/spiders/daily/elSur/out_test.json
--- a/spiders/daily/elValle/2020-01-25.json
+++ b/spiders/daily/elValle/2020-01-25.json
--- a/spiders/daily/expresoChiapas/2020-01-19.json
+++ b/spiders/daily/expresoChiapas/2020-01-19.json
--- a/spiders/daily/grilloPorteno/2020-02-23.json
+++ b/spiders/daily/grilloPorteno/2020-02-23.json
--- a/spiders/daily/heraldoAgs/2017-04-23.json
+++ b/spiders/daily/heraldoAgs/2017-04-23.json
--- a/spiders/daily/laJornada/2020-03-03.json
+++ b/spiders/daily/laJornada/2020-03-03.json
--- a/spiders/daily/laJornada/laJornada/spiders/noticias.py
+++ b/spiders/daily/laJornada/laJornada/spiders/noticias.py
--- a/spiders/daily/laJornadaAgs/2020-01-19.json
+++ b/spiders/daily/laJornadaAgs/2020-01-19.json
--- a/spiders/daily/laJornadaAgs/2020-04-15.json
+++ b/spiders/daily/laJornadaAgs/2020-04-15.json
--- a/spiders/daily/laJornadaAgs/laJornadaAgs/items.py
+++ b/spiders/daily/laJornadaAgs/laJornadaAgs/items.py
--- a/spiders/daily/laJornadaAgs/laJornadaAgs/middlewares.py
+++ b/spiders/daily/laJornadaAgs/laJornadaAgs/middlewares.py
--- a/spiders/daily/laJornadaAgs/laJornadaAgs/pipelines.py
+++ b/spiders/daily/laJornadaAgs/laJornadaAgs/pipelines.py
--- a/spiders/daily/laJornadaAgs/laJornadaAgs/settings.py
+++ b/spiders/daily/laJornadaAgs/laJornadaAgs/settings.py
--- a/spiders/daily/laJornadaAgs/laJornadaAgs/spiders/noticias.py
+++ b/spiders/daily/laJornadaAgs/laJornadaAgs/spiders/noticias.py
--- a/spiders/daily/laJornadaAgs/scrapy.cfg
+++ b/spiders/daily/laJornadaAgs/scrapy.cfg
--- a/spiders/daily/laJornadaGro/2020-04-15.json
+++ b/spiders/daily/laJornadaGro/2020-04-15.json
--- a/spiders/daily/laJornadaZac/2020-04-15.json
+++ b/spiders/daily/laJornadaZac/2020-04-15.json
--- a/spiders/daily/lectorMX/lectorMX/items.py
+++ b/spiders/daily/lectorMX/lectorMX/items.py
--- a/spiders/daily/lectorMX/lectorMX/middlewares.py
+++ b/spiders/daily/lectorMX/lectorMX/middlewares.py
--- a/spiders/daily/lectorMX/lectorMX/pipelines.py
+++ b/spiders/daily/lectorMX/lectorMX/pipelines.py
--- a/spiders/daily/lectorMX/lectorMX/settings.py
+++ b/spiders/daily/lectorMX/lectorMX/settings.py
--- a/spiders/daily/lectorMX/lectorMX/spiders/noticias.py
+++ b/spiders/daily/lectorMX/lectorMX/spiders/noticias.py
--- a/spiders/daily/lectorMX/scrapy.cfg
+++ b/spiders/daily/lectorMX/scrapy.cfg
--- a/spiders/daily/noticias.py
+++ b/spiders/daily/noticias.py
--- a/spiders/daily/noticieroLinea/out_test.json
+++ b/spiders/daily/noticieroLinea/out_test.json
--- a/spiders/daily/opinionPuebla/2019-09-19.json
+++ b/spiders/daily/opinionPuebla/2019-09-19.json
--- a/spiders/daily/surDeCampeche/2019-09-19.json
+++ b/spiders/daily/surDeCampeche/2019-09-19.json