Commit ea7ae846 authored by Renán Sosa Guillen

crawlers

parent 54164b6d
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...
# -*- coding: utf-8 -*-
import scrapy, re
from grilloPorteno.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from heraldoAgs.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaAgs.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaOte.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaZac.items import NoticiasItem
...
@@ -8,7 +8,13 @@
import scrapy

-class LarazonItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class LarazonPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
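Note: the JsonWriterPipeline added above builds a JSON array incrementally: it writes "[" in open_spider, the first item bare, every later item prefixed with ",\n", and "]" in close_spider, taking the output path from the filename setting (the -s filename=... flag shown in the spiders' USO strings). A minimal standalone sketch of that same write pattern, outside Scrapy and with made-up sample records:

import json
from collections import OrderedDict

# Hypothetical records, only to illustrate the bracket/comma bookkeeping used by the pipeline.
items = [
    {"title": "nota uno", "url": "http://example.com/1"},
    {"title": "nota dos", "url": "http://example.com/2"},
]

with open("noticias.json", "w") as f:          # plays the role of the 'filename' setting
    f.write("[")                               # open_spider()
    for counter, it in enumerate(items, start=1):
        line = OrderedDict(sorted(it.items()))
        if counter == 1:
            f.write(json.dumps(line))          # first item, no separator
        else:
            f.write(",\n" + json.dumps(line))  # later items, comma-separated
    f.write("]")                               # close_spider()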
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laRazon.spiders'
#USER_AGENT = 'laRazon (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laRazon.pipelines.LarazonPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laRazon.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from laRazon.items import NoticiasItem

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+La Razón de México, CDMX
+USO:
+scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='https://www.razon.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "https://www.razon.com.mx/" + year + "/" + month + "/" + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])

-        for page in range(0,pages):
-            if page == 0:
-                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-            else:
-                yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+        for page in range(1, pages):
+            yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3[@class="entry-title td-module-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
        ## '-06:00' corresponde al UTC-6, zona horaria del centro de mexico
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d

        item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
        ti = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
        if ti is None:
            ti = response.xpath('//header[@class="td-post-title"]/h1/text()').extract_first()
        item['title'] = ti
-        # print item['title']

        paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
        if len(paragraphs) <= 0:
            paragraphs = response.xpath('//*[@dir="auto"]').extract()
        for p in paragraphs:
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url

-        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy

-class LaverdadyucItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class LaverdadyucPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laVerdadYuc.spiders'
#USER_AGENT = 'laVerdadYuc (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-DOWNLOAD_DELAY = 2
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laVerdadYuc.pipelines.LaverdadyucPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laVerdadYuc.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from laVerdadYuc.items import NoticiasItem

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-'''
+"""
+MEDIO:
+La Verdad Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

# class QuotesSpider(scrapy.Spider):
#     name = "noticias"
@@ -72,7 +69,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
+
+        self.baseURL = "http://laverdadnoticias.com/" + year + "/" + month + "/" + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy

-class LectormxItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class LectormxPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'lectorMX.spiders'
#USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'lectorMX.pipelines.LectormxPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'lectorMX.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
+"""
+MEDIO:
+Lector MX, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

class UTC(tzinfo):
    """clase para el 'time zone' (zona horaria)"""
    def utcoffset(self, dt):
        # zona horaria para yucatan (centro de mexico): utc-6
        return timedelta(hours=-6)
    def tzname(self, dt):
        # nombre de la zona horaria
        return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
        self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        text = ''
        item = NoticiasItem()
        item['date'] = self.date
        item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
        item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
        for paragraph in response.css('div.post-single-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
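Note: the UTC(tzinfo) helper kept in the lectorMX spider stamps every item with a fixed UTC-6 offset via datetime(...).isoformat('T'). A minimal sketch of the value that ends up in item['date'], assuming the class exactly as defined above:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Zona horaria fija UTC-6 (centro de Mexico), igual que en el spider."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

# isoformat('T') produces the string stored in item['date'],
# e.g. '2017-03-30T00:00:00-06:00' for -a year=2017 -a month=3 -a day=30
print(datetime(2017, 3, 30, tzinfo=UTC()).isoformat('T'))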
@@ -8,7 +8,13 @@
import scrapy

-class MipuntodevistaItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
# -*- coding: utf-8 -*-
import scrapy, re
+from miPuntoDeVista.items import NoticiasItem

"""
+MEDIO:
+Mi Punto de Vista, Yucatán
USO:
scrapy crawl noticias --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
@@ -10,16 +14,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
@@ -8,7 +8,13 @@
import scrapy

-class NotirivasItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class NotirivasPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notirivas.spiders'
#USER_AGENT = 'notirivas (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notirivas.pipelines.NotirivasPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notirivas.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from notirivas.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notirivas, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

class UTC(tzinfo):
    """clase para el 'time zone' (zona horaria)"""
    def utcoffset(self, dt):
        # zona horaria para yucatan (centro de mexico): utc-6
        return timedelta(hours=-6)
    def tzname(self, dt):
        # nombre de la zona horaria
        return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
-        self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
+        self.baseURL = "http://gruporivas.com.mx/notirivas/" + year + "/" + month + "/" +day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[0]
            pages = int(pagination[pagination.rfind(' ')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//article/header/h2/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['date'] = self.date
        item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
        item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()

        content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
        if ( len(content) == 0 ):
            content = response.xpath('//*[@class="bdaia-post-content"]/div/div/p/text()').extract()
        if ( len(content) == 0 ):
            content = response.xpath('//*[@class="bdaia-post-content"]/p/span/text()').extract()

        for paragraph in content:
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy

-class NotisuresteItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class NotisurestePipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notisureste.spiders'
#USER_AGENT = 'notisureste (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notisureste.pipelines.NotisurestePipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notisureste.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
import scrapy, re
+from notisureste.items import NoticiasItem

-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notisureste, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://www.notisureste.com/'+year+'/'+month+'/'+day
-        urls = [
-            self.baseURL,
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
+        self.baseURL = "http://www.notisureste.com/" + year + "/" + month + "/" + day
+
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,int(pages)):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.css('div.td-ss-main-content').css('h3.entry-title').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()

        d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['url'] = response.url
        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()

        for paragraph in response.css('div.td-post-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text

        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy

-class PuntomedioItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class PuntomedioPipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'puntoMedio.spiders'
#USER_AGENT = 'puntoMedio (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'puntoMedio.pipelines.PuntomedioPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'puntoMedio.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
# -*- coding: utf-8 -*-
+from puntoMedio.items import NoticiasItem
import scrapy, re

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+Punto Medio, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2018-09-28.json -a year=2017 -a month=9 -a day=28
+"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,7 +21,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day
+
+        self.baseURL = "http://www.puntomedio.mx/" + year + "/" + month + "/" + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy

-class Sona893Item(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

+import json
+from collections import OrderedDict
+
-class Sona893Pipeline(object):
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
+
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'sona893.spiders'
#USER_AGENT = 'sona893 (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'sona893.pipelines.Sona893Pipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'sona893.pipelines.JsonWriterPipeline': 300,
+}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from sona893.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Sona 89.3, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://sona893.fm/'+year+'/'+month+'/'+day
+        self.baseURL = "http://sona893.fm/" + year + "/" + month + "/" + day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
             pagination = pagination[-1].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for post in response.css('div.mosaicflow').css('div.post'):
             item = NoticiasItem()
             item['topic'] = post.css('a.custom_cat_class_Kesha::text').extract_first()
             item['title'] = post.xpath('./h1/a/@title').extract_first()
             request = scrapy.Request(url=post.xpath('./h1/a/@href').extract_first(), callback=self.parse_item)
             request.meta['item'] = item
             yield request
     def parse_item(self, response):
         item = response.meta['item']
         text = ''
         d = response.xpath('/html/head/meta[@property="article:published_time"]').css('::attr(content)').extract_first()
         ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
         for paragraph in response.css('div.single_text').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
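Note on the date handling in parse_item above: the spider only rewrites the offset text of article:published_time, it does not convert the clock time. A minimal standalone sketch of that fix-up (the helper name and the sample timestamps are invented for illustration):

def normalize_offset(date_str):
    # Same check as the spider: any offset other than '-06:00' (UTC-6,
    # Yucatán / central Mexico) has its offset text replaced in place.
    if date_str[-6:] != '-06:00':
        date_str = date_str[:-6] + '-06:00'
    return date_str

print(normalize_offset("2017-03-22T10:15:00+00:00"))  # 2017-03-22T10:15:00-06:00
print(normalize_offset("2017-03-22T10:15:00-06:00"))  # unchanged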
@@ -8,7 +8,13 @@
 import scrapy
-class TribunacabosItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class TribunacabosPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
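The pipeline above writes a JSON array incrementally: "[" when the spider opens, the first item bare, "," plus a newline before every later item, and "]" on close. A self-contained sketch of that write pattern outside Scrapy (the sample items and the demo.json path are made up):

import json
from collections import OrderedDict

items = [OrderedDict([("title", "Nota 1")]), OrderedDict([("title", "Nota 2")])]

with open("demo.json", "w") as f:
    f.write("[")
    for counter, line in enumerate(items, start=1):
        # same branching as process_item: no separator before the first item
        f.write(json.dumps(line) if counter == 1 else ",\n" + json.dumps(line))
    f.write("]")
# demo.json now holds: [{"title": "Nota 1"},
# {"title": "Nota 2"}]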
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'tribunaCabos.pipelines.TribunacabosPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'tribunaCabos.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from tribunaCabos.items import NoticiasItem
 """
 MEDIO:
@@ -18,28 +20,6 @@ DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]
 DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
-# def parseLocation(p, sign):
-# p = p.split(sign)
-# location = p[0].strip()
-# del p[0]
-# for j in range(0, len(p)):
-# p[j] = p[j].lstrip(" ")
-# p[j] = p[j].rstrip(" ")
-#
-# p = " ".join(p)
-# return p, location
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
@@ -86,30 +66,6 @@ class QuotesSpider(scrapy.Spider):
             p = LOC_RE.sub('', p)
             text += p + "\n"
-            # for i in range(0, len(bodyText)):
-            # p = remove_tags(bodyText[i])
-            # if i == 0:
-            # sign = u'.\u2013'
-            # limit = 35
-            # n = p.count(sign)
-            # if n == 0:
-            # sign = ".-"
-            # limit = 30
-            # n = p.count(sign)
-            # if n > 0 and len(p.split(sign)[0]) <= limit:
-            # loc = p.split(sign)
-            # if len(loc[0].split(",")[0]) <= 20:
-            # p = loc
-            # loc = p[0].split(",")
-            # item['location'] = loc[0].strip()
-            # # item['location'] = item['location'].rstrip()
-            # del p[0]
-            # for j in range(0, len(p)):
-            # p[j] = p[j].lstrip(" ")
-            # p[j] = p[j].rstrip(" ")
-            # p = " ".join(p)
-            #
-            # text += p + "\n"
         item['text'] = text
         item['url'] = response.url
...
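As a point of reference, DAT2_RE defined above matches trailing Spanish datelines of the form ", a <día> de <mes> de <año>". A standalone check against an invented sentence (the sample text is not from the crawled sites):

import re

DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')

p = "La Paz, Baja California Sur, a 22 de marzo de 2017"
print(DAT2_RE.sub('', p))  # -> La Paz, Baja California Sur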
@@ -8,7 +8,13 @@
 import scrapy
-class UnomasunoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class UnomasunoPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'unoMasUno.spiders'
 #USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'unoMasUno.pipelines.UnomasunoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'unoMasUno.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
-import scrapy, re
+import scrapy, re, json
+from unoMasUno.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
-'''
+"""
+MEDIO:
+Uno Más Uno, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
 class UTC(tzinfo):
     """clase para el 'time zone' (zona horaria)"""
     def utcoffset(self, dt):
         # zona horaria para hidalgo (centro de mexico): utc-6
         return timedelta(hours=-6)
     def tzname(self, dt):
         # nombre de la zona horaria
         return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         self.tz = UTC()
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
+        self.year = getattr(self, 'year', None)
+        self.month = getattr(self, 'month', None)
+        self.day = getattr(self, 'day', None)
         self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                             'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                             'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
-        self.baseURL='http://www.unomasuno.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
         if pagination is None:
             pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
             if len(pagination) > 0:
                 pagination = pagination[-1].strip('/')
                 pages = int(pagination[pagination.rfind('/')+1:])
-                for page in range(0,pages):
-                    if page == 0:
-                        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                    else:
-                        yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
-            else:
-                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+                for page in range(1, pages):
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             pagination = pagination.strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if page == 0:
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
-        d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
-        d = d.replace(',','').split(' ')
-        item['date'] = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat('T')
+        try:
+            jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
+            jsonInfo = json.loads(remove_tags(jsonInfo))
+            dat = jsonInfo['datePublished']
+        except:
+            try:
+                d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
+                d = d.replace(',', '').split(' ')
+                dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
+            except:
+                dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
+        item['date'] = dat
         item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
         item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
         for p in response.xpath('//*[@class="entry"]/p').extract():
             text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
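A self-contained sketch of the fallback date handling in parse_item above, outside Scrapy: the Spanish month name taken from the post meta is mapped through date_parser and stamped with the UTC-6 tzinfo (the sample string is invented):

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
               'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
               'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

# same parsing steps as the spider: drop the comma, split on spaces
d = "22 septiembre, 2017".replace(',', '').split(' ')
print(datetime(int(d[2]), date_parser[d[1].lower()], int(d[0]), tzinfo=UTC()).isoformat("T"))
# 2017-09-22T00:00:00-06:00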
@@ -8,7 +8,13 @@
 import scrapy
-class YucatanalamanoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class YucatanalamanoPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
 #USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'yucatanALaMano.pipelines.YucatanalamanoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from yucatanALaMano.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Yucatán a la Mano, Yuc.
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://www.yucatanalamano.com/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
             pagination = pagination[-1].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
         item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first()
         d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
         ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
         item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
         for paragraph in response.css('div.entry-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
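The page count in parse() above is read off the last pagination href: drop the trailing slash and keep whatever follows the final "/". A standalone illustration with the same string handling (the hrefs are invented):

pagination = ["http://www.yucatanalamano.com/2017/03/22/page/2/",
              "http://www.yucatanalamano.com/2017/03/22/page/7/"]
last = pagination[-1].strip('/')
pages = int(last[last.rfind('/') + 1:])
print(pages)  # 7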
@@ -8,7 +8,13 @@
 import scrapy
-class YucatanencortoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class YucatanencortoPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanEnCorto.spiders'
 #USER_AGENT = 'yucatanEnCorto (+http://www.yourdomain.com)'
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'yucatanEnCorto.pipelines.YucatanencortoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'yucatanEnCorto.pipelines.JsonWriterPipeline': 300,
+}
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from yucatanEnCorto.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
 """
-Crawler descarga noticias desde el 2017.10.18
+MEDIO:
+Yucatán en Corto, Yuc.
+Esta versión descarga noticias desde el 2017.10.18, por cambio en la URL del sitio.
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
 """
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.yucatanencorto.com/noticias/" + year + "/" + month + "/" + day
         yield scrapy.Request(url=self.baseURL, callback=self.parse)
     def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
         pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
             pagination = pagination[-2].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
             for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
     def parse_page(self, response):
         for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
         title = response.xpath('//*[@class="td-post-header-holder"]/header/h1/text()').extract_first()
         if title is None:
             title = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()
         item['title'] = title
         d = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
         ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
         if d[-6:] != '-06:00':
             d = d[:-6] + '-06:00'
         item['date'] = d
         item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a/text()').extract()[1]
         paragraphs = response.xpath('//*[@class="td-post-content"]/div').extract()
         if len(paragraphs) <= 2:
             paragraphs = response.xpath('//*[@class="td-post-content"]/p').extract()
         for p in paragraphs:
             text += remove_tags(p) + '\n'
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item
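The body extraction in parse_item above prefers the <div> children of td-post-content and falls back to its <p> children when two or fewer are found. An offline sketch of that fallback against a made-up HTML snippet:

import re
from scrapy import Selector

TAG_RE = re.compile(r'<[^>]+>')
html = '<div class="td-post-content"><p>Primer párrafo.</p><p>Segundo párrafo.</p></div>'
sel = Selector(text=html)

paragraphs = sel.xpath('//*[@class="td-post-content"]/div').extract()
if len(paragraphs) <= 2:
    # no (or too few) direct <div> children, so fall back to the <p> children
    paragraphs = sel.xpath('//*[@class="td-post-content"]/p').extract()

text = ''.join(TAG_RE.sub('', p) + '\n' for p in paragraphs)
print(text)  # prints the two paragraphs, one per line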