Commit ea7ae846 authored by Renán Sosa Guillen

crawlers

parent 54164b6d
# -*- coding: utf-8 -*-
import scrapy, re
from alChile.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from campecheHoy.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...

# -*- coding: utf-8 -*-
import scrapy, re
from grilloPorteno.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from heraldoAgs.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaAgs.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaOte.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
...

# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaZac.items import NoticiasItem
...
@@ -8,7 +8,13 @@
import scrapy
-class LarazonItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LarazonPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
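The JsonWriterPipeline added here (and repeated verbatim in each project below) streams items into a single JSON array: open_spider writes "[", the first item is written bare, every later item is prefixed with ",\n", and close_spider writes "]". A quick way to sanity-check a file produced this way — the filename is only an example taken from the spider docstrings, not a file that ships with the commit:

import json

# Produced by a run such as:
#   scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
with open("2017-09-28.json") as f:
    items = json.load(f)  # the "[", comma-separated objects and "]" form a valid JSON array

print(len(items))
print(items[0].get("title"))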
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laRazon.spiders'
#USER_AGENT = 'laRazon (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laRazon.pipelines.LarazonPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laRazon.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from laRazon.items import NoticiasItem
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+La Razón de México, CDMX
+USO:
+scrapy crawl noticias --nolog -s filename=2017-09-28.json -a year=2017 -a month=9 -a day=28
+"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -10,16 +15,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)
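All of these spiders share the same small helper for stripping markup from scraped fragments; a self-contained illustration (the sample HTML string is invented):

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    # Drop anything that looks like an HTML tag; keep the text in between.
    return TAG_RE.sub('', text)

print(remove_tags('<p>Hola <b>mundo</b></p>'))  # -> Hola mundo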
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -28,20 +23,18 @@ class QuotesSpider(scrapy.Spider):
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='https://www.razon.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "https://www.razon.com.mx/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/')+1:])
-        for page in range(0,pages):
-            if page == 0:
-                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-            else:
+        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
...
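Taken together, the spider-side change in this commit follows one pattern: the first results page for the requested date is handed straight to the article-level callback, the page count is read from the pagination links, and pages 2..N are requested in a single loop. The sketch below is reconstructed from the diff above — the class name, the fixed date and the empty parse_page are placeholders for illustration, not the verbatim file:

import scrapy

class NoticiasSketchSpider(scrapy.Spider):
    # Hypothetical class for illustration; the real spider is QuotesSpider with name "noticias".
    name = "noticias_sketch"

    def start_requests(self):
        # In the real spider year/month/day come from -a command line arguments; fixed here.
        self.baseURL = "https://www.razon.com.mx/2017/9/28"
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        # First results page goes straight to the article-level parser.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        # The "last" pagination link ends in the total number of pages.
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/') + 1:])
        # Pages 2..pages are requested explicitly (page 1 was already yielded above).
        for page in range(1, pages):
            yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        pass  # article extraction omitted in this sketch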
@@ -8,7 +8,13 @@
import scrapy
-class LaverdadyucItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LaverdadyucPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laVerdadYuc.spiders'
#USER_AGENT = 'laVerdadYuc (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-DOWNLOAD_DELAY = 2
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laVerdadYuc.pipelines.LaverdadyucPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laVerdadYuc.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from laVerdadYuc.items import NoticiasItem
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-'''
+"""
+MEDIO:
+La Verdad Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
# class QuotesSpider(scrapy.Spider):
#     name = "noticias"
@@ -72,7 +69,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
+        self.baseURL = "http://laverdadnoticias.com/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy
-class LectormxItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LectormxPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'lectorMX.spiders'
#USER_AGENT = 'lectorMX (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'lectorMX.pipelines.LectormxPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'lectorMX.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
+"""
+MEDIO:
+Lector MX, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
@@ -22,16 +27,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -47,17 +42,15 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
...
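lectorMX (like notirivas and unoMasUno below) timestamps each crawl with a small tzinfo subclass; only its tzname value ('UTC-6') is visible in this diff, so the version below is an assumed minimal reconstruction for illustration — the -6 h utcoffset and the zero dst are assumptions, not code from the commit:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Assumed minimal UTC-6 timezone, reconstructed for illustration."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)  # assumption: central Mexico offset
    def tzname(self, dt):
        return 'UTC-6'              # this string does appear in the diff
    def dst(self, dt):
        return timedelta(0)

# The spiders build a date string roughly like this:
print(datetime(2017, 3, 30, tzinfo=UTC()).isoformat('T'))  # 2017-03-30T00:00:00-06:00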
@@ -8,7 +8,13 @@
import scrapy
-class MipuntodevistaItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

# -*- coding: utf-8 -*-
import scrapy, re
+from miPuntoDeVista.items import NoticiasItem
"""
+MEDIO:
+Mi Punto de Vista, Yucatán
USO:
scrapy crawl noticias --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
@@ -10,16 +14,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
@@ -8,7 +8,13 @@
import scrapy
-class NotirivasItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class NotirivasPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notirivas.spiders'
#USER_AGENT = 'notirivas (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notirivas.pipelines.NotirivasPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notirivas.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from notirivas.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notirivas, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
@@ -22,16 +27,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -41,23 +36,22 @@ class QuotesSpider(scrapy.Spider):
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
-        self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
+        self.baseURL = "http://gruporivas.com.mx/notirivas/" + year + "/" + month + "/" +day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[0]
            pages = int(pagination[pagination.rfind(' ')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
...
@@ -8,7 +8,13 @@
import scrapy
-class NotisuresteItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class NotisurestePipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'notisureste.spiders'
#USER_AGENT = 'notisureste (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'notisureste.pipelines.NotisurestePipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'notisureste.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from notisureste.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Notisureste, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,26 +21,22 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://www.notisureste.com/'+year+'/'+month+'/'+day
-        urls = [
-            self.baseURL,
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
+        self.baseURL = "http://www.notisureste.com/" + year + "/" + month + "/" + day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,int(pages)):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
@@ -73,4 +64,3 @@ class QuotesSpider(scrapy.Spider):
        # print item['title']
        yield item
@@ -8,7 +8,13 @@
import scrapy
-class PuntomedioItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class PuntomedioPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'puntoMedio.spiders'
#USER_AGENT = 'puntoMedio (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'puntoMedio.pipelines.PuntomedioPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'puntoMedio.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
+from puntoMedio.items import NoticiasItem
import scrapy, re
-'''
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
-'''
+"""
+MEDIO:
+Punto Medio, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2018-09-28.json -a year=2017 -a month=9 -a day=28
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,7 +21,8 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL = 'http://www.puntomedio.mx/'+year+'/'+month+'/'+day
+        self.baseURL = "http://www.puntomedio.mx/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
@@ -8,7 +8,13 @@
import scrapy
-class Sona893Item(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class Sona893Pipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'sona893.spiders'
#USER_AGENT = 'sona893 (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'sona893.pipelines.Sona893Pipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'sona893.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from sona893.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Sona 89.3, Yucatán
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -25,27 +21,23 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://sona893.fm/'+year+'/'+month+'/'+day
+        self.baseURL = "http://sona893.fm/" + year + "/" + month + "/" + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
+            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
    def parse_page(self, response):
        for post in response.css('div.mosaicflow').css('div.post'):
...
@@ -8,7 +8,13 @@
import scrapy
-class TribunacabosItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()

@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class TribunacabosPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item

@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'tribunaCabos.pipelines.TribunacabosPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'tribunaCabos.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...

# -*- coding: utf-8 -*-
import scrapy, re
+from tribunaCabos.items import NoticiasItem
"""
MEDIO:
@@ -18,28 +20,6 @@ DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]
DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
-# def parseLocation(p, sign):
-#     p = p.split(sign)
-#     location = p[0].strip()
-#     del p[0]
-#     for j in range(0, len(p)):
-#         p[j] = p[j].lstrip(" ")
-#         p[j] = p[j].rstrip(" ")
-#
-#     p = " ".join(p)
-#     return p, location
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -86,30 +66,6 @@ class QuotesSpider(scrapy.Spider):
            p = LOC_RE.sub('', p)
            text += p + "\n"
-        # for i in range(0, len(bodyText)):
-        #     p = remove_tags(bodyText[i])
-        #     if i == 0:
-        #         sign = u'.\u2013'
-        #         limit = 35
-        #         n = p.count(sign)
-        #         if n == 0:
-        #             sign = ".-"
-        #             limit = 30
-        #             n = p.count(sign)
-        #         if n > 0 and len(p.split(sign)[0]) <= limit:
-        #             loc = p.split(sign)
-        #             if len(loc[0].split(",")[0]) <= 20:
-        #                 p = loc
-        #                 loc = p[0].split(",")
-        #                 item['location'] = loc[0].strip()
-        #                 # item['location'] = item['location'].rstrip()
-        #                 del p[0]
-        #                 for j in range(0, len(p)):
-        #                     p[j] = p[j].lstrip(" ")
-        #                     p[j] = p[j].rstrip(" ")
-        #                 p = " ".join(p)
-        #
-        #     text += p + "\n"
        item['text'] = text
        item['url'] = response.url
...
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class UnomasunoItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class UnomasunoPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'unoMasUno.spiders'
#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
...@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'unoMasUno.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
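These settings route items through JsonWriterPipeline, which takes its output path from the custom filename setting rather than Scrapy's feed exports, so the crawl is started with -s filename=... as the spider docstring below shows. Driving the same crawl from a script might look roughly like this sketch (the date values are only an example):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()            # loads the project's settings.py
settings.set('filename', '2017-09-22.json')  # read by JsonWriterPipeline.from_crawler
process = CrawlerProcess(settings)
process.crawl('noticias', year='2017', month='9', day='22')
process.start()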
......
# -*- coding: utf-8 -*-
import scrapy, re, json
from unoMasUno.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
OUTLET:
Uno Más Uno, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
...@@ -23,36 +28,27 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
if pagination is None:
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
...@@ -60,24 +56,15 @@ class QuotesSpider(scrapy.Spider):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
def parse_page(self, response):
...@@ -89,9 +76,18 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
try:
jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
jsonInfo = json.loads(remove_tags(jsonInfo))
dat = jsonInfo['datePublished']
except:
try:
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',', '').split(' ')
dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), tzinfo=self.tz).isoformat("T")
except:
dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
item['date'] = dat
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
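The reworked parse_page resolves the article date in three steps: the datePublished field of the page's JSON-LD block, then the Spanish date in the post-meta span, and finally the year/month/day the spider was launched with. A standalone sketch of that cascade (the helper name and arguments are illustrative, not from the commit):

import json, re
from datetime import datetime

TAG_RE = re.compile(r'<[^>]+>')
MONTHS = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
          'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

def resolve_date(ldjson_script, post_meta_text, year, month, day, tz=None):
    # 1) JSON-LD embedded by the site, e.g. {"datePublished": "2017-09-22T10:00:00-05:00", ...}
    if ldjson_script:
        try:
            return json.loads(TAG_RE.sub('', ldjson_script))['datePublished']
        except (ValueError, KeyError):
            pass
    # 2) a "22 septiembre, 2017" style string from the post-meta span
    if post_meta_text:
        try:
            d = post_meta_text.replace(',', '').split(' ')
            return datetime(int(d[2]), MONTHS[d[1].lower()], int(d[0]), tzinfo=tz).isoformat('T')
        except (ValueError, KeyError, IndexError):
            pass
    # 3) fall back to the date the spider was invoked with
    return datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')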
......
...@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class YucatanalamanoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
#USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from yucatanALaMano.items import NoticiasItem
"""
OUTLET:
Yucatán a la Mano, Yuc.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
...@@ -25,28 +21,23 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanalamano.com/'+year+'/'+month+'/'+day
self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
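The parse methods above derive the number of listing pages from a pagination link whose URL ends in .../page/N, then queue pages 2..N while page 1 is re-parsed directly with dont_filter. A small worked sketch of that parsing step, on a hypothetical URL:

last_link = 'http://www.yucatanalamano.com/2017/03/22/page/7/'   # hypothetical last-page link
trimmed = last_link.strip('/')                                   # drop the trailing slash
pages = int(trimmed[trimmed.rfind('/') + 1:])                    # -> 7
for page in range(1, pages):                                     # page values 1..6
    print(trimmed.rsplit('/page/', 1)[0] + '/page/' + str(page + 1))   # /page/2 .. /page/7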
......
...@@ -8,7 +8,13 @@
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class YucatanencortoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'yucatanEnCorto.spiders'
#USER_AGENT = 'yucatanEnCorto (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yucatanEnCorto.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
# -*- coding: utf-8 -*-
import scrapy, re
from yucatanEnCorto.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
""" """
Crawler descarga noticias desde el 2017.10.18 MEDIO:
Yucatán en Corto, Yuc.
Esta versión descarga noticias desde el 2017.10.18, por cambio en la URL del sitio.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
...@@ -27,28 +22,23 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanencorto.com/noticias/'+year+'/'+month+'/'+day
self.baseURL = "http://www.yucatanencorto.com/noticias/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3/a/@href').extract():
......