Commit 5953d008 authored by Renán Sosa Guillen

crawlers

parent ba65cc37
@@ -8,7 +8,13 @@
import scrapy
-class AlchileItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
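Note: every project in this commit gets this same NoticiasItem definition. A quick, hypothetical illustration of how the shared item behaves (field names act like dict keys and unset fields stay absent, which is exactly what the new pipeline relies on; the sample value is made up):

import scrapy

class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

item = NoticiasItem()
item['title'] = 'Titular de prueba'   # sample value, not from a real crawl
print('author' in item)               # -> False: unset fields are simply missing
print(dict(item))                     # -> {'title': 'Titular de prueba'}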
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class AlchilePipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
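The JsonWriterPipeline above builds a JSON array by hand: "[" on open_spider, a comma-prefixed json.dumps per item, "]" on close_spider, with the output path taken from the -s filename=... crawl option via crawler.settings. A minimal, Scrapy-free sketch of that write pattern (the file name and sample items are illustrative only):

import json
from collections import OrderedDict

FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

class JsonArrayWriter(object):
    def __init__(self, filename):
        self.filename = filename

    def open(self):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")          # opening bracket, as in open_spider

    def write_item(self, item):
        # keep only the fields that are present, in a fixed order
        line = OrderedDict((f, item[f]) for f in FIELDS if f in item)
        self.counter += 1
        prefix = "" if self.counter == 1 else ",\n"
        self.file.write(prefix + json.dumps(line))

    def close(self):
        self.file.write("]")          # closing bracket, as in close_spider
        self.file.close()

writer = JsonArrayWriter("demo.json")
writer.open()
writer.write_item({"title": "Nota 1", "url": "http://example.com/1"})
writer.write_item({"title": "Nota 2", "url": "http://example.com/2"})
writer.close()                        # demo.json now holds a valid JSON array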
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'alChile.spiders'
#USER_AGENT = 'alChile (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'alChile.pipelines.AlchilePipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'alChile.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from alChile.items import NoticiasItem
"""
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+MEDIO:
+Al Chile, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -9,16 +9,6 @@ def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -26,26 +20,22 @@ class QuotesSpider(scrapy.Spider):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
-        self.baseURL='http://alchile.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
-        pagination = response.css('div.page-nav').css('a.last::attr(href)').extract()
-        if ( len(pagination) > 0 ):
-            pagination = pagination[0].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        pagination = response.css('div.page-nav').css('a::attr(href)').extract()
+        if len(pagination) > 0:
+            pagination = pagination[-2].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+
+            for page in range(1,pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
...
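The revised parse() always schedules the landing page and then, when a pagination block exists, derives the page count from one of its hrefs. A sketch of that count extraction with made-up hrefs (the assumption that the second-to-last link ends in the last page number mirrors the pagination[-2] line above):

hrefs = [                                        # hypothetical extract() output
    "http://alchile.com.mx/2017/03/22/page/2/",
    "http://alchile.com.mx/2017/03/22/page/9/",  # assumed "last page" link
    "http://alchile.com.mx/2017/03/22/page/2/",  # assumed "next" link
]

last = hrefs[-2].strip('/')
pages = int(last[last.rfind('/') + 1:])
print(pages)   # -> 9

# pages 2..pages are then requested as baseURL + "/page/" + str(page + 1)
print(["/page/%d" % (page + 1) for page in range(1, pages)])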
@@ -8,7 +8,13 @@
import scrapy
-class CampechehoyItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class CampechehoyPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'campecheHoy.pipelines.CampechehoyPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'campecheHoy.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from campecheHoy.items import NoticiasItem
"""
MEDIO:
Campeche Hoy, Campeche
USO:
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2018 -a month=1 -a day=17
+scrapy crawl noticias --nolog -s filename=2018-01-17.json -a year=2018 -a month=1 -a day=17
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
+TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
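TRASH_RE only appears here as a definition, so its exact use is not visible in this hunk; a hedged reading of the two regexes on a made-up fragment: TAG_RE strips individual tags but keeps their inner text, while TRASH_RE drops a whole "<tag>...</tag>" span (plus a trailing space), e.g. an embedded script or widget.

import re

TAG_RE = re.compile(r'<[^>]+>')
TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')

fragment = '<script>var x = 1;</script> Texto de la nota.'
print(TAG_RE.sub('', fragment))    # -> 'var x = 1; Texto de la nota.'
print(TRASH_RE.sub('', fragment))  # -> 'Texto de la nota.'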
@@ -8,7 +8,13 @@
import scrapy
-class DesdeelbalconItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class DesdeelbalconPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'desdeElBalcon.pipelines.DesdeelbalconPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'desdeElBalcon.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
from datetime import datetime, timedelta, tzinfo
+from desdeElBalcon.items import NoticiasItem
"""
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+MEDIO:
+Desde el Balcon, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -22,16 +26,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -46,21 +40,16 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
-        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if page == 0:
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
...
@@ -8,7 +8,13 @@
import scrapy
-class DiarioyaquiItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class DiarioyaquiPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'diarioYaqui.spiders'
#USER_AGENT = 'diarioYaqui (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'diarioYaqui.pipelines.DiarioyaquiPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'diarioYaqui.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+Diario del Yaqui, Sonora
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -21,16 +27,6 @@ class UTC(tzinfo):
        return 'UTC-7'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -46,21 +42,15 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
-        pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
-        if ( len(pagination) > 0 ):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+
+        pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
+        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            for page in range(1,pages):
+                yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)

    def parse_page(self, response):
...
@@ -8,7 +8,13 @@
import scrapy
-class GrilloportenoItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class GrilloportenoPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'grilloPorteno.spiders'
#USER_AGENT = 'grilloPorteno (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'grilloPorteno.pipelines.GrilloportenoPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'grilloPorteno.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from grilloPorteno.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+El Grillo, Yucatan
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
-    def start_requests(self):
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
-        self.baseURL='http://grilloporteno.com/'+year+'/'+month+'/'+day
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-    def parse(self, response):
-        pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
-            pagination = pagination[-1].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-    def parse_page(self, response):
-        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()
-        d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
-        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
-        if d[-6:] != '-06:00':
-            d = d[:-6] + '-06:00'
-        item['date'] = d
-        item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
-        for paragraph in response.css('div.entry-content').css('p').extract():
-            text += remove_tags(paragraph) + '\n'
-        item['text'] = text
-        item['url'] = response.url
-        #print item['title']
-        yield item
+class QuotesSpider(scrapy.Spider):
+    name = "noticias"
+    def start_requests(self):
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        day = getattr(self, 'day', None)
+        self.baseURL='http://grilloporteno.com/' + year +'/' + month + '/' + day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.css('div.pagination').css('a::attr(href)').extract()
+        if len(pagination) > 0:
+            pagination = pagination[-1].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+    def parse_page(self, response):
+        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        item['title'] = response.xpath('/html/body/div/div[2]/div[4]/div[1]/div[1]/div[2]/h1/text()').extract_first()
+        d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
+        ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
+        if d[-6:] != '-06:00':
+            d = d[:-6] + '-06:00'
+        item['date'] = d
+        item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
+        for paragraph in response.css('div.entry-content').css('p').extract():
+            text += remove_tags(paragraph) + '\n'
+        item['text'] = text
+        item['url'] = response.url
+        #print item['title']
+        yield item
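parse_item above trusts the <time datetime="..."> attribute but forces the offset to Yucatan's UTC-6. A small sketch of that normalization step (sample timestamps only):

def force_utc6(d):
    # replace whatever UTC offset the page reports with '-06:00'
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    return d

print(force_utc6('2017-03-22T10:15:00+00:00'))  # -> 2017-03-22T10:15:00-06:00
print(force_utc6('2017-03-22T10:15:00-06:00'))  # unchanged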
@@ -8,7 +8,13 @@
import scrapy
-class HeraldoagsItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class HeraldoagsPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'heraldoAgs.spiders'
#USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'heraldoAgs.pipelines.HeraldoagsPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'heraldoAgs.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from heraldoAgs.items import NoticiasItem
"""
MEDIO:
El Heraldo, Aguascalientes
USO:
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=5
+scrapy crawl noticias --nolog -s filename=2018-01-05.json -a year=2018 -a month=1 -a day=5
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -15,16 +16,6 @@ LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
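Only the LOC_RE / DAT_RE definitions are visible in this hunk, so their exact use in the spider is not shown; a hedged guess is that they strip a "Ciudad, Edo., 5-enero.- " style dateline from the article body. Applying the regexes exactly as defined to a made-up line:

import re

LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')

text = 'Aguascalientes, Ags., 5-enero.- El gobernador presento el plan.'
print(LOC_RE.sub('', text, 1))   # -> 'El gobernador presento el plan.'
print(DAT_RE.sub('', '5-enero.- El gobernador presento el plan.', 1))
# -> 'El gobernador presento el plan.'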
@@ -8,7 +8,13 @@
import scrapy
-class LajornadaItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LajornadaPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornada.spiders'
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laJornada.pipelines.LajornadaPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laJornada.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
"""
-scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+MEDIO:
+La Jornada, CDMX
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
@@ -117,16 +121,6 @@ class UTC(tzinfo):
        return 'UTC-6'
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    name = "noticias"
...
@@ -8,7 +8,13 @@
import scrapy
-class LajornadaagsItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LajornadaagsPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaAgs.spiders'
#USER_AGENT = 'laJornadaAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laJornadaAgs.pipelines.LajornadaagsPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laJornadaAgs.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re
+from laJornadaAgs.items import NoticiasItem
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+MEDIO:
+La Jornada Aguascalientes, Ags.
+USO:
+scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
-    def start_requests(self):
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
-        self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-    def parse(self, response):
-        pagination = response.css('div.pagination').css('a::attr(href)').extract()
-        if ( len(pagination) > 0 ):
-            pagination = pagination[-1].strip('/')
-            pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0, pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-                else:
-                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
-        else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-    def parse_page(self, response):
-        for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        ## la fecha de la noticia ya incluye la zona horaria
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        item['title'] = response.css('h1.story-title::text').extract_first()
-        item['topic'] = response.css('h3.story-cat::text').extract_first()
-        item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()
-        for paragraph in response.xpath('//div[@id="content-area"]/p').extract():
-            text += remove_tags(paragraph) + '\n'
-        item['text'] = text
-        item['url'] = response.url
-        # print item['title']
-        yield item
+class QuotesSpider(scrapy.Spider):
+    name = "noticias"
+    def start_requests(self):
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        day = getattr(self, 'day', None)
+        self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+    def parse(self, response):
+        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+        pagination = response.css('div.pagination').css('a::attr(href)').extract()
+        if len(pagination) > 0:
+            pagination = pagination[-1].strip('/')
+            pages = int(pagination[pagination.rfind('/')+1:])
+            for page in range(1, pages):
+                yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
+    def parse_page(self, response):
+        for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+    def parse_item(self, response):
+        item = NoticiasItem()
+        text = ''
+        ## la fecha de la noticia ya incluye la zona horaria
+        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item['title'] = response.css('h1.story-title::text').extract_first()
+        item['topic'] = response.css('h3.story-cat::text').extract_first()
+        item['author'] = response.xpath('//div[@id="post-info"]/span/a/text()').extract_first()
+        for paragraph in response.xpath('//div[@id="content-area"]/p').extract():
+            text += remove_tags(paragraph) + '\n'
+        item['text'] = text
+        item['url'] = response.url
+        # print item['title']
+        yield item
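parse_item here (and in the other spiders) builds the article body by stripping tags from each extracted <p> fragment and joining them with newlines. A tiny sketch of that remove_tags / accumulation pattern with made-up paragraphs:

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

paragraphs = ['<p>Primer parrafo.</p>', '<p>Segundo <strong>parrafo</strong>.</p>']
text = ''
for paragraph in paragraphs:
    text += remove_tags(paragraph) + '\n'
print(repr(text))  # -> 'Primer parrafo.\nSegundo parrafo.\n'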
@@ -8,7 +8,13 @@
import scrapy
-class LajornadagroItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()
-class LajornadagroPipeline(object):
    def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+        line = OrderedDict(row)
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
        return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGro.spiders'
#USER_AGENT = 'laJornadaGro (+http://www.yourdomain.com)'
# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'laJornadaGro.pipelines.LajornadagroPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'laJornadaGro.pipelines.JsonWriterPipeline': 300,
+}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
import scrapy, re import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
""" """
MEDIO:
La Jornada Guerrero, Gro.
Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
con url: 'http://www.lajornadaguerrero.com.mx' con url: 'http://www.lajornadaguerrero.com.mx'
Esta version tiene noticias a partir del 2017.08.15 Esta version descarga noticias a partir del 2017.08.15
Uso: Uso:
scrapy crawl noticias --nolog -s filename=2017-09-18.json -a year=2017 -a month=9 -a day=18
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=18
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo): class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)""" """clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para guerrero (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt): def utcoffset(self, dt):
# nombre de la zona horaria # zona horaria para guerrero (centro de mexico): utc-6
return 'UTC-6' return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC() self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.parse_month = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06', self.parse_month = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06',
'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'} 'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'}
# self.baseURL = 'http://www.ljg.com.mx' # self.baseURL = 'http://www.ljg.com.mx'
self.baseURL = 'http://www.lajornadaguerrero.com.mx' self.baseURL = "http://www.lajornadaguerrero.com.mx"
url = '/index.php?option=com_k2&view=itemlist&task=date&year='+year+'&month='+month+'&day='+day+'&Itemid=588' url = "/index.php?option=com_k2&view=itemlist&task=date&year="+year+"&month="+month+"&day="+day+"&Itemid=588"
yield scrapy.Request(url=self.baseURL+url, callback=self.parse) yield scrapy.Request(url=self.baseURL+url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()[:-2] yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
if len(pagination) > 0:
pagination.insert(0, response.url) pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()[:-2]
for page in range(0, len(pagination)): if len(pagination) > 0:
if page == 0: for page in range(0, len(pagination)):
yield scrapy.Request(url=pagination[page], callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=self.baseURL+pagination[page], callback=self.parse_page)
elif page > 0:
yield scrapy.Request(url=self.baseURL+pagination[page], callback=self.parse_page) def parse_page(self, response):
for li in response.xpath('//*[@class="genericItemView"]/div[@class="genericItemHeader"]/h2/a/@href').extract():
else: yield scrapy.Request(url=self.baseURL+li, callback=self.parse_item)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_item(self, response):
def parse_page(self, response): item = NoticiasItem()
for li in response.xpath('//*[@class="genericItemView"]/div[@class="genericItemHeader"]/h2/a/@href').extract(): path_list = ['//*[@class="itemIntroText"]/p', '//*[@class="itemFullText"]/p']
yield scrapy.Request(url=self.baseURL+li, callback=self.parse_item) text = ''
d = response.xpath('//span[@class="itemDateCreated"]/text()').extract_first()
def parse_item(self, response): if d is not None:
print response.url d = d.replace('\n','')
item = NoticiasItem() d = d.replace('\t','')
path_list = ['//*[@class="itemIntroText"]/p', '//*[@class="itemFullText"]/p'] d = d.replace(',','')
text = '' m = d[:d.find(' ')]
d = d.replace(m, self.parse_month[m])
d = response.xpath('//span[@class="itemDateCreated"]/text()').extract_first() # item['date'] = datetime.strptime(d, '%m %d %Y').date()
if d is not None: d = map(int, d.split(' '))
d = d.replace('\n','') item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T')
d = d.replace('\t','')
d = d.replace(',','') title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
m = d[:d.find(' ')] if title is not None:
d = d.replace(m, self.parse_month[m]) title = title.replace('\n','')
# item['date'] = datetime.strptime(d, '%m %d %Y').date() title = title.replace('\t','')
d = map(int, d.split(' ')) title = title.lstrip()
item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T') title = title.rstrip()
item['title'] = title
title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
if title is not None: item['topic'] = response.xpath('//*[@class="itemCategory"]/a/text()').extract_first()
title = title.replace('\n','')
title = title.replace('\t','') for path in path_list:
title = title.lstrip() for p in response.xpath(path).extract():
title = title.rstrip() text += remove_tags(p)
item['title'] = title item['text'] = text
item['topic'] = response.xpath('//*[@class="itemCategory"]/a/text()').extract_first() item['url'] = response.url
for path in path_list: # print item['url']
for p in response.xpath(path).extract(): yield item
text += remove_tags(p)
item['text'] = text
item['url'] = response.url
# print item['url']
yield item
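The date handling in parse_item above is the least obvious step: the scraped itemDateCreated string uses Spanish month abbreviations in month-day-year order, which gets mapped through parse_month and rebuilt as a timezone-aware ISO timestamp. A self-contained sketch of the same conversion; the sample string and helper name are illustrative only:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """fixed UTC-6 offset, as in the spider above"""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

PARSE_MONTH = {'Ene': '01', 'Feb': '02', 'Mar': '03', 'Abr': '04', 'May': '05', 'Jun': '06',
               'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'}

def to_iso(raw):
    # e.g. raw == 'Sep 18, 2017' as scraped from the itemDateCreated span
    raw = raw.replace('\n', '').replace('\t', '').replace(',', '').strip()
    month_name = raw[:raw.find(' ')]
    raw = raw.replace(month_name, PARSE_MONTH[month_name])
    month, day, year = [int(part) for part in raw.split(' ')]
    return datetime(year, month, day, tzinfo=UTC()).isoformat('T')

# to_iso('Sep 18, 2017')  ->  '2017-09-18T00:00:00-06:00'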
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadagroantiguoItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
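The same NoticiasItem definition recurs in every one of these crawler projects: all fields are optional and a spider only fills in what the page actually provides. A minimal usage sketch with placeholder values:

from laJornadaGroAntiguo.items import NoticiasItem

item = NoticiasItem()
item['title'] = 'Titular de ejemplo'
item['date'] = '2017-03-22T00:00:00-06:00'
item['url'] = 'http://www.lajornadaguerrero.com.mx/'
# Fields that are never set (author, location, topic, text) simply stay absent,
# which is why the pipeline below wraps every field access in try/except.
print(dict(item))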
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadagroantiguoPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
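process_item above assembles an OrderedDict one field at a time and swallows the KeyError raised for fields the spider never set. The same effect can be had with a single loop over the known field names; a compact sketch only, the helper name is not part of the commit:

from collections import OrderedDict

FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

def item_to_row(item):
    # Keeps the fixed field order and silently skips unset fields,
    # exactly like the chain of try/except blocks above.
    return OrderedDict((field, item[field]) for field in FIELDS if field in item)

# json.dumps(item_to_row(item)) yields the same record the pipeline writes.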
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGroAntiguo.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGroAntiguo.spiders'
#USER_AGENT = 'laJornadaGroAntiguo (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaGroAntiguo (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True ...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
...@@ -64,9 +64,9 @@ COOKIES_ENABLED = False ...@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaGroAntiguo.pipelines.LajornadagroantiguoPipeline': 300, 'laJornadaGroAntiguo.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
""" """
This version handles downloading the La Jornada Guerrero site This version handles downloading the La Jornada Guerrero site
with url: 'http://www.lajornadaguerrero.com.mx/' with url: 'http://www.lajornadaguerrero.com.mx/'
--> THE PAGE STRUCTURE HAS CHANGED. SEE THE laJornadaGro CRAWLER.
Usage: Usage:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
""" """
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.link_list = [] self.link_list = []
self.year = getattr(self, 'year', None) self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None) self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None) self.day = getattr(self, 'day', None)
self.baseURL='http://www.lajornadaguerrero.com.mx/'+self.year+'/'+self.month.zfill(2)+'/'+self.day.zfill(2) self.baseURL='http://www.lajornadaguerrero.com.mx/'+self.year+'/'+self.month.zfill(2)+'/'+self.day.zfill(2)
urls = [ urls = [
self.baseURL, self.baseURL,
] ]
for url in urls: for url in urls:
yield scrapy.Request(url=url, callback=self.parse) yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
paths = ['//div[@class="viewlet"]/h1/a/@href', '//div[@class="viewlet"]/h2/a/@href', paths = ['//div[@class="viewlet"]/h1/a/@href', '//div[@class="viewlet"]/h2/a/@href',
'//div[@class="viewlet"]/h3/a/@href', '//div[@class="viewlet image"]/h1/a/@href', '//div[@class="viewlet"]/h3/a/@href', '//div[@class="viewlet image"]/h1/a/@href',
'//div[@class="viewlet image"]/h2/a/@href', '//div[@class="viewlet image"]/h3/a/@href', '//div[@class="viewlet image"]/h2/a/@href', '//div[@class="viewlet image"]/h3/a/@href',
'//div[@class="text_block_200"]/p/a/@href'] '//div[@class="text_block_200"]/p/a/@href']
for path in paths: for path in paths:
links = response.xpath(path).extract() links = response.xpath(path).extract()
if ( len(links) > 0 ): if ( len(links) > 0 ):
for link in links: for link in links:
if not ( link in self.link_list ): if not ( link in self.link_list ):
self.link_list.append(link) self.link_list.append(link)
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item) yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract(): for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract():
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page) yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
paths = ['//*[@id="article_list"]/h2/a/@href', paths = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href'] '//*[@id="article_list"]/h3/a/@href']
for path in paths: for path in paths:
for link in response.xpath(path).extract(): for link in response.xpath(path).extract():
if not ( link in self.link_list ): if not ( link in self.link_list ):
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item) yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first() item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract() item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract()
for paragraph in response.xpath('//*[@class="documentContent"]/p/text()').extract(): for paragraph in response.xpath('//*[@class="documentContent"]/p/text()').extract():
text += paragraph text += paragraph
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
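parse above walks several front-page blocks whose headline lists can repeat the same article link, so the spider keeps self.link_list as a crude seen-set before yielding each request. A stripped-down sketch of that bookkeeping with made-up hrefs (a set would be the more usual container, but the list stays tiny here):

link_list = []
scraped_hrefs = ['nota-uno.html', 'nota-dos.html', 'nota-uno.html']

for link in scraped_hrefs:
    if link not in link_list:
        link_list.append(link)
        # the spider would now: yield scrapy.Request(url=baseURL + '/' + link, ...)

# link_list == ['nota-uno.html', 'nota-dos.html']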
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadaoteItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaotePipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
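The writer above emits a JSON array by hand: '[' in open_spider, a ',\n' separator driven by the counter, and ']' in close_spider. Purely as an alternative sketch, not what this commit does, the same file could be produced by buffering the rows and dumping them once; the class name is an assumption:

import json

class BufferedJsonWriterPipeline(object):
    """Assumed alternative, not the committed pipeline."""

    def __init__(self, filename):
        self.filename = filename
        self.rows = []

    @classmethod
    def from_crawler(cls, crawler):
        # same custom "filename" setting, passed with -s filename=...
        return cls(crawler.settings.get('filename'))

    def process_item(self, item, spider):
        self.rows.append(dict(item))
        return item

    def close_spider(self, spider):
        with open(self.filename, 'w') as f:
            json.dump(self.rows, f)

The trade-off is memory: everything is held until close_spider, whereas the committed version streams each record to disk as it arrives.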
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaOte.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaOte.spiders'
#USER_AGENT = 'laJornadaOte (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaOte (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaOte.pipelines.LajornadaotePipeline': 300, 'laJornadaOte.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaOte.items import NoticiasItem
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 OUTLET:
La Jornada de Oriente, Puebla
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item): def start_requests(self):
title = scrapy.Field() year = getattr(self, 'year', None)
text = scrapy.Field() month = getattr(self, 'month', None)
date = scrapy.Field() day = getattr(self, 'day', None)
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
class QuotesSpider(scrapy.Spider): yield scrapy.Request(url=self.baseURL, callback=self.parse)
name = "noticias"
def start_requests(self): def parse(self, response):
year = getattr(self, 'year', None) for link in response.xpath('//div[@class="mas_noticias"]/ul[@class="sollet"]/li/a/@href').extract():
month = getattr(self, 'month', None) yield scrapy.Request(url=link, callback=self.parse_item)
day = getattr(self, 'day', None)
self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
def parse_item(self, response):
yield scrapy.Request(url=self.baseURL, callback=self.parse) item = NoticiasItem()
text = ''
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
def parse(self, response): ## la fecha de la noticia ya incluye la zona horaria
for link in response.xpath('//div[@class="mas_noticias"]/ul[@class="sollet"]/li/a/@href').extract(): item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
yield scrapy.Request(url=link, callback=self.parse_item) item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
def parse_item(self, response): text += remove_tags(paragraph) + '\n'
item = NoticiasItem() item['text'] = text
text = '' item['url'] = response.url
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
## la fecha de la noticia ya incluye la zona horaria # print item['title']
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() yield item
item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadasanluisItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadasanluisPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders'
#USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaSanLuis.pipelines.LajornadasanluisPipeline': 300, 'laJornadaSanLuis.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 OUTLET:
La Jornada de San Luis, San Luis Potosi
Usage:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class QuotesSpider(scrapy.Spider):
title = scrapy.Field() name = "noticias"
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
class QuotesSpider(scrapy.Spider): self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
name = "noticias"
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None) def parse(self, response):
day = getattr(self, 'day', None) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
pagination = response.xpath('//div[@class="pages"]/a/@href').extract()
yield scrapy.Request(url=self.baseURL, callback=self.parse) if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
def parse(self, response): for page in range(1, pages):
pagination = response.xpath('//div[@class="pages"]/a/@href').extract() yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) def parse_page(self, response):
for page in range(0,pages): for link in response.xpath('//*[@class="post-title"]/h2/a/@href').extract():
if ( page == 0 ): yield scrapy.Request(url=link, callback=self.parse_item)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page) def parse_item(self, response):
else: item = NoticiasItem()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) text = ''
## la fecha de la noticia ya incluye la zona horaria
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
def parse_page(self, response): if d is None:
for link in response.xpath('//*[@class="post-title"]/h2/a/@href').extract(): d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
yield scrapy.Request(url=link, callback=self.parse_item) item['date'] = d
item['title'] = response.css('h1.entry-title::text').extract_first()
def parse_item(self, response):
item = NoticiasItem() item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
text = ''
## la fecha de la noticia ya incluye la zona horaria for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() text += remove_tags(paragraph) + '\n'
if d is None: item['text'] = text
d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first() item['url'] = response.url
item['date'] = d # print item['title']
yield item
item['title'] = response.css('h1.entry-title::text').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
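parse above derives the number of archive pages from the last pagination link: strip the trailing slash and read the number after the final '/'. A tiny sketch with an assumed href:

last_href = 'http://lajornadasanluis.com.mx/2017/03/22/page/7/'

trimmed = last_href.strip('/')
pages = int(trimmed[trimmed.rfind('/') + 1:])
# pages == 7; the spider then requests /page/2 through /page/7 via parse_page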
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadaverItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaverPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaVer.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaVer.spiders'
#USER_AGENT = 'laJornadaVer (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaVer (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaVer.pipelines.LajornadaverPipeline': 300, 'laJornadaVer.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 """
OUTLET:
La Jornada de Veracruz, Ver.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo): class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)""" """clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para veracruz (centro de mexico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
def utcoffset(self, dt):
# zona horaria para veracruz (centro de mexico): utc-6
return timedelta(hours=-6)
class NoticiasItem(scrapy.Item): def tzname(self, dt):
title = scrapy.Field() # nombre de la zona horaria
text = scrapy.Field() return 'UTC-6'
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
tz = UTC() tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T') self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
yield scrapy.Request(url=self.builtURL, callback=self.parse)
yield scrapy.Request(url=self.builtURL, callback=self.parse)
def parse(self, response):
paths = ['//*[@class="article-header"]/h2/a/@href', def parse(self, response):
'//ul[@class="article-array content-category"]/li/a/@href'] paths = ['//*[@class="article-header"]/h2/a/@href',
for path in paths: '//ul[@class="article-array content-category"]/li/a/@href']
links = response.xpath(path).extract()
if ( len(links) > 0 ): for path in paths:
for link in links: links = response.xpath(path).extract()
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item) if len(links) > 0:
for link in links:
# for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract(): yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
# for link in response.xpath('//*[@class="text_block_200"]/h2/a/@href').extract():
# yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_page)
def parse_page(self, response):
paths = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href'] def parse_page(self, response):
for path in paths: paths = ['//*[@id="article_list"]/h2/a/@href',
for link in response.xpath(path).extract(): '//*[@id="article_list"]/h3/a/@href']
if not ( link in self.link_list ):
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item) for path in paths:
for link in response.xpath(path).extract():
if not link in self.link_list:
def parse_item(self, response): yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
item = NoticiasItem()
text = ''
item['date'] = self.date def parse_item(self, response):
item = NoticiasItem()
title = response.xpath('//h2[@class="article-title"]/text()').extract_first() text = ''
title = title.replace('\r','') item['date'] = self.date
title = title.replace('\n','')
title = title.lstrip(' ') title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
title = title.rstrip(' ') title = title.replace('\r','')
item['title'] = title title = title.replace('\n','')
title = title.lstrip(' ')
topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first() title = title.rstrip(' ')
topic = topic.replace('\r','') item['title'] = title
topic = topic.replace('\n','')
topic = topic.lstrip(' ') topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
topic = topic.rstrip(' ') topic = topic.replace('\r','')
item['topic'] = topic topic = topic.replace('\n','')
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first() topic = topic.lstrip(' ')
topic = topic.rstrip(' ')
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract() item['topic'] = topic
if ( len(paragraph) > 0 ): # item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
for p in paragraph:
text += paragraph[3] paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
else: if len(paragraph) > 0:
for p in response.xpath('//*[@class="shortcode-content"]').extract(): for p in paragraph:
text += remove_tags(p) + '\n' text += p
else:
item['text'] = text for p in response.xpath('//*[@class="shortcode-content"]').extract():
item['url'] = response.url text += remove_tags(p) + '\n'
# print item['title']
yield item item['text'] = text
item['url'] = response.url
# print item['title']
yield item
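Unlike the other editions, this site is crawled through an Archive.aspx endpoint whose date query is day/month/year with zero padding, as built in start_requests above. Illustrative values only:

year, month, day = '2017', '3', '22'

base_url = 'http://www.jornadaveracruz.com.mx/'
built_url = base_url + 'Archive.aspx?date=' + day.zfill(2) + '/' + month.zfill(2) + '/' + year
# built_url == 'http://www.jornadaveracruz.com.mx/Archive.aspx?date=22/03/2017'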
...@@ -8,7 +8,13 @@ ...@@ -8,7 +8,13 @@
import scrapy import scrapy
class LajornadazacItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
pass title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadazacPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaZac.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaZac.spiders'
#USER_AGENT = 'laJornadaZac (+http://www.yourdomain.com)' #USER_AGENT = 'laJornadaZac (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True ...@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default) # Disable cookies (enabled by default)
#COOKIES_ENABLED = False COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default) # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False #TELNETCONSOLE_ENABLED = False
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'laJornadaZac.pipelines.LajornadazacPipeline': 300, 'laJornadaZac.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
from laJornadaZac.items import NoticiasItem
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 OUTLET:
La Jornada Zacatecas, Zac.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
class NoticiasItem(scrapy.Item): def start_requests(self):
title = scrapy.Field() year = getattr(self, 'year', None)
text = scrapy.Field() month = getattr(self, 'month', None)
date = scrapy.Field() day = getattr(self, 'day', None)
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
self.baseURL = "http://ljz.mx/" + year + "/" + month + "/" + day
class QuotesSpider(scrapy.Spider): yield scrapy.Request(url=self.baseURL, callback=self.parse)
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None) def parse(self, response):
month = getattr(self, 'month', None) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
day = getattr(self, 'day', None)
self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if len(pagination) > 0:
yield scrapy.Request(url=self.baseURL, callback=self.parse) pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
def parse(self, response): for page in range(1,pages):
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract() yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) def parse_page(self, response):
for page in range(0,pages): for link in response.xpath('//h2[@class="cat-list-title"]/a/@href').extract():
if ( page == 0 ): yield scrapy.Request(url=link, callback=self.parse_item)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page) def parse_item(self, response):
else: item = NoticiasItem()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) text = ''
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
def parse_page(self, response): ## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico)
for link in response.xpath('//h2[@class="cat-list-title"]/a/@href').extract(): if d[-6:] != '-06:00':
yield scrapy.Request(url=link, callback=self.parse_item) d = d[:-6] + '-06:00'
item['date'] = d
def parse_item(self, response): item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item = NoticiasItem() item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
text = ''
content = response.xpath('//*[@class="entry-content clearfix"]/p').extract()
d = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first() if ( len(content) == 0 ):
## '-06:00' corresponde al UTC-6, zona horaria de zacatecas (centro de mexico) content = response.xpath('//*[@class="entry-content clearfix"]/div/p').extract()
if d[-6:] != '-06:00': if ( len(content) == 0 ):
d = d[:-6] + '-06:00' content = response.xpath('//*[@class="entry-content clearfix"]/div').extract()
item['date'] = d if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div/div/p').extract()
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first() for paragraph in content:
text += remove_tags(paragraph) + '\n'
content = response.xpath('//*[@class="entry-content clearfix"]/p').extract() item['text'] = text
if ( len(content) == 0 ): item['url'] = response.url
content = response.xpath('//*[@class="entry-content clearfix"]/div/p').extract() # print item['title']
if ( len(content) == 0 ): yield item
content = response.xpath('//*[@class="entry-content clearfix"]/div').extract()
if ( len(content) == 0 ):
content = response.xpath('//*[@class="entry-content clearfix"]/div/div/p').extract()
for paragraph in content:
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
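parse_item above forces every scraped timestamp to carry the -06:00 offset of Zacatecas (central Mexico) by overwriting the last six characters whenever they differ. The same check as a standalone function with invented timestamps:

def force_central_mexico_offset(d):
    # mirrors the in-spider fix-up: keep the local part, pin the offset to UTC-6
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    return d

force_central_mexico_offset('2017-03-22T10:15:00+00:00')   # -> '2017-03-22T10:15:00-06:00'
force_central_mexico_offset('2017-03-22T10:15:00-06:00')   # unchanged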
...@@ -5,7 +5,71 @@ ...@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def __init__(self, filename):
self.filename = filename
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class MipuntodevistaPipeline(object):
def process_item(self, item, spider): def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item return item
...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'miPuntoDeVista.spiders' ...@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'miPuntoDeVista.spiders'
#USER_AGENT = 'miPuntoDeVista (+http://www.yourdomain.com)' #USER_AGENT = 'miPuntoDeVista (+http://www.yourdomain.com)'
# Obey robots.txt rules # Obey robots.txt rules
ROBOTSTXT_OBEY = True # ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16) # Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 #CONCURRENT_REQUESTS = 32
...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True ...@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0) # Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs # See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3 DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of: # The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16 #CONCURRENT_REQUESTS_PER_IP = 16
...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True ...@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines # Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = { ITEM_PIPELINES = {
# 'miPuntoDeVista.pipelines.MipuntodevistaPipeline': 300, 'miPuntoDeVista.pipelines.JsonWriterPipeline': 300,
#} }
# Enable and configure the AutoThrottle extension (disabled by default) # Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re import scrapy, re
"""
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 USAGE:
scrapy crawl noticias -t json --nolog -s filename=2018-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL='http://www.mipuntodevista.com.mx/'+year+'/'+month+'/'+day
urls = [ self.baseURL = 'http://www.mipuntodevista.com.mx/' + year + '/' + month + '/' + day
self.baseURL,
] yield scrapy.Request(url=self.baseURL, callback=self.parse)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse(self, response):
pagination = response.css('div.page-nav').css('a::attr(href)').extract() pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ): if len(pagination) > 0:
pagination = pagination[-2].strip('/') pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages): for page in range(1, pages):
if ( page == 0 ): yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) def parse_page(self, response):
else: for link in response.css('h3.entry-title').css('a::attr(href)').extract():
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_page(self, response): def parse_item(self, response):
for link in response.css('h3.entry-title').css('a::attr(href)').extract(): item = NoticiasItem()
yield scrapy.Request(url=link, callback=self.parse_item) text = ''
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first()
def parse_item(self, response):
item = NoticiasItem() d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
text = '' ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
if d[-6:] != '-06:00':
item['title'] = response.css('div.td-post-header').css('h1.entry-title::text').extract_first() d = d[:-6] + '-06:00'
item['date'] = d
d = response.css('div.td-post-header').css('span.td-post-date').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico) author = response.xpath('//*[@class="td-post-author-name"]/a/text()').extract_first()
if d[-6:] != '-06:00': if author is not None: item['author'] = author
d = d[:-6] + '-06:00' try:
item['date'] = d item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()[1]
except:
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first() item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract(): for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + "\n"
item['text'] = text item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
# print item['title']
yield item
yield item
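The try/except around the category lookup in parse_item above prefers the second entry of the td-category list and falls back to the first (or None) when the list is shorter. The same logic as a plain function; the name and sample lists are assumptions:

def pick_topic(categories):
    # categories is what extract() returns for the td-category list items
    if len(categories) > 1:
        return categories[1]
    return categories[0] if categories else None

pick_topic(['Portada', 'Yucatan'])   # -> 'Yucatan'
pick_topic(['Portada'])              # -> 'Portada'
pick_topic([])                       # -> None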