Commit 5953d008 authored by Renán Sosa Guillen

crawlers

parent ba65cc37
......@@ -8,7 +8,13 @@
import scrapy
class AlchileItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
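# For context (annotation, not part of the commit): "filename" is not a built-in
# Scrapy setting; it only exists in crawler.settings because each run overrides it
# on the command line, which from_crawler() reads and hands to __init__ above:
#
#   scrapy crawl noticias --nolog -s filename=2017-03-22.json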
class AlchilePipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
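A standalone sanity sketch (assuming two scraped items) of the output framing: "[" from open_spider, the first record bare, later records prefixed with ",\n", and "]" from close_spider concatenate into one valid JSON array:

    import json

    parts = ['[']
    for i, line in enumerate(({'t': 1}, {'t': 2}), start=1):
        parts.append(json.dumps(line) if i == 1 else ',\n' + json.dumps(line))
    parts.append(']')
    assert json.loads(''.join(parts)) == [{'t': 1}, {'t': 2}]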
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'alChile.spiders'
#USER_AGENT = 'alChile (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'alChile.pipelines.AlchilePipeline': 300,
#}
ITEM_PIPELINES = {
'alChile.pipelines.JsonWriterPipeline': 300,
}
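The 300 is the pipeline's order slot (0-1000, lower runs first), so a later stage could be chained behind the writer; the second entry below is hypothetical:

    ITEM_PIPELINES = {
        'alChile.pipelines.JsonWriterPipeline': 300,
        # 'alChile.pipelines.SomeLaterPipeline': 400,  # hypothetical example
    }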
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from alChile.items import NoticiasItem
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
OUTLET:
Al Chile, Yucatan
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -9,16 +13,6 @@ def remove_tags(text):
return TAG_RE.sub('', text)
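A quick illustration (annotation only) of remove_tags on a typical markup fragment:

    import re

    TAG_RE = re.compile(r'<[^>]+>')
    assert TAG_RE.sub('', '<p>Hola <b>mundo</b></p>') == 'Hola mundo'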
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,26 +20,22 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://alchile.com.mx/'+year+'/'+month+'/'+day
self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.page-nav').css('a.last::attr(href)').extract()
if ( len(pagination) > 0 ):
pagination = pagination[0].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
for page in range(1,pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
......
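Two notes on the refactor above (annotation, not part of the commit). First, the page count is recovered from the tail of a pagination href:

    url = 'http://alchile.com.mx/2017/03/22/page/7/'  # illustrative href
    trimmed = url.strip('/')                          # drop the trailing slash
    pages = int(trimmed[trimmed.rfind('/') + 1:])     # digits after the last '/'
    assert pages == 7

Second, yielding response.url up front and then looping over range(1, pages) requests exactly the URLs of the old range(0, pages) loop with its page == 0 special case:

    pages = 4
    old = ['base'] + ['base/page/%d' % (p + 1) for p in range(0, pages) if p != 0]
    new = ['base'] + ['base/page/%d' % (p + 1) for p in range(1, pages)]
    assert old == new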
......@@ -8,7 +8,13 @@
import scrapy
class CampechehoyItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class CampechehoyPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'campecheHoy.pipelines.CampechehoyPipeline': 300,
#}
ITEM_PIPELINES = {
'campecheHoy.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from campecheHoy.items import NoticiasItem
"""
OUTLET:
Campeche Hoy, Campeche
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2018 -a month=1 -a day=17
scrapy crawl noticias --nolog -s filename=2018-01-17.json -a year=2018 -a month=1 -a day=17
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')
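TRASH_RE matches a whole tag-delimited span, so an embedded element and its contents can be dropped wholesale before TAG_RE strips the remaining tags (illustrative input):

    import re

    TRASH_RE = re.compile(r'<.*?>.*</.*?>\s?')
    assert TRASH_RE.sub('', '<script>var x;</script>Body text') == 'Body text'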
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class DesdeelbalconItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class DesdeelbalconPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'desdeElBalcon.spiders'
#USER_AGENT = 'desdeElBalcon (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'desdeElBalcon.pipelines.DesdeelbalconPipeline': 300,
#}
ITEM_PIPELINES = {
'desdeElBalcon.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from datetime import datetime, timedelta, tzinfo
from desdeElBalcon.items import NoticiasItem
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
OUTLET:
Desde el Balcon, Yucatan
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -22,16 +26,6 @@ class UTC(tzinfo):
return 'UTC-6'
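# The UTC helper appears only as a fragment here; a minimal sketch of a
# fixed-offset tzinfo consistent with the 'UTC-6' name above (an assumption
# about the truncated class):
#
#   from datetime import timedelta, tzinfo
#
#   class UTC(tzinfo):
#       def utcoffset(self, dt): return timedelta(hours=-6)
#       def tzname(self, dt): return 'UTC-6'
#       def dst(self, dt): return timedelta()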
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -46,21 +40,16 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if page == 0:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
......
......@@ -8,7 +8,13 @@
import scrapy
class DiarioyaquiItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class DiarioyaquiPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'diarioYaqui.spiders'
#USER_AGENT = 'diarioYaqui (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'diarioYaqui.pipelines.DiarioyaquiPipeline': 300,
#}
ITEM_PIPELINES = {
'diarioYaqui.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from diarioYaqui.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
OUTLET:
Diario del Yaqui, Sonora
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -21,16 +27,6 @@ class UTC(tzinfo):
return 'UTC-7'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -46,22 +42,16 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
if ( len(pagination) > 0 ):
pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1,pages):
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//h2[@class="entry-title"]/a/@href').extract():
......
......@@ -8,7 +8,13 @@
import scrapy
class GrilloportenoItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class GrilloportenoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'grilloPorteno.spiders'
#USER_AGENT = 'grilloPorteno (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'grilloPorteno.pipelines.GrilloportenoPipeline': 300,
#}
ITEM_PIPELINES = {
'grilloPorteno.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from grilloPorteno.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
OUTLET:
El Grillo, Yucatan
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,26 +20,23 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://grilloporteno.com/'+year+'/'+month+'/'+day
self.baseURL='http://grilloporteno.com/' + year +'/' + month + '/' + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
......
......@@ -8,7 +8,13 @@
import scrapy
class HeraldoagsItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class HeraldoagsPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'heraldoAgs.spiders'
#USER_AGENT = 'heraldoAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'heraldoAgs.pipelines.HeraldoagsPipeline': 300,
#}
ITEM_PIPELINES = {
'heraldoAgs.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from heraldoAgs.items import NoticiasItem
"""
OUTLET:
El Heraldo, Aguascalientes
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=5
scrapy crawl noticias --nolog -s filename=2018-01-05.json -a year=2018 -a month=1 -a day=5
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -15,16 +16,6 @@ LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
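LOC_RE and DAT_RE strip the dateline prefix that article bodies open with (the input below is an assumed, illustrative format):

    import re

    LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
    assert LOC_RE.sub('', 'Aguascalientes, Ags., 5-enero .- Body text', count=1) == 'Body text'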
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadaItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornada.spiders'
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornada.pipelines.LajornadaPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
OUTLET:
La Jornada, CDMX
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -117,16 +121,6 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadaagsItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaagsPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaAgs.spiders'
#USER_AGENT = 'laJornadaAgs (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaAgs.pipelines.LajornadaagsPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaAgs.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaAgs.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
OUTLET:
La Jornada Aguascalientes, Ags.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,28 +20,23 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.lja.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
if ( len(pagination) > 0 ):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//li[@class="infinite-post"]/a/@href').extract():
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadagroItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadagroPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGro.spiders'
#USER_AGENT = 'laJornadaGro (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaGro.pipelines.LajornadagroPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaGro.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaGro.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
OUTLET:
La Jornada Guerrero, Gro.
This version handles downloading the new La Jornada Guerrero site
at url: 'http://www.lajornadaguerrero.com.mx'
This version has news starting from 2017.08.15
This version downloads news starting from 2017.08.15
Usage:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=18
scrapy crawl noticias --nolog -s filename=2017-09-18.json -a year=2017 -a month=9 -a day=18
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -28,15 +29,6 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -50,26 +42,20 @@ class QuotesSpider(scrapy.Spider):
'Jul': '07', 'Ago': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dic': '12'}
# self.baseURL = 'http://www.ljg.com.mx'
self.baseURL = 'http://www.lajornadaguerrero.com.mx'
url = '/index.php?option=com_k2&view=itemlist&task=date&year='+year+'&month='+month+'&day='+day+'&Itemid=588'
self.baseURL = "http://www.lajornadaguerrero.com.mx"
url = "/index.php?option=com_k2&view=itemlist&task=date&year="+year+"&month="+month+"&day="+day+"&Itemid=588"
yield scrapy.Request(url=self.baseURL+url, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()[:-2]
if len(pagination) > 0:
pagination.insert(0, response.url)
for page in range(0, len(pagination)):
if page == 0:
yield scrapy.Request(url=pagination[page], callback=self.parse_page, dont_filter=True)
elif page > 0:
yield scrapy.Request(url=self.baseURL+pagination[page], callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for li in response.xpath('//*[@class="genericItemView"]/div[@class="genericItemHeader"]/h2/a/@href').extract():
......@@ -77,7 +63,6 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
print response.url
item = NoticiasItem()
path_list = ['//*[@class="itemIntroText"]/p', '//*[@class="itemFullText"]/p']
text = ''
......
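A compact sketch of the pagination walk above (hrefs illustrative): the current page is prepended so index 0 is requested as-is, and every later entry is resolved against baseURL:

    baseURL = 'http://www.lajornadaguerrero.com.mx'
    pagination = ['/index.php?start=10', '/index.php?start=20']  # illustrative hrefs
    pagination.insert(0, baseURL + '/index.php')                 # stands in for response.url
    urls = [pagination[0]] + [baseURL + p for p in pagination[1:]]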
......@@ -8,7 +8,13 @@
import scrapy
class LajornadagroantiguoItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadagroantiguoPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaGroAntiguo.spiders'
#USER_AGENT = 'laJornadaGroAntiguo (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaGroAntiguo.pipelines.LajornadagroantiguoPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaGroAntiguo.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaGroAntiguo.items import NoticiasItem
"""
This version handles downloading the La Jornada Guerrero site
at url: 'http://www.lajornadaguerrero.com.mx/'
--> THE SITE STRUCTURE HAS CHANGED. SEE THE laJornadaGro CRAWLER.
Usage:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
......@@ -14,16 +14,6 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadaoteItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaotePipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaOte.spiders'
#USER_AGENT = 'laJornadaOte (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaOte.pipelines.LajornadaotePipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaOte.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaOte.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
OUTLET:
La Jornada de Oriente, Puebla
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,6 +20,7 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.lajornadadeoriente.com.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadasanluisItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadasanluisPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders'
#USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaSanLuis.pipelines.LajornadasanluisPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaSanLuis.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaSanLuis.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
OUTLET:
La Jornada de San Luis, San Luis Potosi
Usage:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,23 +20,21 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://lajornadasanluis.com.mx/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//div[@class="pages"]/a/@href').extract()
if ( len(pagination) > 0 ):
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadaverItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadaverPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
# Collect whichever of the item's fields are present, keeping a fixed output order.
for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaVer.spiders'
#USER_AGENT = 'laJornadaVer (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaVer.pipelines.LajornadaverPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaVer.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaVer.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
OUTLET:
La Jornada de Veracruz, Ver.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -21,16 +26,6 @@ class UTC(tzinfo):
return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -40,6 +35,7 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
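# Annotation: zfill keeps the Archive.aspx date zero-padded (a dd/MM/yyyy
# format, per the concatenation above):
#
#   day, month, year = '3', '9', '2017'
#   assert day.zfill(2) + '/' + month.zfill(2) + '/' + year == '03/09/2017'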
......@@ -49,9 +45,10 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
paths = ['//*[@class="article-header"]/h2/a/@href',
'//ul[@class="article-array content-category"]/li/a/@href']
for path in paths:
links = response.xpath(path).extract()
if ( len(links) > 0 ):
if len(links) > 0:
for link in links:
yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
......@@ -62,9 +59,10 @@ class QuotesSpider(scrapy.Spider):
def parse_page(self, response):
paths = ['//*[@id="article_list"]/h2/a/@href',
'//*[@id="article_list"]/h3/a/@href']
for path in paths:
for link in response.xpath(path).extract():
if not ( link in self.link_list ):
if not link in self.link_list:
yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item)
......@@ -89,7 +87,7 @@ class QuotesSpider(scrapy.Spider):
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
if ( len(paragraph) > 0 ):
if len(paragraph) > 0:
for p in paragraph:
text += p
else:
......
......@@ -8,7 +8,13 @@
import scrapy
class LajornadazacItem(scrapy.Item):
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class LajornadazacPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
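from_crawler is the piece that connects the -s flag in the USO lines to the pipeline: anything passed as -s NAME=value on the command line lands in crawler.settings, so the run command and the pipeline line up like this (file name taken from the docstring example):

scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
# crawler.settings.get('filename') -> '2017-03-22.json'
# JsonWriterPipeline('2017-03-22.json') then opens that file in open_spider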
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'laJornadaZac.spiders'
#USER_AGENT = 'laJornadaZac (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laJornadaZac.pipelines.LajornadazacPipeline': 300,
#}
ITEM_PIPELINES = {
'laJornadaZac.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
from laJornadaZac.items import NoticiasItem
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
MEDIO:
La Jornada Zacatecas, Zac.
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
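TAG_RE removes anything shaped like an HTML tag, which is enough for the markup these spiders feed it; a quick check of the helper:

>>> import re
>>> TAG_RE = re.compile(r'<[^>]+>')
>>> TAG_RE.sub('', '<p>Nota de <b>prueba</b></p>')
'Nota de prueba'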
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day
self.baseURL = "http://ljz.mx/" + year + "/" + month + "/" + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if ( len(pagination) > 0 ):
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1,pages):
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
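The page count is recovered from the last pagination link: strip the trailing slash, then take everything after the final '/'. Assuming the archive's last pagination href looks like the ljz.mx URLs built above:

>>> pagination = 'http://ljz.mx/2017/03/22/page/7/'
>>> pagination = pagination.strip('/')
>>> int(pagination[pagination.rfind('/')+1:])
7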
def parse_page(self, response):
......
......@@ -5,7 +5,71 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def __init__(self, filename):
self.filename = filename
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
class MipuntodevistaPipeline(object):
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
......@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'miPuntoDeVista.spiders'
#USER_AGENT = 'miPuntoDeVista (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
......@@ -27,7 +27,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
......@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'miPuntoDeVista.pipelines.MipuntodevistaPipeline': 300,
#}
ITEM_PIPELINES = {
'miPuntoDeVista.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
......
import scrapy, re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
USO:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -26,25 +27,20 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://www.mipuntodevista.com.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
self.baseURL = 'http://www.mipuntodevista.com.mx/' + year + '/' + month + '/' + day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.page-nav').css('a::attr(href)').extract()
if ( len(pagination) > 0 ):
if len(pagination) > 0:
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
......@@ -65,13 +61,16 @@ class QuotesSpider(scrapy.Spider):
d = d[:-6] + '-06:00'
item['date'] = d
author = response.xpath('//*[@class="td-post-author-name"]/a/text()').extract_first()
if author is not None: item['author'] = author
try:
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()[1]
except:
item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
for paragraph in response.css('div.td-post-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
text += remove_tags(paragraph) + "\n"
item['text'] = text.strip()
item['url'] = response.url
# print item['title']
yield item
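The d = d[:-6] + '-06:00' fix-up earlier in parse_item rewrites the last six characters of the extracted ISO-8601 date so the offset is pinned to UTC-6 regardless of what the page reported; with an assumed input timestamp:

>>> d = '2017-03-22T10:15:00+00:00'
>>> d[:-6] + '-06:00'
'2017-03-22T10:15:00-06:00'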