Commit ea874dde authored by Renán Sosa Guillen

crawlers

parent 87ed4374
@@ -8,7 +8,13 @@
 import scrapy


-class DiarioyucatanItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
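Note: NoticiasItem is defined in each project's items.py and imported by its spider. For reference, a minimal sketch of how a spider callback could populate and yield it; the start URL and CSS selectors below are illustrative assumptions, not the extraction logic these spiders actually use.

    # Hypothetical spider showing how NoticiasItem is filled (URL and selectors are assumptions)
    import scrapy
    from diarioYucatan.items import NoticiasItem

    class ExampleSpider(scrapy.Spider):
        name = "noticias_example"
        start_urls = ["http://www.yucatan.com.mx/seccion/yucatan"]

        def parse(self, response):
            item = NoticiasItem()
            item['title'] = response.css('h1::text').extract_first()
            item['text'] = ' '.join(response.css('p::text').extract())
            item['date'] = response.css('time::attr(datetime)').extract_first()
            item['url'] = response.url
            yield item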
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class DiarioyucatanPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
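Note: the try/except blocks above simply skip fields that a given article never populated. A more compact equivalent (a sketch, not part of this commit) would iterate over the field names, since scrapy.Item supports membership tests on populated fields:

    from collections import OrderedDict

    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def item_to_row(item):
        # keep only the fields that were actually set, in the order listed above
        return OrderedDict((key, item[key]) for key in FIELDS if key in item)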
@@ -27,7 +27,7 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'diarioYucatan.pipelines.DiarioyucatanPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'diarioYucatan.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
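Note: with ITEM_PIPELINES enabled, JsonWriterPipeline runs on every crawl and reads its output path from the filename setting (scrapy crawl noticias --nolog -s filename=noticias.json). The same crawl can also be started programmatically; a sketch, assuming it is run from inside the Scrapy project so get_project_settings() can locate settings.py:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('filename', 'noticias.json')  # what JsonWriterPipeline.from_crawler() reads

    process = CrawlerProcess(settings)
    process.crawl('noticias')  # spider looked up by name from the project
    process.start()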
+# -*- coding: utf-8 -*-
 import scrapy, re
+from diarioYucatan.items import NoticiasItem
 # from datetime import datetime, date, timedelta
 # from scrapy.spidermiddlewares.httperror import HttpError

 """
+MEDIO:
+Diario de Yucatán, Yuc.
+
 Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
 de una fecha especifica.

 USO:
-scrapy crawl noticias -t json --nolog -o noticias.json
+scrapy crawl noticias --nolog -s filename=noticias.json
 """

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
@@ -46,7 +37,7 @@ class QuotesSpider(scrapy.Spider):
         # 'multimedia', 'multimedia/fotos', 'multimedia/videos']
         self.globalLinkSet = set()

-        self.baseURL = 'http://www.yucatan.com.mx/seccion/'
+        self.baseURL = "http://www.yucatan.com.mx/seccion/"
         self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
                               'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
...
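Note: parsing_month maps Spanish month names to month numbers. A sketch of how such a map is typically applied to a scraped date string; the exact text format used on the site is an assumption:

    import re
    from datetime import date

    parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
                     'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

    def parse_spanish_date(text):
        # e.g. "20 de diciembre de 2017" -> date(2017, 12, 20)
        day, month, year = re.match(r'(\d{1,2}) de (\w+) de (\d{4})', text).groups()
        return date(int(year), parsing_month[month.lower()], int(day))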
@@ -8,7 +8,13 @@
 import scrapy


-class ElfinancieroItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class ElfinancieroPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'elFinanciero.spiders'
 #USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'elFinanciero.pipelines.ElfinancieroPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'elFinanciero.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
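Note on the ROBOTSTXT_OBEY change above: commenting it out drops the project back to Scrapy's library default, which is False, so robots.txt is no longer consulted. A quick check:

    # Scrapy's built-in default (the project template is what set it to True)
    from scrapy.settings.default_settings import ROBOTSTXT_OBEY
    print(ROBOTSTXT_OBEY)  # False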
+# -*- coding: utf-8 -*-
 import scrapy, re
+from elFinanciero.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json
-'''
+"""
+MEDIO:
+El Financiero, CDMX
+
+USO:
+scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
@@ -23,16 +27,6 @@ class UTC(tzinfo):
         return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
@@ -42,7 +36,7 @@ class QuotesSpider(scrapy.Spider):
                               'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                               'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

-        self.baseURL = 'http://www.elfinanciero.com.mx/rss'
+        self.baseURL = "http://www.elfinanciero.com.mx/rss"

         yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
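Note: the spider's UTC class (only its tzname, 'UTC-6', is visible in this hunk) is a fixed-offset tzinfo for central Mexico time, used to stamp the date field. A self-contained sketch of what such a class looks like; the collapsed method bodies are assumptions:

    from datetime import datetime, timedelta, tzinfo

    class UTC(tzinfo):
        """Fixed UTC-6 offset, no DST handling."""
        def utcoffset(self, dt):
            return timedelta(hours=-6)
        def tzname(self, dt):
            return 'UTC-6'
        def dst(self, dt):
            return timedelta(0)

    print(datetime(2017, 12, 20, tzinfo=UTC()).isoformat())  # 2017-12-20T00:00:00-06:00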
@@ -8,7 +8,13 @@
 import scrapy


-class EluniversalItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class EluniversalPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'elUniversal.spiders'
 #USER_AGENT = 'elUniversal (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'elUniversal.pipelines.EluniversalPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'elUniversal.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from elUniversal.items import NoticiasItem

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json
-'''
+"""
+MEDIO:
+El Universal, CDMX
+
+USO:
+scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""

 TAG_RE = re.compile(r'<[^>]+>')
@@ -10,16 +15,6 @@ def remove_tags(text):
     return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
...
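Note: remove_tags() strips any markup left in the fragments the spider extracts. A one-line check of the behaviour:

    import re

    TAG_RE = re.compile(r'<[^>]+>')

    def remove_tags(text):
        return TAG_RE.sub('', text)

    print(remove_tags('<p>Texto con <b>etiquetas</b></p>'))  # Texto con etiquetas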
@@ -8,7 +8,13 @@
 import scrapy


-class SoldemexItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class SoldemexPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
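Note: the counter logic above writes the output as one growing JSON array: "[" on open_spider, the first object as-is, ",\n" before every later object, and "]" on close_spider. A standalone illustration of that framing (field values are placeholders):

    import json
    from collections import OrderedDict

    records = [OrderedDict([("title", "Nota 1")]), OrderedDict([("title", "Nota 2")])]
    out = "[" + ",\n".join(json.dumps(r) for r in records) + "]"
    print(out)
    # [{"title": "Nota 1"},
    # {"title": "Nota 2"}]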
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'solDeMex.spiders'
 #USER_AGENT = 'solDeMex (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'solDeMex.pipelines.SoldemexPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'solDeMex.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from solDeMex.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json
-'''
+"""
+MEDIO:
+El Sol de México, CDMX
+
+USO:
+scrapy crawl noticias --nolog -s filename=2018-01-20.json
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
@@ -23,16 +27,6 @@ class UTC(tzinfo):
         return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
...