elFinanciero

11d4dc11 · Mario Chirinos Colunga · ffa4478a · 11d4dc11 · 11d4dc11 · 11d4dc11
Commit 11d4dc11 authored Jan 25, 2019 by Mario Chirinos Colunga 💬
20 changed files
--- a/crawler_script/download_backwards.py
+++ b/crawler_script/download_backwards.py
@@ -12,7 +12,7 @@ import datetime


 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)

--- a/crawler_script/download_by_day.py
+++ b/crawler_script/download_by_day.py
@@ -12,7 +12,7 @@ import datetime


 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:    
    siteList = json.load(data_file)

--- a/crawler_script/download_often.py
+++ b/crawler_script/download_often.py
@@ -48,7 +48,7 @@ def dictRowGenerator(line):


 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)

--- a/crawler_script/tracker_backwards.py
+++ b/crawler_script/tracker_backwards.py
@@ -11,7 +11,7 @@ import os
 import datetime

 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)

--- a/crawler_script/tracker_by_day.py
+++ b/crawler_script/tracker_by_day.py
@@ -14,7 +14,7 @@ import datetime


 # today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)

--- a/crawler_script/tracker_proceso.py
+++ b/crawler_script/tracker_proceso.py
@@ -10,7 +10,7 @@ import os



-baseDir   = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir   = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 s = {"crawler": "descarga_por_mes/proceso"}


--- a/descarga_hacia_atras/elFinanciero/elFinanciero/2019-01-24.json
+++ b/descarga_hacia_atras/elFinanciero/elFinanciero/2019-01-24.json
+[]
\ No newline at end of file
--- a/descarga_hacia_atras/elFinanciero/elFinanciero/spiders/.noticias.py.swp
+++ b/descarga_hacia_atras/elFinanciero/elFinanciero/spiders/.noticias.py.swp
--- a/descarga_hacia_atras/elFinanciero/elFinanciero/spiders/noticias.py
+++ b/descarga_hacia_atras/elFinanciero/elFinanciero/spiders/noticias.py
@@ -85,11 +85,13 @@ class QuotesSpider(scrapy.Spider):
        self.uri_page = "%22}&type=page&page="
        self.uri_complement = "&size=10"

+	print(self.uri_base+self.uri_page+self.uri_complement)
        for s in sectionList:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)


    def parse(self, response):
+	print(response.url)
        searchData = ImportantData()

        CONTINUE_SEARCHING = True

--- a/descarga_hacia_atras/foraneos/heraldoHn/news.json
+++ b/descarga_hacia_atras/foraneos/heraldoHn/news.json
--- a/descarga_hacia_atras/laJornadaBC/news.json
+++ b/descarga_hacia_atras/laJornadaBC/news.json
--- a/descarga_por_dia/elFinanciero/elFinanciero/__init__.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/__init__.py
--- a/descarga_por_dia/elFinanciero/elFinanciero/items.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/items.py
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ElfinancieroItem(scrapy.Item):
+    """Empty item left over from the Scrapy project template; it declares no fields."""
+class NoticiasItem(scrapy.Item):
+    """Container for one scraped news article.
+
+    Every attribute is a plain ``scrapy.Field()`` carrying no metadata.
+    """
+
+    # Article content.
+    title = scrapy.Field()
+    text = scrapy.Field()
+
+    # Descriptive data about the article.
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+
+    # Address the article was published under.
+    url = scrapy.Field()
--- a/descarga_por_dia/elFinanciero/elFinanciero/middlewares.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/middlewares.py
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class ElfinancieroSpiderMiddleware(object):
+    """Pass-through spider middleware generated by the Scrapy project template.
+
+    Every hook forwards what it receives unchanged; the only side effect
+    is an info-level log line when a spider is opened.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened,
+                                signal=signals.spider_opened)
+        return middleware
+
+    def process_spider_input(self, response, spider):
+        """Accept every response on its way into the spider: always ``None``."""
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        """Lazily re-yield each element of ``result`` exactly as the spider produced it."""
+        for produced in result:
+            yield produced
+
+    def process_spider_exception(self, response, exception, spider):
+        """Take no action on the exception: returns ``None``."""
+        return None
+
+    def process_start_requests(self, start_requests, spider):
+        """Lazily re-yield each start request unchanged."""
+        for request in start_requests:
+            yield request
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
+
+
+class ElfinancieroDownloaderMiddleware(object):
+    """Pass-through downloader middleware generated by the Scrapy project template.
+
+    Requests and responses are forwarded untouched; the only side effect
+    is an info-level log line when a spider is opened.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
+        middleware = cls()
+        crawler.signals.connect(middleware.spider_opened,
+                                signal=signals.spider_opened)
+        return middleware
+
+    def process_request(self, request, spider):
+        """Leave the request alone: ``None`` means "continue processing this request"."""
+        return None
+
+    def process_response(self, request, response, spider):
+        """Hand back the very same ``response`` object that was received."""
+        return response
+
+    def process_exception(self, request, exception, spider):
+        """Take no action: ``None`` means "continue processing this exception"."""
+        return None
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        message = 'Spider opened: %s' % spider.name
+        spider.logger.info(message)
--- a/descarga_por_dia/elFinanciero/elFinanciero/pipelines.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/pipelines.py
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+
+class ElfinancieroPipeline(object):
+    """Collect every scraped item in memory and dump them as one JSON array.
+
+    The output path is read from the ``filename`` Scrapy setting, which is
+    given on the command line as ``-s filename=<path>``.
+    """
+
+    def __init__(self, filename):
+        """Remember the output path and start with an empty item buffer."""
+        self.filename = filename
+        # Created here as well as in open_spider() so that process_item()
+        # never finds the attribute missing.
+        self.itemList = []
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Create the pipeline from the crawler's ``filename`` setting."""
+        return cls(crawler.settings.get('filename'))
+
+    def open_spider(self, spider):
+        """Reset the item buffer when a spider starts."""
+        self.itemList = []
+
+    def close_spider(self, spider):
+        """Write all buffered items to ``self.filename`` as a single JSON array.
+
+        When no file name was configured nothing can be written; an error
+        is logged through the spider's logger instead of letting
+        ``open(None)`` fail with an unrelated-looking ``TypeError``.
+        """
+        if not self.filename:
+            spider.logger.error(
+                "ElfinancieroPipeline: no 'filename' setting given "
+                "(-s filename=out.json); %d items were not saved",
+                len(self.itemList))
+            return
+        with open(self.filename, 'w') as fp:
+            json.dump(self.itemList, fp)
+
+    def process_item(self, item, spider):
+        """Buffer a plain-dict copy of ``item`` and pass the item on unchanged."""
+        self.itemList.append(dict(item))
+        return item
+
--- a/descarga_por_dia/elFinanciero/elFinanciero/settings.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/settings.py
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for elFinanciero project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+# Name of the bot implemented by this Scrapy project.
+BOT_NAME = 'elFinanciero'
+
+# New spiders are generated in this package, and it is also the only
+# package searched for existing spiders.
+NEWSPIDER_MODULE = 'elFinanciero.spiders'
+SPIDER_MODULES = [NEWSPIDER_MODULE]
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'elFinanciero.middlewares.ElfinancieroDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# A single pipeline is enabled; 300 is its order value.
+ITEM_PIPELINES = {'elFinanciero.pipelines.ElfinancieroPipeline': 300}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--- a/descarga_por_dia/elFinanciero/elFinanciero/spiders/.noticias.py.swp
+++ b/descarga_por_dia/elFinanciero/elFinanciero/spiders/.noticias.py.swp
--- a/descarga_por_dia/elFinanciero/elFinanciero/spiders/__init__.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/spiders/__init__.py
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
--- a/descarga_por_dia/elFinanciero/elFinanciero/spiders/noticias.py
+++ b/descarga_por_dia/elFinanciero/elFinanciero/spiders/noticias.py
+# -*- coding: utf-8 -*-
+
+"""
+MEDIA:
+    El Financiero
+USAGE:
+    ## Get all the news from a specific date. ##
+    ---------------------------------------------------------------------------------------------
+    $ cd elFinanciero/
+    $ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
+"""
+
+import scrapy, re, json
+from elFinanciero.items import NoticiasItem
+from datetime import datetime, timedelta, tzinfo
+#------------------------------------------------------------------------------------------------      
+allSections = [{"name":"Economía","slug":"economia"},{"name":"Empresas","slug":"empresas"},{"name":"Mercados","slug":"mercados"},{"name":"Pyme","slug":"pyme"},{"name":"Franquicias","slug":"franquicias"},{"name":"Nacional","slug":"nacional"},{"name":"Tech","slug":"tech"},{"name":"Mundo","slug":"mundo"},{"name":"Deportes","slug":"deportes"},{"name":"Culturas","slug":"culturas"},{"name":"Buena Vida","slug":"buena-vida"},{"name":"Reflector","slug":"reflector"},{"name":"Ciencia","slug":"ciencia"},{"name":"Mis Finanzas","slug":"mis-finanzas"},{"name":"Opinión","slug":"opinion"},{"name":"Interactivos","slug":"interactivos"},{"name":"Blogs","slug":"blogs"},{"name":"Fotogalerías","slug":"fotogalerias"},{"name":"Financial Times","slug":"financial-times"},{"name":"Power Tools","slug":"power-tools"},{"name":"Bajío","slug":"bajio"},{"name":"Monterrey","slug":"monterrey"},{"name":"Universidades","slug":"universidades"},{"name":"Mundo empresa","slug":"mundo-empresa"},{"name":"Texas","slug":"texas"},{"name":"Suplementos","slug":"suplementos"},{"name":"Archivo","slug":"archivo"},{"name":"Pages","slug":"pages"},{"name":"Licitaciones","slug":"licitaciones"},{"name":"Bloomberg","slug":"bloomberg"},{"name":"Startup","slug":"startup"},{"name":"Mercados - Acciones","slug":"mercados/acciones"},{"name":"Mercados - IPC","slug":"mercados/ipc"},{"name":"Mercados - Divisas","slug":"mercados/divisas"},{"name":"Mercados - Dinero","slug":"mercados/dinero"},{"name":"Mercados - Commodities","slug":"mercados/commodities"},{"name":"TLCAN","slug":"tlcan"},{"name":"Blogs - Territorio Viral","slug":"blogs/territorio-viral"},{"name":"Blogs - Templo del Morbo","slug":"blogs/templo-del-morbo"},{"name":"Sponsor","slug":"sponsor"},{"name":"Bloomberg Businessweek","slug":"bloomberg-businessweek"},{"name":"Millonarios","slug":"millonarios"},{"name":"Management","slug":"management"},{"name":"Viajes","slug":"viajes"},{"name":"Cartones","slug":"cartones"},{"name":"EF Eventos","slug":"ef-eventos"},{"name":"Blogs 
- Efecto Jazz","slug":"blogs/efecto-jazz"},{"name":"Blogs - Visión CFA","slug":"blogs/vision-cfa"},{"name":"Pages - Eventos","slug":"pages/eventos"},{"name":"Pages - Interactivos","slug":"pages/interactivos"},{"name":"Pages - PDF","slug":"pages/pdf"},{"name":"Pages - Documentos","slug":"pages/documentos"},{"name":"Pages - Docs","slug":"pages/docs"},{"name":"TV","slug":"tv"},{"name":"Tv - Al sonar la campana","slug":"tv/al-sonar-la-campana"},{"name":"Tv - Espresso Doble","slug":"tv/espresso-doble"},{"name":"Tv - Ganadores & Perdedores","slug":"tv/ganadores-y-perdedores"},{"name":"Tv - Entre Mercados","slug":"tv/entre-mercados"},{"name":"Tv - Mesa Central","slug":"tv/mesa-central"},{"name":"Tv - Bitácora Política","slug":"tv/bitacora-politica"},{"name":"Tv - Sin Línea","slug":"tv/sin-linea"},{"name":"Tv - Al Cierre","slug":"tv/al-cierre"},{"name":"Tv - Tiempo de Toros","slug":"tv/tiempo-de-toros"},{"name":"Tv - Nación 321","slug":"tv/nacion321"},{"name":"Tv - El mundo según...","slug":"tv/el-mundo-segun"},{"name":"Tv - En EF y por Adela","slug":"tv/en-ef-y-por-adela"},{"name":"Tv - La Nota Dura","slug":"tv/la-nota-dura"},{"name":"Tv - La Silla Roja","slug":"tv/la-silla-roja"},{"name":"Tv - Personajes","slug":"tv/personajes"},{"name":"Tv - Tech","slug":"tv/tech"},{"name":"Tv - Mundo","slug":"tv/mundo"},{"name":"Tv - Finanzas Personales","slug":"tv/finanzas-personales"},{"name":"Tv - Estilo de Vida","slug":"tv/estilo-de-vida"},{"name":"Tv - Bloomberg","slug":"tv/bloomberg"},{"name":"Tv - Viral","slug":"tv/viral"},{"name":"Tv - Nacional","slug":"tv/nacional"},{"name":"Tv - Empresas","slug":"tv/empresas"},{"name":"Tv - Economía","slug":"tv/economia"},{"name":"Tv - Reflector","slug":"tv/reflector"},{"name":"Tv - Sponsor","slug":"tv/sponsor"},{"name":"Rankings","slug":"rankings"},{"name":"Trivias","slug":"trivias"},{"name":"Elecciones 2018","slug":"elecciones-2018"},{"name":"Pages - Businessweek 
México","slug":"pages/businessweek-mexico"},{"name":"Fibras","slug":"fibras"},{"name":"After Office","slug":"after-office"},{"name":"New York Times Syndicate","slug":"new-york-times-syndicate"},{"name":"México en Hannover","slug":"mexico-en-hannover"},{"name":"Tv - Opinión","slug":"tv/opinion"},{"name":"Pages - Central Política","slug":"pages/central-politica"},{"name":"Relojes","slug":"relojes"},{"name":"Autos","slug":"autos"},{"name":"Sibarita","slug":"sibarita"},{"name":"Letras Libres","slug":"letras-libres"},{"name":"Rusia 2018","slug":"rusia-2018"},{"name":"Tv - Especiales","slug":"tv/especiales"},{"name":"Tv - Bloomberg Businessweek","slug":"tv/bloomberg-businessweek"},{"name":"Tv - Gabinete de Seguridad","slug":"tv/gabinete-de-seguridad"},{"name":"Transición","slug":"transicion"},{"name":"Emprendedores","slug":"emprendedores"},{"name":"Blogs - Monoblock","slug":"blogs/monoblock"},{"name":"Península","slug":"peninsula"},{"name":"ESPN","slug":"espn"},{"name":"Tv - La Cuarta Transformación","slug":"tv/la-cuarta-transformacion"},{"name":"Primeros 100 días","slug":"primeros-100-dias"}]
+#------------------------------------------------------------------------------------------------
+# Matches one markup tag: an opening angle bracket, at least one character
+# that is not a closing angle bracket, then the closing angle bracket.
+TAG_RE = re.compile(r'<[^>]+>')
+
+
+def remove_tags(text):
+    """Return ``text`` with every markup tag matched by ``TAG_RE`` removed.
+
+    Only the tags themselves are dropped; everything between them is
+    returned untouched. ``None`` (for instance a JSON ``null`` body)
+    yields an empty string instead of raising ``TypeError``.
+    """
+    if text is None:
+        return ''
+    return TAG_RE.sub('', text)
+
+#------------------------------------------------------------------------------------------------
+class QuotesSpider(scrapy.Spider):
+    """Download the El Financiero articles published on one given day.
+
+    The day is supplied through the spider arguments ``year``, ``month``
+    and ``day`` (``-a year=2017 -a month=3 -a day=22``). One search
+    request is sent to the site's JSON search API for every entry of
+    ``allSections``.
+    """
+    name = "noticias"
+
+    def start_requests(self):
+        """Yield one search request per section, restricted to the requested date.
+
+        Raises:
+            ValueError: if ``year``, ``month`` or ``day`` was not supplied.
+        """
+        year = getattr(self, "year", None)
+        month = getattr(self, "month", None)
+        self.day = getattr(self, "day", None)
+        if year is None or month is None or self.day is None:
+            # Fail with a usable message instead of "NoneType + str".
+            raise ValueError(
+                "year, month and day are required: "
+                "scrapy crawl noticias -a year=YYYY -a month=M -a day=D")
+
+        self.this_date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
+        # The query is a URL-encoded JSON object (%22 is a double quote).
+        # min_date and max_date are both set to the requested day.
+        self.baseURL1 = "https://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
+        # NOTE(review): only page 1 with size=10000 is requested; presumably
+        # no section exceeds that many articles in a day -- confirm.
+        self.baseURL2 = "%22,%22min_date%22:%22"+self.this_date+"%22,%22max_date%22:%22"+self.this_date+"%22}&type=page&page=1&size=10000"
+
+        for section in allSections:
+            yield scrapy.Request(url=self.baseURL1 + section["slug"] + self.baseURL2,
+                                 callback=self.parse)
+
+    def parse(self, response):
+        """Turn one search response into ``NoticiasItem`` objects, one per hit."""
+        # The hits are read from the second element of the "data" array.
+        hits = json.loads(response.text)["data"][1]
+        for hit in hits:
+            source = hit["_source"]
+            item = NoticiasItem()
+            item["title"] = source["title"]
+            item["date"] = source["createdAt"]
+            item["text"] = remove_tags(source["html"])
+            item["topic"] = source["categoryId"]["slug"]
+            item["author"] = self._author_name(source)
+            item["url"] = "https://elfinanciero.com.mx/" + source["slug"]
+
+            yield item
+
+    @staticmethod
+    def _author_name(source):
+        """Return "name aPaterno aMaterno" of the first author, or "" if there is none.
+
+        Missing or null name parts are treated as empty strings so that a
+        single article without a complete byline does not abort the whole
+        response.
+        """
+        authors = source.get("author") or []
+        if not authors:
+            return ""
+        first = authors[0]
+        parts = [first.get(key) or "" for key in ("name", "aPaterno", "aMaterno")]
+        return " ".join(parts)
--- a/descarga_por_dia/elFinanciero/scrapy.cfg
+++ b/descarga_por_dia/elFinanciero/scrapy.cfg
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = elFinanciero.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = elFinanciero