Commit 11d4dc11 authored by Mario Chirinos Colunga

elFinanciero

parent ffa4478a
@@ -12,7 +12,7 @@ import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
@@ -98,4 +98,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
\ No newline at end of file
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
@@ -12,7 +12,7 @@ import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
@@ -80,4 +80,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
\ No newline at end of file
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
@@ -48,7 +48,7 @@ def dictRowGenerator(line):
today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
@@ -224,4 +224,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..")
print(today.year)
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
\ No newline at end of file
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
@@ -11,7 +11,7 @@ import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
@@ -97,4 +97,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
\ No newline at end of file
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
@@ -14,7 +14,7 @@ import datetime
# today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
@@ -84,4 +84,4 @@ with open(sys.argv[1]) as data_file:
os.chdir("..")
# print hasta.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
\ No newline at end of file
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # runs the corresponding crawler for the given site
@@ -10,7 +10,7 @@ import os
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
s = {"crawler": "descarga_por_mes/proceso"}
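# --- Illustrative sketch (not part of the commit): every script patched above
# --- reads a site list from the JSON file given as sys.argv[1]. The diff only
# --- attests a "crawler" key (see the line above); the file layout sketched
# --- below is hypothetical, and "sitio" is an invented illustrative field.
import json
import sys

# Hypothetical contents of the file passed as sys.argv[1]:
# [{"sitio": "elFinanciero", "crawler": "descarga_por_dia/elFinanciero"}]
with open(sys.argv[1]) as data_file:  # same load pattern as the scripts above
    siteList = json.load(data_file)
for s in siteList:
    print(s["crawler"])  # e.g. "descarga_por_mes/proceso"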
@@ -85,11 +85,13 @@ class QuotesSpider(scrapy.Spider):
        self.uri_page = "%22}&type=page&page="
        self.uri_complement = "&size=10"
        print(self.uri_base+self.uri_page+self.uri_complement)
        for s in sectionList:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

    def parse(self, response):
        print(response.url)
        searchData = ImportantData()
        CONTINUE_SEARCHING = True
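# --- Illustrative sketch (not part of the commit): the hunk above hints at a
# --- paginated API search driven by a CONTINUE_SEARCHING flag, but the rest of
# --- that spider is collapsed in this diff. The class below is only a rough
# --- sketch of the common Scrapy page-chaining pattern; the URL, names and
# --- stop condition are assumptions, not the project's actual code.
import json
import scrapy

class PagedSearchSketch(scrapy.Spider):
    name = "paged_search_sketch"
    uri_base = "https://example.org/search?q=*"  # placeholder, not the real API
    uri_page = "&page="
    uri_complement = "&size=10"

    def start_requests(self):
        yield scrapy.Request(self.uri_base + self.uri_page + "1" + self.uri_complement,
                             callback=self.parse, meta={"page": 1})

    def parse(self, response):
        page = response.meta["page"]
        results = json.loads(response.text).get("data", [])
        # Keep requesting pages until the API returns an empty batch.
        CONTINUE_SEARCHING = len(results) > 0
        for r in results:
            yield r
        if CONTINUE_SEARCHING:
            yield scrapy.Request(
                self.uri_base + self.uri_page + str(page + 1) + self.uri_complement,
                callback=self.parse, meta={"page": page + 1})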
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class ElfinancieroItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
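# --- Usage sketch (not part of the commit): a NoticiasItem is filled like a
# --- dict restricted to the fields declared above; all values here are
# --- placeholders, not real data.
item = NoticiasItem()
item["title"] = "Example headline"
item["url"] = "https://elfinanciero.com.mx/example-slug"
print(dict(item))  # {'title': 'Example headline', 'url': '...'}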
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElfinancieroSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ElfinancieroDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
class ElfinancieroPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')
        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.itemList = []

    def close_spider(self, spider):
        # print(len(self.itemList))
        with open(self.filename, 'w') as fp:
            json.dump(self.itemList, fp)

    def process_item(self, item, spider):
        self.itemList.append(dict(item))
        return item
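# --- Usage sketch (not part of the commit): the pipeline buffers every scraped
# --- item in memory and writes a single JSON array when the spider closes. The
# --- "filename" value is read from the settings in from_crawler(), which is
# --- why the spider is launched with -s filename=... . An equivalent
# --- programmatic run, assuming it is executed from the project directory
# --- (the date values are placeholders):
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set("filename", "2017-03-22.json")  # consumed by ElfinancieroPipeline
process = CrawlerProcess(settings)
process.crawl("noticias", year="2017", month="3", day="22")
process.start()  # on close, close_spider() dumps the buffered items to the file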
# -*- coding: utf-8 -*-
# Scrapy settings for elFinanciero project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elFinanciero'
SPIDER_MODULES = ['elFinanciero.spiders']
NEWSPIDER_MODULE = 'elFinanciero.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elFinanciero.middlewares.ElfinancieroDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'elFinanciero.pipelines.ElfinancieroPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
El Financiero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elFinanciero/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re, json
from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
#------------------------------------------------------------------------------------------------
allSections = [{"name":"Economía","slug":"economia"},{"name":"Empresas","slug":"empresas"},{"name":"Mercados","slug":"mercados"},{"name":"Pyme","slug":"pyme"},{"name":"Franquicias","slug":"franquicias"},{"name":"Nacional","slug":"nacional"},{"name":"Tech","slug":"tech"},{"name":"Mundo","slug":"mundo"},{"name":"Deportes","slug":"deportes"},{"name":"Culturas","slug":"culturas"},{"name":"Buena Vida","slug":"buena-vida"},{"name":"Reflector","slug":"reflector"},{"name":"Ciencia","slug":"ciencia"},{"name":"Mis Finanzas","slug":"mis-finanzas"},{"name":"Opinión","slug":"opinion"},{"name":"Interactivos","slug":"interactivos"},{"name":"Blogs","slug":"blogs"},{"name":"Fotogalerías","slug":"fotogalerias"},{"name":"Financial Times","slug":"financial-times"},{"name":"Power Tools","slug":"power-tools"},
    {"name":"Bajío","slug":"bajio"},{"name":"Monterrey","slug":"monterrey"},{"name":"Universidades","slug":"universidades"},{"name":"Mundo empresa","slug":"mundo-empresa"},{"name":"Texas","slug":"texas"},{"name":"Suplementos","slug":"suplementos"},{"name":"Archivo","slug":"archivo"},{"name":"Pages","slug":"pages"},{"name":"Licitaciones","slug":"licitaciones"},{"name":"Bloomberg","slug":"bloomberg"},{"name":"Startup","slug":"startup"},{"name":"Mercados - Acciones","slug":"mercados/acciones"},{"name":"Mercados - IPC","slug":"mercados/ipc"},{"name":"Mercados - Divisas","slug":"mercados/divisas"},{"name":"Mercados - Dinero","slug":"mercados/dinero"},{"name":"Mercados - Commodities","slug":"mercados/commodities"},{"name":"TLCAN","slug":"tlcan"},{"name":"Blogs - Territorio Viral","slug":"blogs/territorio-viral"},{"name":"Blogs - Templo del Morbo","slug":"blogs/templo-del-morbo"},{"name":"Sponsor","slug":"sponsor"},
    {"name":"Bloomberg Businessweek","slug":"bloomberg-businessweek"},{"name":"Millonarios","slug":"millonarios"},{"name":"Management","slug":"management"},{"name":"Viajes","slug":"viajes"},{"name":"Cartones","slug":"cartones"},{"name":"EF Eventos","slug":"ef-eventos"},{"name":"Blogs - Efecto Jazz","slug":"blogs/efecto-jazz"},{"name":"Blogs - Visión CFA","slug":"blogs/vision-cfa"},{"name":"Pages - Eventos","slug":"pages/eventos"},{"name":"Pages - Interactivos","slug":"pages/interactivos"},{"name":"Pages - PDF","slug":"pages/pdf"},{"name":"Pages - Documentos","slug":"pages/documentos"},{"name":"Pages - Docs","slug":"pages/docs"},{"name":"TV","slug":"tv"},
    {"name":"Tv - Al sonar la campana","slug":"tv/al-sonar-la-campana"},{"name":"Tv - Espresso Doble","slug":"tv/espresso-doble"},{"name":"Tv - Ganadores & Perdedores","slug":"tv/ganadores-y-perdedores"},{"name":"Tv - Entre Mercados","slug":"tv/entre-mercados"},{"name":"Tv - Mesa Central","slug":"tv/mesa-central"},{"name":"Tv - Bitácora Política","slug":"tv/bitacora-politica"},{"name":"Tv - Sin Línea","slug":"tv/sin-linea"},{"name":"Tv - Al Cierre","slug":"tv/al-cierre"},{"name":"Tv - Tiempo de Toros","slug":"tv/tiempo-de-toros"},{"name":"Tv - Nación 321","slug":"tv/nacion321"},{"name":"Tv - El mundo según...","slug":"tv/el-mundo-segun"},{"name":"Tv - En EF y por Adela","slug":"tv/en-ef-y-por-adela"},{"name":"Tv - La Nota Dura","slug":"tv/la-nota-dura"},{"name":"Tv - La Silla Roja","slug":"tv/la-silla-roja"},{"name":"Tv - Personajes","slug":"tv/personajes"},{"name":"Tv - Tech","slug":"tv/tech"},{"name":"Tv - Mundo","slug":"tv/mundo"},{"name":"Tv - Finanzas Personales","slug":"tv/finanzas-personales"},{"name":"Tv - Estilo de Vida","slug":"tv/estilo-de-vida"},{"name":"Tv - Bloomberg","slug":"tv/bloomberg"},{"name":"Tv - Viral","slug":"tv/viral"},{"name":"Tv - Nacional","slug":"tv/nacional"},{"name":"Tv - Empresas","slug":"tv/empresas"},{"name":"Tv - Economía","slug":"tv/economia"},{"name":"Tv - Reflector","slug":"tv/reflector"},{"name":"Tv - Sponsor","slug":"tv/sponsor"},
    {"name":"Rankings","slug":"rankings"},{"name":"Trivias","slug":"trivias"},{"name":"Elecciones 2018","slug":"elecciones-2018"},{"name":"Pages - Businessweek México","slug":"pages/businessweek-mexico"},{"name":"Fibras","slug":"fibras"},{"name":"After Office","slug":"after-office"},{"name":"New York Times Syndicate","slug":"new-york-times-syndicate"},{"name":"México en Hannover","slug":"mexico-en-hannover"},{"name":"Tv - Opinión","slug":"tv/opinion"},{"name":"Pages - Central Política","slug":"pages/central-politica"},{"name":"Relojes","slug":"relojes"},{"name":"Autos","slug":"autos"},{"name":"Sibarita","slug":"sibarita"},{"name":"Letras Libres","slug":"letras-libres"},{"name":"Rusia 2018","slug":"rusia-2018"},{"name":"Tv - Especiales","slug":"tv/especiales"},{"name":"Tv - Bloomberg Businessweek","slug":"tv/bloomberg-businessweek"},{"name":"Tv - Gabinete de Seguridad","slug":"tv/gabinete-de-seguridad"},{"name":"Transición","slug":"transicion"},{"name":"Emprendedores","slug":"emprendedores"},{"name":"Blogs - Monoblock","slug":"blogs/monoblock"},{"name":"Península","slug":"peninsula"},{"name":"ESPN","slug":"espn"},{"name":"Tv - La Cuarta Transformación","slug":"tv/la-cuarta-transformacion"},{"name":"Primeros 100 días","slug":"primeros-100-dias"}]
#------------------------------------------------------------------------------------------------
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)
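# Quick check of the helper above (not part of the commit): the regex strips
# any markup and keeps the inner text.
print(remove_tags("<p>Hello <b>world</b></p>"))  # -> Hello world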
#------------------------------------------------------------------------------------------------
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        # year, month and day arrive as -a command-line arguments (see the
        # module docstring); all three are required to build the date filter.
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        self.this_date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
        self.baseURL1 = "https://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
        self.baseURL2 = "%22,%22min_date%22:%22" + self.this_date + "%22,%22max_date%22:%22" + self.this_date + "%22}&type=page&page=1&size=10000"
        # print(self.baseURL)
        # One request per section; the API filters by section slug and date.
        for i in allSections:
            yield scrapy.Request(url=self.baseURL1 + i["slug"] + self.baseURL2, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.text)["data"][1]
        for d in data:
            item = NoticiasItem()
            item["title"] = d["_source"]["title"]
            item["date"] = d["_source"]["createdAt"]
            item["text"] = remove_tags(d["_source"]["html"])
            item["topic"] = d["_source"]["categoryId"]["slug"]
            item["author"] = d["_source"]["author"][0]["name"] + " " + d["_source"]["author"][0]["aPaterno"] + " " + d["_source"]["author"][0]["aMaterno"]
            item["url"] = "https://elfinanciero.com.mx/" + d["_source"]["slug"]
            yield item
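# --- Illustrative sketch (not part of the commit): parse() above implies the
# --- API response shape below. Only the key paths are taken from the code;
# --- every value is an invented placeholder.
sample_response = {
    "data": [
        {},  # index 0 is ignored; parse() reads json.loads(response.text)["data"][1]
        [
            {"_source": {
                "title": "Example headline",
                "createdAt": "2017-03-22T10:00:00",
                "html": "<p>Story body</p>",
                "categoryId": {"slug": "economia"},
                "author": [{"name": "First", "aPaterno": "Last", "aMaterno": "Second"}],
                "slug": "example-headline",
            }}
        ],
    ]
}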
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = elFinanciero.settings
[deploy]
#url = http://localhost:6800/
project = elFinanciero