m3 / crawlersNoticias

Commit 11d4dc11, authored 6 years ago by Mario Chirinos Colunga
Commit message: elFinanciero

Parent: ffa4478a
Showing 20 changed files with 52030 additions and 11 deletions (+52030 −11):
crawler_script/download_backwards.py (+2 −2)
crawler_script/download_by_day.py (+2 −2)
crawler_script/download_often.py (+2 −2)
crawler_script/tracker_backwards.py (+2 −2)
crawler_script/tracker_by_day.py (+2 −2)
crawler_script/tracker_proceso.py (+1 −1)
descarga_hacia_atras/elFinanciero/elFinanciero/2019-01-24.json (+1 −0)
descarga_hacia_atras/elFinanciero/elFinanciero/spiders/.noticias.py.swp (+0 −0)
descarga_hacia_atras/elFinanciero/elFinanciero/spiders/noticias.py (+2 −0)
descarga_hacia_atras/foraneos/heraldoHn/news.json (+598 −0)
descarga_hacia_atras/laJornadaBC/news.json (+51097 −0)
descarga_por_dia/elFinanciero/elFinanciero/__init__.py (+0 −0)
descarga_por_dia/elFinanciero/elFinanciero/items.py (+24 −0)
descarga_por_dia/elFinanciero/elFinanciero/middlewares.py (+103 −0)
descarga_por_dia/elFinanciero/elFinanciero/pipelines.py (+32 −0)
descarga_por_dia/elFinanciero/elFinanciero/settings.py (+90 −0)
descarga_por_dia/elFinanciero/elFinanciero/spiders/.noticias.py.swp (+0 −0)
descarga_por_dia/elFinanciero/elFinanciero/spiders/__init__.py (+4 −0)
descarga_por_dia/elFinanciero/elFinanciero/spiders/noticias.py (+57 −0)
descarga_por_dia/elFinanciero/scrapy.cfg (+11 −0)
crawler_script/download_backwards.py

@@ -12,7 +12,7 @@ import datetime
 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
 	siteList = json.load(data_file)
@@ -98,4 +98,4 @@ with open(sys.argv[1]) as data_file:
 		os.chdir("..")
 # print today.year
-# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
\ No newline at end of file
+# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
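The same two-line change (the storage root moving from virtualHDD/m3 to M3NAS) repeats across all five crawler_script drivers in this commit. For orientation, a minimal sketch of the driver loop these scripts share, assuming site-list entries carry a "crawler" key (the only key this commit shows, in tracker_proceso.py); the loop body is illustrative, not the scripts' exact logic:

    import json
    import os
    import sys

    baseDir = "/home/geoint/M3NAS/noticias/"      # new storage root from this commit
    scrapyDir = "/home/geoint/crawlersNoticias/"  # checkout of this repository

    # The site list is the script's first argument, e.g. sites.json (hypothetical name).
    with open(sys.argv[1]) as data_file:
        siteList = json.load(data_file)

    for site in siteList:
        os.chdir(scrapyDir + site["crawler"])  # enter that site's Scrapy project
        # run something like:
        # scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
        os.chdir("..")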
crawler_script/download_by_day.py

@@ -12,7 +12,7 @@ import datetime
 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
 	siteList = json.load(data_file)
@@ -80,4 +80,4 @@ with open(sys.argv[1]) as data_file:
 		os.chdir("..")
 # print today.year
-# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
\ No newline at end of file
+# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
crawler_script/download_often.py

@@ -48,7 +48,7 @@ def dictRowGenerator(line):
 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
 	siteList = json.load(data_file)
@@ -224,4 +224,4 @@ with open(sys.argv[1]) as data_file:
 		os.chdir("..")
 print today.year
-# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
\ No newline at end of file
+# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
crawler_script/tracker_backwards.py

@@ -11,7 +11,7 @@ import os
 import datetime
 today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
 	siteList = json.load(data_file)
@@ -97,4 +97,4 @@ with open(sys.argv[1]) as data_file:
 		os.chdir("..")
 # print today.year
-# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
\ No newline at end of file
+# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
crawler_script/tracker_by_day.py

@@ -14,7 +14,7 @@ import datetime
 # today = datetime.datetime.now()
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 with open(sys.argv[1]) as data_file:
 	siteList = json.load(data_file)
@@ -84,4 +84,4 @@ with open(sys.argv[1]) as data_file:
 		os.chdir("..")
 # print hasta.year
-# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
\ No newline at end of file
+# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
crawler_script/tracker_proceso.py

@@ -10,7 +10,7 @@ import os
-baseDir = "/home/geoint/virtualHDD/m3/noticias/"
+baseDir = "/home/geoint/M3NAS/noticias/"
 scrapyDir = "/home/geoint/crawlersNoticias/"
 s = {"crawler": "descarga_por_mes/proceso"}
descarga_hacia_atras/elFinanciero/elFinanciero/2019-01-24.json (new file, mode 100644)

+[]
\ No newline at end of file
descarga_hacia_atras/elFinanciero/elFinanciero/spiders/.noticias.py.swp (new file, mode 100644)

File added (binary; content not shown).
descarga_hacia_atras/elFinanciero/elFinanciero/spiders/noticias.py

@@ -85,11 +85,13 @@ class QuotesSpider(scrapy.Spider):
 		self.uri_page = "%22}&type=page&page="
 		self.uri_complement = "&size=10"
+		print(self.uri_base + self.uri_page + self.uri_complement)
 		for s in sectionList:
 			yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

 	def parse(self, response):
+		print(response.url)
 		searchData = ImportantData()
 		CONTINUE_SEARCHING = True
descarga_hacia_atras/foraneos/heraldoHn/news.json (new file, mode 100644; +598 lines, content not shown)
descarga_hacia_atras/laJornadaBC/news.json (new file, mode 100644; +51097 lines, content not shown)
descarga_por_dia/elFinanciero/elFinanciero/__init__.py (new file, mode 100644; empty)
descarga_por_dia/elFinanciero/elFinanciero/items.py (new file, mode 100644)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ElfinancieroItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
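NoticiasItem is the item class every spider in this commit fills. A minimal usage sketch with invented values (only the field names come from items.py above):

    from elFinanciero.items import NoticiasItem

    item = NoticiasItem()
    item["title"] = "Example headline"  # illustrative values throughout
    item["date"] = "2019-01-24"
    item["topic"] = "economia"
    item["url"] = "https://elfinanciero.com.mx/example-slug"
    print(dict(item))  # pipelines store items as plain dicts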
descarga_por_dia/elFinanciero/elFinanciero/middlewares.py (new file, mode 100644)

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ElfinancieroSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ElfinancieroDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_por_dia/elFinanciero/elFinanciero/pipelines.py (new file, mode 100644)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json


class ElfinancieroPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.itemList = []

    def close_spider(self, spider):
        # print(len(self.itemList))
        with open(self.filename, 'w') as fp:
            json.dump(self.itemList, fp)

    def process_item(self, item, spider):
        self.itemList.append(dict(item))
        return item
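The from_crawler hook above is how the pipeline learns its output file: a -s filename=... flag on the scrapy command line lands in crawler.settings, and close_spider() dumps every collected item into that file as a single JSON array. A small sketch of the resulting behavior, reusing the example date from the spider USAGE string later in this commit; the item content is invented:

    import json

    # process_item() appends dict(item) for every scraped item.
    itemList = [{"title": "Example headline"}]

    # close_spider() then writes the whole run as one JSON array; the name
    # came from `scrapy crawl noticias -s filename=2017-03-22.json`.
    with open("2017-03-22.json", "w") as fp:
        json.dump(itemList, fp)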
descarga_por_dia/elFinanciero/elFinanciero/settings.py (new file, mode 100644)

# -*- coding: utf-8 -*-

# Scrapy settings for elFinanciero project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'elFinanciero'

SPIDER_MODULES = ['elFinanciero.spiders']
NEWSPIDER_MODULE = 'elFinanciero.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'elFinanciero.middlewares.ElfinancieroDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'elFinanciero.pipelines.ElfinancieroPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_por_dia/elFinanciero/elFinanciero/spiders/.noticias.py.swp (new file, mode 100644)

File added (binary; content not shown).
descarga_por_dia/elFinanciero/elFinanciero/spiders/__init__.py (new file, mode 100644)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_dia/elFinanciero/elFinanciero/spiders/noticias.py (new file, mode 100644)

# -*- coding: utf-8 -*-
"""
MEDIA:
El Financiero

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elFinanciero/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""

import scrapy, re, json
from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

#------------------------------------------------------------------------------------------------
allSections = [
    {"name": "Economía", "slug": "economia"}, {"name": "Empresas", "slug": "empresas"}, {"name": "Mercados", "slug": "mercados"},
    {"name": "Pyme", "slug": "pyme"}, {"name": "Franquicias", "slug": "franquicias"}, {"name": "Nacional", "slug": "nacional"},
    {"name": "Tech", "slug": "tech"}, {"name": "Mundo", "slug": "mundo"}, {"name": "Deportes", "slug": "deportes"},
    {"name": "Culturas", "slug": "culturas"}, {"name": "Buena Vida", "slug": "buena-vida"}, {"name": "Reflector", "slug": "reflector"},
    {"name": "Ciencia", "slug": "ciencia"}, {"name": "Mis Finanzas", "slug": "mis-finanzas"}, {"name": "Opinión", "slug": "opinion"},
    {"name": "Interactivos", "slug": "interactivos"}, {"name": "Blogs", "slug": "blogs"}, {"name": "Fotogalerías", "slug": "fotogalerias"},
    {"name": "Financial Times", "slug": "financial-times"}, {"name": "Power Tools", "slug": "power-tools"}, {"name": "Bajío", "slug": "bajio"},
    {"name": "Monterrey", "slug": "monterrey"}, {"name": "Universidades", "slug": "universidades"}, {"name": "Mundo empresa", "slug": "mundo-empresa"},
    {"name": "Texas", "slug": "texas"}, {"name": "Suplementos", "slug": "suplementos"}, {"name": "Archivo", "slug": "archivo"},
    {"name": "Pages", "slug": "pages"}, {"name": "Licitaciones", "slug": "licitaciones"}, {"name": "Bloomberg", "slug": "bloomberg"},
    {"name": "Startup", "slug": "startup"}, {"name": "Mercados - Acciones", "slug": "mercados/acciones"}, {"name": "Mercados - IPC", "slug": "mercados/ipc"},
    {"name": "Mercados - Divisas", "slug": "mercados/divisas"}, {"name": "Mercados - Dinero", "slug": "mercados/dinero"}, {"name": "Mercados - Commodities", "slug": "mercados/commodities"},
    {"name": "TLCAN", "slug": "tlcan"}, {"name": "Blogs - Territorio Viral", "slug": "blogs/territorio-viral"}, {"name": "Blogs - Templo del Morbo", "slug": "blogs/templo-del-morbo"},
    {"name": "Sponsor", "slug": "sponsor"}, {"name": "Bloomberg Businessweek", "slug": "bloomberg-businessweek"}, {"name": "Millonarios", "slug": "millonarios"},
    {"name": "Management", "slug": "management"}, {"name": "Viajes", "slug": "viajes"}, {"name": "Cartones", "slug": "cartones"},
    {"name": "EF Eventos", "slug": "ef-eventos"}, {"name": "Blogs - Efecto Jazz", "slug": "blogs/efecto-jazz"}, {"name": "Blogs - Visión CFA", "slug": "blogs/vision-cfa"},
    {"name": "Pages - Eventos", "slug": "pages/eventos"}, {"name": "Pages - Interactivos", "slug": "pages/interactivos"}, {"name": "Pages - PDF", "slug": "pages/pdf"},
    {"name": "Pages - Documentos", "slug": "pages/documentos"}, {"name": "Pages - Docs", "slug": "pages/docs"}, {"name": "TV", "slug": "tv"},
    {"name": "Tv - Al sonar la campana", "slug": "tv/al-sonar-la-campana"}, {"name": "Tv - Espresso Doble", "slug": "tv/espresso-doble"}, {"name": "Tv - Ganadores & Perdedores", "slug": "tv/ganadores-y-perdedores"},
    {"name": "Tv - Entre Mercados", "slug": "tv/entre-mercados"}, {"name": "Tv - Mesa Central", "slug": "tv/mesa-central"}, {"name": "Tv - Bitácora Política", "slug": "tv/bitacora-politica"},
    {"name": "Tv - Sin Línea", "slug": "tv/sin-linea"}, {"name": "Tv - Al Cierre", "slug": "tv/al-cierre"}, {"name": "Tv - Tiempo de Toros", "slug": "tv/tiempo-de-toros"},
    {"name": "Tv - Nación 321", "slug": "tv/nacion321"}, {"name": "Tv - El mundo según...", "slug": "tv/el-mundo-segun"}, {"name": "Tv - En EF y por Adela", "slug": "tv/en-ef-y-por-adela"},
    {"name": "Tv - La Nota Dura", "slug": "tv/la-nota-dura"}, {"name": "Tv - La Silla Roja", "slug": "tv/la-silla-roja"}, {"name": "Tv - Personajes", "slug": "tv/personajes"},
    {"name": "Tv - Tech", "slug": "tv/tech"}, {"name": "Tv - Mundo", "slug": "tv/mundo"}, {"name": "Tv - Finanzas Personales", "slug": "tv/finanzas-personales"},
    {"name": "Tv - Estilo de Vida", "slug": "tv/estilo-de-vida"}, {"name": "Tv - Bloomberg", "slug": "tv/bloomberg"}, {"name": "Tv - Viral", "slug": "tv/viral"},
    {"name": "Tv - Nacional", "slug": "tv/nacional"}, {"name": "Tv - Empresas", "slug": "tv/empresas"}, {"name": "Tv - Economía", "slug": "tv/economia"},
    {"name": "Tv - Reflector", "slug": "tv/reflector"}, {"name": "Tv - Sponsor", "slug": "tv/sponsor"}, {"name": "Rankings", "slug": "rankings"},
    {"name": "Trivias", "slug": "trivias"}, {"name": "Elecciones 2018", "slug": "elecciones-2018"}, {"name": "Pages - Businessweek México", "slug": "pages/businessweek-mexico"},
    {"name": "Fibras", "slug": "fibras"}, {"name": "After Office", "slug": "after-office"}, {"name": "New York Times Syndicate", "slug": "new-york-times-syndicate"},
    {"name": "México en Hannover", "slug": "mexico-en-hannover"}, {"name": "Tv - Opinión", "slug": "tv/opinion"}, {"name": "Pages - Central Política", "slug": "pages/central-politica"},
    {"name": "Relojes", "slug": "relojes"}, {"name": "Autos", "slug": "autos"}, {"name": "Sibarita", "slug": "sibarita"},
    {"name": "Letras Libres", "slug": "letras-libres"}, {"name": "Rusia 2018", "slug": "rusia-2018"}, {"name": "Tv - Especiales", "slug": "tv/especiales"},
    {"name": "Tv - Bloomberg Businessweek", "slug": "tv/bloomberg-businessweek"}, {"name": "Tv - Gabinete de Seguridad", "slug": "tv/gabinete-de-seguridad"}, {"name": "Transición", "slug": "transicion"},
    {"name": "Emprendedores", "slug": "emprendedores"}, {"name": "Blogs - Monoblock", "slug": "blogs/monoblock"}, {"name": "Península", "slug": "peninsula"},
    {"name": "ESPN", "slug": "espn"}, {"name": "Tv - La Cuarta Transformación", "slug": "tv/la-cuarta-transformacion"}, {"name": "Primeros 100 días", "slug": "primeros-100-dias"}
]

#------------------------------------------------------------------------------------------------
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

#------------------------------------------------------------------------------------------------
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        self.this_date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)

        self.baseURL1 = "https://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
        self.baseURL2 = "%22,%22min_date%22:%22" + self.this_date + "%22,%22max_date%22:%22" + self.this_date + "%22}&type=page&page=1&size=10000"
        # print(self.baseURL)

        for i in allSections:
            yield scrapy.Request(url=self.baseURL1 + i["slug"] + self.baseURL2, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.text)["data"][1]

        for d in data:
            item = NoticiasItem()
            item["title"] = d["_source"]["title"]
            item["date"] = d["_source"]["createdAt"]
            item["text"] = remove_tags(d["_source"]["html"])
            item["topic"] = d["_source"]["categoryId"]["slug"]
            item["author"] = d["_source"]["author"][0]["name"] + " " + d["_source"]["author"][0]["aPaterno"] + " " + d["_source"]["author"][0]["aMaterno"]
            item["url"] = "https://elfinanciero.com.mx/" + d["_source"]["slug"]
            yield item
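To make the string assembly in start_requests() concrete, this is the URL the spider builds for the docstring's example date (year=2017, month=3, day=22) and the "economia" slug, derived directly from baseURL1 and baseURL2 above:

    year, month, day, slug = "2017", "3", "22", "economia"
    this_date = year + "-" + month.zfill(2) + "-" + day.zfill(2)  # "2017-03-22"

    url = ("https://api.elfinanciero.com.mx/public/search/typed/?_format=json"
           "&json={%22search%22:%22*%22,%22categoriesslug%22:%22" + slug +
           "%22,%22min_date%22:%22" + this_date + "%22,%22max_date%22:%22" +
           this_date + "%22}&type=page&page=1&size=10000")

    # %22 is a percent-encoded double quote, so the json= parameter decodes to:
    # {"search":"*","categoriesslug":"economia","min_date":"2017-03-22","max_date":"2017-03-22"}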
descarga_por_dia/elFinanciero/scrapy.cfg (new file, mode 100644)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = elFinanciero.settings

[deploy]
#url = http://localhost:6800/
project = elFinanciero