m3 / crawlersNoticias / Commits

Commit 0ff76f58, authored 6 years ago by Renán Sosa Guillen
Commit message: deleted
Parent commit: 093e0e82

Showing 24 changed files with 0 additions and 1351 deletions (+0 -1351)
...rga_hacia_atras/laJornadaMaya2/laJornadaMaya2/__init__.py        +0 -0
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/items.py         +0 -14
..._hacia_atras/laJornadaMaya2/laJornadaMaya2/middlewares.py        +0 -56
...ga_hacia_atras/laJornadaMaya2/laJornadaMaya2/pipelines.py        +0 -11
...rga_hacia_atras/laJornadaMaya2/laJornadaMaya2/settings.py        +0 -90
...a_atras/laJornadaMaya2/laJornadaMaya2/spiders/__init__.py        +0 -4
...a_atras/laJornadaMaya2/laJornadaMaya2/spiders/noticias.py        +0 -106
descarga_hacia_atras/laJornadaMaya2/scrapy.cfg                      +0 -11
..._atras/laJornadaMaya_deprecated/laJornadaMaya/__init__.py        +0 -0
...cia_atras/laJornadaMaya_deprecated/laJornadaMaya/items.py        +0 -14
...ras/laJornadaMaya_deprecated/laJornadaMaya/middlewares.py        +0 -56
...atras/laJornadaMaya_deprecated/laJornadaMaya/pipelines.py        +0 -11
..._atras/laJornadaMaya_deprecated/laJornadaMaya/settings.py        +0 -90
...aJornadaMaya_deprecated/laJornadaMaya/spiders/__init__.py        +0 -4
...aJornadaMaya_deprecated/laJornadaMaya/spiders/noticias.py        +0 -208
descarga_hacia_atras/laJornadaMaya_deprecated/scrapy.cfg            +0 -11
descarga_por_mes/proceso_prueba/proceso_prueba/__init__.py          +0 -0
descarga_por_mes/proceso_prueba/proceso_prueba/items.py             +0 -20
...arga_por_mes/proceso_prueba/proceso_prueba/middlewares.py        +0 -103
descarga_por_mes/proceso_prueba/proceso_prueba/pipelines.py         +0 -75
descarga_por_mes/proceso_prueba/proceso_prueba/settings.py          +0 -100
...por_mes/proceso_prueba/proceso_prueba/spiders/__init__.py        +0 -4
...por_mes/proceso_prueba/proceso_prueba/spiders/noticias.py        +0 -352
descarga_por_mes/proceso_prueba/scrapy.cfg                          +0 -11
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/__init__.py (deleted, 100644 → 0, empty file)
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/items.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Lajornadamaya2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/middlewares.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Lajornadamaya2SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/pipelines.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class Lajornadamaya2Pipeline(object):
    def process_item(self, item, spider):
        return item
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/settings.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Scrapy settings for laJornadaMaya2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'laJornadaMaya2'

SPIDER_MODULES = ['laJornadaMaya2.spiders']
NEWSPIDER_MODULE = 'laJornadaMaya2.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornadaMaya2 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'laJornadaMaya2.middlewares.Lajornadamaya2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'laJornadaMaya2.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'laJornadaMaya2.pipelines.Lajornadamaya2Pipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/spiders/__init__.py (deleted, 100644 → 0)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_hacia_atras/laJornadaMaya2/laJornadaMaya2/spiders/noticias.py (deleted, 100644 → 0)

import scrapy, json, re
from datetime import datetime, date, timedelta, tzinfo

"""
Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
de una fecha especifica.
USO:
scrapy crawl noticias -t json --nolog -o noticias.json
Genera un archivo JSON con todas las noticias disponibles. El archivo 'parse_date_file.py'
puede servir para clasificar dichas noticias en sus respectivas fechas.
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """clase para el 'time zone' (zona horaria)"""
    def utcoffset(self, dt):
        # zona horaria para yucatan (centro de mexico): utc-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        # nombre de la zona horaria
        return 'UTC-6'


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.baseURL = 'https://www.lajornadamaya.mx'
        section_list = [('yucatan', 123, 'Yucatan'), ('quintana-roo', 52, 'Quintana Roo'),
                        ('campeche', 32, 'Campeche'), ('opinion', 0, 'Opinion'),
                        ('deportes', 91, 'Deportes'), ('nacional', 100, 'Nacional'),
                        ('internacional', 87, 'Internacional')]
        # section_list = [('opinion',0,'Opinion')]

        for section in section_list:
            self.section = section
            if not (section[0] == 'opinion'):
                for page in range(0, section[1] + 1):
                    yield scrapy.Request(url=self.baseURL + '/' + section[0] + '?p=' + str(page), callback=self.parse)
            else:
                yield scrapy.Request(url=self.baseURL + '/notas?opinion', callback=self.parse)

    def parse(self, response):
        json_response = json.loads(response.text)
        if not (self.section[0] == 'opinion'):
            json_list = json_response
        else:
            json_list = json_response['articles']

        for line in json_list:
            item = NoticiasItem()
            d = line['publishDate']
            if len(d) == 10:
                d = map(int, d.split('-'))
                d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
            elif len(d) == 19:
                d, t = d.split(' ')
                d = map(int, d.split('-'))
                t = map(int, t.split(':'))
                d = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
            item['date'] = d
            item['topic'] = self.section[2]
            item['title'] = line['name']
            if not (self.section[0] == 'opinion'):
                request = scrapy.Request(url=self.baseURL + line['url'], callback=self.parse_item)
            else:
                request = scrapy.Request(url=self.baseURL + '/' + line['publishDate'][:line['publishDate'].rfind(' ')] + '/' + line['uriComponent'], callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        item = response.meta['item']
        text = ''
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
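A note on the date handling in the spider above: map(int, d.split('-')) is indexed immediately afterwards (d[0], d[1], ...), which only works on Python 2, where map() returns a list. Purely as an illustration of what those two branches compute, here is a standalone sketch of the same publishDate to ISO 8601 conversion written so it also runs on Python 3; the UTC class is copied from the deleted spider so the snippet is self-contained, and the sample input value is made up.

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """UTC-6 time zone, as defined in the deleted spider."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

def publish_date_to_iso(publish_date, tz=UTC()):
    """Convert 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' into an ISO 8601 string."""
    if len(publish_date) == 10:
        y, m, d = [int(x) for x in publish_date.split('-')]
        return datetime(y, m, d, tzinfo=tz).isoformat('T')
    elif len(publish_date) == 19:
        d_part, t_part = publish_date.split(' ')
        y, m, d = [int(x) for x in d_part.split('-')]
        hh, mm, ss = [int(x) for x in t_part.split(':')]
        return datetime(y, m, d, hh, mm, ss, tzinfo=tz).isoformat('T')

print(publish_date_to_iso('2017-03-22 10:15:00'))  # -> 2017-03-22T10:15:00-06:00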
descarga_hacia_atras/laJornadaMaya2/scrapy.cfg (deleted, 100644 → 0)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = laJornadaMaya2.settings

[deploy]
#url = http://localhost:6800/
project = laJornadaMaya2
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/__init__.py (deleted, 100644 → 0, empty file)
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/items.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LajornadamayaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/middlewares.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LajornadamayaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/pipelines.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class LajornadamayaPipeline(object):
    def process_item(self, item, spider):
        return item
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/settings.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Scrapy settings for laJornadaMaya project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'laJornadaMaya'

SPIDER_MODULES = ['laJornadaMaya.spiders']
NEWSPIDER_MODULE = 'laJornadaMaya.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornadaMaya (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'laJornadaMaya.middlewares.LajornadamayaSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'laJornadaMaya.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'laJornadaMaya.pipelines.LajornadamayaPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/spiders/__init__.py (deleted, 100644 → 0)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_hacia_atras/laJornadaMaya_deprecated/laJornadaMaya/spiders/noticias.py (deleted, 100644 → 0)

import scrapy, json, re
from datetime import datetime, date, timedelta, tzinfo

"""
Esta version descarga ingresando una fecha.
USO:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
No es recomendable para fechas de mas de un mes de antiguas.
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """clase para el 'time zone' (zona horaria)"""
    def utcoffset(self, dt):
        # zona horaria para yucatan (centro de mexico): utc-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        # nombre de la zona horaria
        return 'UTC-6'


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        # self.found = False
        # self.flag = False
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.req_date = date(int(self.year), int(self.month), int(self.day))
        self.date_format = "%Y-%m-%d"

        self.baseURL = 'https://www.lajornadamaya.mx'
        section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional', 'internacional', 'opinion']
        # section_list = ['deportes']

        for section in section_list:
            self.section = section
            for count in range(0, 2):
                if (count == 0):
                    yield scrapy.Request(url=self.baseURL + '/' + section, callback=self.parse_2)

                elif (count == 1):
                    # self.section = section
                    self.page = 0
                    self.flag = False
                    self.found = False
                    page = -1

                    if not (section == 'opinion'):
                        while True:
                            if (self.flag):
                                self.flag = False
                                break
                            page += 1
                            yield scrapy.Request(url=self.baseURL + '/' + section + '?p=' + str(page), callback=self.parse)

                        if (self.found):
                            self.found = False
                            self.page -= 1
                            if (self.page > 0):
                                self.page -= 1
                            for pag in range(self.page, self.page + 6):
                                yield scrapy.Request(url=self.baseURL + '/' + section + '?p=' + str(pag), callback=self.parse_page, dont_filter=True)
                    else:
                        yield scrapy.Request(url=self.baseURL + '/notas?opinion', callback=self.parse_page)

    def parse_2(self, response):
        # para las primeras noticias
        path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
        link_list = []
        for path in path_list:
            link_list += response.xpath(path).extract()

        for link in link_list:
            if (link[:link.rfind('/')] == self.year + '-' + self.month.zfill(2) + '-' + self.day.zfill(2)):
                item = NoticiasItem()
                d = link[:link.rfind('/')]
                if len(d) == 10:
                    d = map(int, d.split('-'))
                    d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
                elif len(d) == 19:
                    d, t = d.split(' ')
                    d = map(int, d.split('-'))
                    t = map(int, t.split(':'))
                    d = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
                item['date'] = d
                item['topic'] = response.url[response.url.rfind('/') + 1:].title()
                # yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
                request = scrapy.Request(url=self.baseURL + '/' + link, callback=self.parse_item_2)
                request.meta['item'] = item
                yield request

    def parse(self, response):
        # para los json
        json_response = json.loads(response.text)
        if not (response.url[response.url.rfind('/') + 1:] == 'notas?opinion'):
            json_list = json_response
        else:
            json_list = json_response['articles']

        for line in json_list:
            this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
            this_date = this_date.date()
            if (this_date == self.req_date):
                self.page = int(response.url[response.url.rfind('=') + 1:])
                self.found = True
                self.flag = True
                break
            elif (this_date < self.req_date):
                self.flag = True
                break

    def parse_item_2(self, response):
        # para las primeras noticias
        item = response.meta['item']
        # item = NoticiasItem()
        text = ''
        # item['date'] = response.url[:response.url.rfind('/')][response.url[:response.url.rfind('/')].rfind('/')+1:]
        # item['topic'] = self.section.title()
        item['title'] = response.xpath('//article/h1/text()').extract_first()

        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        print item['title']
        yield item

    def parse_page(self, response):
        # para los json
        json_response = json.loads(response.text)
        if not (response.url[response.url.rfind('/') + 1:] == 'notas?opinion'):
            topic = response.url[response.url.rfind('/') + 1:response.url.rfind('=') - 2].title()
            json_list = json_response
        else:
            json_list = json_response['articles']
            topic = 'Opinion'

        for line in json_list:
            this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
            this_date = this_date.date()
            if (this_date == self.req_date):
                item = NoticiasItem()
                # item['date'] = line['publishDate']
                d = line['publishDate']
                if len(d) == 10:
                    d = map(int, d.split('-'))
                    d = datetime(d[0], d[1], d[2], tzinfo=self.tz).isoformat('T')
                elif len(d) == 19:
                    d, t = d.split(' ')
                    d = map(int, d.split('-'))
                    t = map(int, t.split(':'))
                    d = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')
                item['date'] = d
                item['topic'] = topic
                item['title'] = line['name']
                if not (response.url[response.url.rfind('/') + 1:] == 'notas?opinion'):
                    request = scrapy.Request(url=self.baseURL + line['url'], callback=self.parse_item)
                else:
                    request = scrapy.Request(url=self.baseURL + '/' + line['publishDate'][:line['publishDate'].rfind(' ')] + '/' + line['uriComponent'], callback=self.parse_item)
                request.meta['item'] = item
                yield request

    def parse_item(self, response):
        # para los json
        item = response.meta['item']
        text = ''
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        print item['title']
        yield item
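The stopping logic in parse() and parse_page() above relies on trimming the time portion off publishDate and comparing the remainder against the requested date. A minimal standalone illustration of that comparison follows; the format string matches self.date_format in the deleted spider, the sample publishDate and requested date are made up, and the snippet runs on Python 2 or 3.

from datetime import datetime, date

date_format = "%Y-%m-%d"              # same as self.date_format above
publish_date = "2017-03-22 10:15:00"  # example value of line['publishDate']
req_date = date(2017, 3, 22)          # self.req_date built from -a year/month/day

this_date = datetime.strptime(publish_date[:publish_date.rfind(' ')], date_format).date()

if this_date == req_date:
    print("requested date reached on this page")   # spider sets self.found and self.flag here
elif this_date < req_date:
    print("page is already older than requested")  # spider sets self.flag only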
descarga_hacia_atras/laJornadaMaya_deprecated/scrapy.cfg (deleted, 100644 → 0)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = laJornadaMaya.settings

[deploy]
#url = http://localhost:6800/
project = laJornadaMaya
descarga_por_mes/proceso_prueba/proceso_prueba/__init__.py (deleted, 100644 → 0, empty file)
descarga_por_mes/proceso_prueba/proceso_prueba/items.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
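NoticiasItem above is the dict-like container that every spider in this commit fills in. As a quick standalone illustration of how such an Item behaves (the field values below are made up):

import scrapy

class NoticiasItem(scrapy.Item):
    # same fields as the deleted items.py above
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

item = NoticiasItem()
item['title'] = 'Titular de ejemplo'   # hypothetical values, for illustration only
item['topic'] = 'Nacional'
print(dict(item))                      # Items can be read back like a dict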
descarga_por_mes/proceso_prueba/proceso_prueba/middlewares.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ProcesoPruebaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ProcesoPruebaDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_por_mes/proceso_prueba/proceso_prueba/pipelines.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
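As the spider docstring later in this commit shows, the file name consumed by from_crawler() is passed on the command line as a Scrapy setting, for example: scrapy crawl noticias --nolog -s filename=noticias.json. Because the pipeline writes the array by hand ("[" on open_spider, objects separated by ",\n", "]" on close_spider), the result is one JSON document that can be loaded back in a single call, as sketched below; the file name is simply the one used in that docstring.

import json

# Read back the array written by JsonWriterPipeline after a crawl run with:
#   scrapy crawl noticias --nolog -s filename=noticias.json
with open("noticias.json") as f:
    articles = json.load(f)

print(len(articles))                 # number of scraped items
if articles:
    print(articles[0].get("title"))  # keys follow the NoticiasItem fields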
descarga_por_mes/proceso_prueba/proceso_prueba/settings.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-

# Scrapy settings for proceso_prueba project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'proceso_prueba'

SPIDER_MODULES = ['proceso_prueba.spiders']
NEWSPIDER_MODULE = 'proceso_prueba.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    # 'proceso_prueba.middlewares.ProcesoPruebaSpiderMiddleware': 543,
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'proceso_prueba.pipelines.JsonWriterPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

SPLASH_URL = 'http://localhost:8050/'

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
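These settings follow the standard scrapy-splash wiring (SplashDeduplicateArgsMiddleware in the spider middlewares, the Splash cookies and Splash downloader middlewares plus HttpCompression, the Splash-aware dupefilter and cache storage) and they assume a Splash service is actually listening at SPLASH_URL. A minimal standard-library check of that assumption, for Python 3, is sketched below; the URL is the one from this settings file.

import urllib.request

SPLASH_URL = "http://localhost:8050/"   # same value as in the deleted settings.py

# If nothing answers here, every SplashRequest issued by the spider will fail,
# so it is worth confirming the service responds before starting a crawl.
with urllib.request.urlopen(SPLASH_URL, timeout=5) as resp:
    print(resp.status)   # 200 when the Splash web UI answers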
descarga_por_mes/proceso_prueba/proceso_prueba/spiders/__init__.py (deleted, 100644 → 0)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_mes/proceso_prueba/proceso_prueba/spiders/noticias.py (deleted, 100644 → 0)

# -*- coding: utf-8 -*-
"""
MEDIA:
Proceso, CDMX

USAGE:
## For this crawler 'scrapy-splash' is used because the content is loaded through javascript. ##
## Read especs_sitio_proceso.txt file. ##

$ cd proceso_prueba/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. It's necessary to use the parse_date_files.py file
   for the news contained in noticias.json being splitted into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific year-month date. ##
$ scrapy crawl noticias --nolog -s filename=2018-09.json -a year=2018 -a month=9
"""

import scrapy, re, cfscrape
from proceso_prueba.items import NoticiasItem
from datetime import datetime, date, timedelta, tzinfo
from scrapy.http.cookies import CookieJar
from scrapy_splash import SplashRequest, SplashFormRequest

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """
    Class for Time Zone
    """
    def utcoffset(self, dt):
        ## Time zone for CDMX: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'


USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

script = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    assert(splash:go{
        splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
    })
    assert(splash:wait(0.5))

    local entries = splash:history()
    local last_response = entries[#entries].response
    return {
        url = splash:url(),
        headers = last_response.headers,
        http_status = last_response.status,
        cookies = splash:get_cookies(),
        html = splash:html(),
    }
end
"""


class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)

        self.month_parser = {'enero': '01', 'febrero': '02', 'marzo': '03', 'abril': '04',
                             'mayo': '05', 'junio': '06', 'julio': '07', 'agosto': '08',
                             'septiembre': '09', 'octubre': '10', 'noviembre': '11', 'diciembre': '12'}

        self.baseURL = "https://hemeroteca.proceso.com.mx/"
        login_url = "https://hemeroteca.proceso.com.mx/wp-login.php"

        if year is not None and month is not None:
            self.stop_date = date(int(year), int(month), 15)
            # yield scrapy.Request(url=login_url, callback=self.parse_with_stop_date)
        else:
            self.stop_date = None
            # yield scrapy.Request(url=login_url, callback=self.parse)

        self.token, self.agent = cfscrape.get_tokens(login_url, user_agent=USER_AGENT)
        print "token"
        print self.token
        yield SplashRequest(url=login_url, callback=self.parse_login, endpoint='render.html',
                            args={'wait': 0.5}, cookies=self.token, headers={'User-Agent': self.agent})

    def parse_login(self, response):
        return SplashFormRequest.from_response(
            response,
            method="POST",
            formdata={
                'log': 'carlos_silvaforne@yahoo.com.mx',
                'pwd': 'carlos_silvaforne@'
            },
            callback=self.after_login,
            dont_click=True
        )

    def after_login(self, response):
        print "\nresponse.request.headers"
        print response.request.meta
        print "\nresponse.headers"
        print response.headers

        session_legend = response.css('div.topnav > a').extract()[-1]
        if session_legend is not None:
            print "\nsession_legend"
            print remove_tags(session_legend)
        else:
            print "No log in."

        item_link = "https://hemeroteca.proceso.com.mx/?page_id=420325"
        yield SplashRequest(url=item_link, callback=self.parse_links, meta=response.request.meta,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script},
                            headers={'User-Agent': self.agent}
                            )
        # yield SplashRequest(url=item_link, callback=self.parse_links, endpoint='render.html', args={ 'wait': 0.5 }, cookies=self.token,
        #                     headers={'User-Agent' : self.agent})

    def parse_links(self, response):
        session_legend = response.css('div.topnav > a').extract()
        print "\nresponse.body"
        print response.body
        # if session_legend is not None :
        #     print "\nsession_legend"
        #     print remove_tags(session_legend)
        # else :
        #     print "No log in."
        print "\nresponse.request.headers"
        print response.request.headers
        print "\n"

    def after_login_org(self, response):
        ## Check login succeed before going on ##
        print response.headers
        print "\n"
        print response.real_url
        print "\n"
        print response.request.headers
        print "\n"

        cookie_list = response.request.headers.getlist('Cookie')
        cfc, cfd = cookie_list[0].split(';')
        cfc = cfc.strip().split('=')
        cfd = cfd.strip().split('=')
        cookies = [cfc[1], cfd[1]]
        cookies = {cfc[0]: cfc[1], cfd[0]: cfd[1]}

        session_legend = response.css('div.topnav > a').extract()[-1]
        print response.css('h1.entry-title').extract_first()
        print "\n"

        if session_legend is not None:
            session_legend = remove_tags(session_legend)
            if not "Cerrar" in session_legend:
                print "Login failed."
            else:
                print session_legend
                print "\n"
                token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
                print token
                print "\n"
                if self.stop_date is None:
                    yield scrapy.Request(url=self.baseURL, callback=self.parse)
                else:
                    yield SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
                                        cookies=cookies,
                                        endpoint='execute',
                                        cache_args=['lua_source'],
                                        args={'lua_source': script},
                                        headers={'User-Agent': USER_AGENT}
                                        )
                    # request = SplashRequest(url=self.baseURL, callback=self.parse_with_stop_date,
                    #                         endpoint='execute',
                    #                         cache_args=['lua_source'],
                    #                         args={'lua_source': script},
                    #                         headers={'User-Agent': USER_AGENT}
                    #                         )
                    # request.meta['splash']['session_id'] = cookie_list[0]
                    # yield request

        # if "authentication failed" in response.body:
        #     self.logger.error("Login failed.")
        #     return
        # else:
        #     # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
        #     if self.stop_date is None:
        #         yield scrapy.Request(url=self.baseURL, callback=self.parse, dont_filter=True)
        #     else:
        #         yield scrapy.Request(url=self.baseURL, callback=self.parse_with_stop_date, dont_filter=True)
        #         yield scrapy.Request(
        #             url=self.baseURL,
        #             callback=self.parse_with_stop_date,
        #             cookies=token,
        #             headers={'User-Agent' : agent}
        #         )

    def parse_with_stop_date(self, response):
        print "parse_with_stop_date"
        print "\n"
        print response.css('h1.entry-title').extract_first()
        print "\n"
        print response.cookiejar
        print "\n"
        print response.headers
        print "\n"

        session_legend = response.css('div.topnav > a').extract()[-1]
        if session_legend is not None:
            print remove_tags(session_legend)
            print "\n"
        else:
            print "No log in."

        TO_NEXT_PAGE = True

        for item in response.css('div.catpor-box > div'):
            item_date = item.css('span.catpor-published').extract_first()
            if item_date is not None:
                item_date = remove_tags(item_date).replace(",", '')
                item_date = item_date.split(' ')
                item_date[1] = self.month_parser[item_date[1]]
                item_date = map(int, item_date)
                item_date = date(item_date[2], item_date[1], item_date[0])

                if item_date >= self.stop_date:
                    item_link = item.css('span.catpor-title > a::attr(href)').extract_first()
                    print item_link
                    # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
                    # yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=cookies)
                    # yield scrapy.Request(url=item_link, callback=self.parse_links, cookies=token,
                    #                      headers={'User-Agent' : agent})
                    # yield SplashRequest(url=item_link, callback=self.parse_links,
                    #                     endpoint='execute',
                    #                     cache_args=['lua_source'],
                    #                     args={'lua_source': script},
                    #                     headers={'User-Agent': USER_AGENT}
                    #                     )
                else:
                    TO_NEXT_PAGE = False
                    break

        if TO_NEXT_PAGE:
            next_page = response.css('div.page-navigation > div.nav-next > a::attr(href)').extract_first()
            if next_page is not None:
                # yield scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
                yield SplashRequest(url=next_page, callback=self.parse_with_stop_date,
                                    endpoint='execute',
                                    cache_args=['lua_source'],
                                    args={'lua_source': script},
                                    headers={'User-Agent': USER_AGENT}
                                    )

    def parse_links_org(self, response):
        print "\n\n"
        print response.headers
        for link in response.css('div.post-container > h2 > a::attr(href)').extract():
            # print link
            # token, agent = cfscrape.get_tokens(self.baseURL, user_agent=USER_AGENT)
            # yield SplashRequest(url=link, callback=self.parse_item, endpoint='render.html', args={ 'wait': 0.5 }, cookies=token,
            #                     headers={'User-Agent' : agent})
            yield SplashRequest(url=link, callback=self.parse_item,
                                endpoint='execute',
                                cache_args=['lua_source'],
                                args={'lua_source': script},
                                headers={'User-Agent': USER_AGENT}
                                )

    def parse_item_org(self, response):
        # if response.url == "https://hemeroteca.proceso.com.mx/?page_id=278958&a51dc26366d99bb5fa29cea4747565fec=420203":
        #     print response.body
        item = NoticiasItem()
        text = ''

        news_date = response.xpath('//div[@id="primary"]').css('span.published').extract_first()
        if news_date is not None:
            news_date = remove_tags(news_date)
            print news_date
            d, t = news_date.split(' ')
            d = map(int, d.split("-"))
            t = map(int, t.split(":"))
            news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat('T')

        title = response.xpath('//div[@id="primary"]/div/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('span.entry-categories').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        for paragraph in response.xpath('//div[@id="primary"]').css('div.entry-content > div').css('p').extract():
            text += remove_tags(paragraph) + '\n'

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url
        yield item
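The date handling in parse_with_stop_date() above uses the same Python 2 idiom as the other spiders (indexing the result of map()). As a rough illustration of what it computes, the standalone sketch below turns a 'span.catpor-published' string such as '12 septiembre, 2018' into a datetime.date using the month_parser table copied from the deleted spider; the sample input is made up and the snippet also runs on Python 3.

from datetime import date

# month_parser, copied from the deleted spider
MONTH_PARSER = {'enero': '01', 'febrero': '02', 'marzo': '03', 'abril': '04',
                'mayo': '05', 'junio': '06', 'julio': '07', 'agosto': '08',
                'septiembre': '09', 'octubre': '10', 'noviembre': '11', 'diciembre': '12'}

def parse_published(raw):
    # e.g. raw == "12 septiembre, 2018" (day, Spanish month name, year)
    parts = raw.replace(",", "").split(" ")
    parts[1] = MONTH_PARSER[parts[1]]
    day, month, year = [int(p) for p in parts]   # list comprehension instead of bare map()
    return date(year, month, day)

print(parse_published("12 septiembre, 2018"))   # -> 2018-09-12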
descarga_por_mes/proceso_prueba/scrapy.cfg (deleted, 100644 → 0)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = proceso_prueba.settings

[deploy]
#url = http://localhost:6800/
project = proceso_prueba