m3 / crawlersNoticias

Commit b3a05891 authored 7 years ago by Renán Sosa Guillen
crawlers

parent 3e1b2011
Showing 24 changed files with 534 additions and 0 deletions (+534 / -0)
descarga_por_dia/laRazon/laRazon/__init__.py                +0   -0
descarga_por_dia/laRazon/laRazon/__init__.pyc               +0   -0
descarga_por_dia/laRazon/laRazon/items.py                   +14  -0
descarga_por_dia/laRazon/laRazon/middlewares.py             +56  -0
descarga_por_dia/laRazon/laRazon/pipelines.py               +11  -0
descarga_por_dia/laRazon/laRazon/settings.py                +90  -0
descarga_por_dia/laRazon/laRazon/settings.pyc               +0   -0
descarga_por_dia/laRazon/laRazon/spiders/__init__.py        +4   -0
descarga_por_dia/laRazon/laRazon/spiders/__init__.pyc       +0   -0
descarga_por_dia/laRazon/laRazon/spiders/noticias.py        +69  -0
descarga_por_dia/laRazon/laRazon/spiders/noticias.pyc       +0   -0
descarga_por_dia/laRazon/scrapy.cfg                         +11  -0
descarga_por_dia/unoMasUno/scrapy.cfg                       +11  -0
descarga_por_dia/unoMasUno/unoMasUno/__init__.py            +0   -0
descarga_por_dia/unoMasUno/unoMasUno/__init__.pyc           +0   -0
descarga_por_dia/unoMasUno/unoMasUno/items.py               +14  -0
descarga_por_dia/unoMasUno/unoMasUno/middlewares.py         +56  -0
descarga_por_dia/unoMasUno/unoMasUno/pipelines.py           +11  -0
descarga_por_dia/unoMasUno/unoMasUno/settings.py            +90  -0
descarga_por_dia/unoMasUno/unoMasUno/settings.pyc           +0   -0
descarga_por_dia/unoMasUno/unoMasUno/spiders/__init__.py    +4   -0
descarga_por_dia/unoMasUno/unoMasUno/spiders/__init__.pyc   +0   -0
descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.py    +93  -0
descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.pyc   +0   -0
descarga_por_dia/laRazon/laRazon/__init__.py (new file, 0 → 100644; empty)

descarga_por_dia/laRazon/laRazon/__init__.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/laRazon/laRazon/items.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class LarazonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
descarga_por_dia/laRazon/laRazon/middlewares.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class LarazonSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
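The generated LarazonSpiderMiddleware is inert until it is registered. For reference only (this commit leaves it disabled), activating it would mean uncommenting the SPIDER_MIDDLEWARES block that already appears, commented out, in laRazon/laRazon/settings.py:

# Sketch: activation in laRazon/laRazon/settings.py (commented out in this commit).
SPIDER_MIDDLEWARES = {
    'laRazon.middlewares.LarazonSpiderMiddleware': 543,
}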
descarga_por_dia/laRazon/laRazon/pipelines.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class LarazonPipeline(object):
    def process_item(self, item, spider):
        return item
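The generated pipeline is a pass-through. As an illustrative sketch only (not part of this commit), a pipeline that drops articles scraped without a title could look like the following; the class name ValidateNoticiaPipeline is hypothetical, while DropItem is the standard Scrapy exception for discarding items:

# -*- coding: utf-8 -*-
# Hypothetical example, not in this commit: discard items that have no title.
from scrapy.exceptions import DropItem


class ValidateNoticiaPipeline(object):
    def process_item(self, item, spider):
        if not item.get('title'):
            raise DropItem('Missing title in %s' % item.get('url'))
        return item

Like the generated LarazonPipeline, it would only take effect once registered in the ITEM_PIPELINES setting shown (commented out) in settings.py below.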
descarga_por_dia/laRazon/laRazon/settings.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Scrapy settings for laRazon project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laRazon'

SPIDER_MODULES = ['laRazon.spiders']
NEWSPIDER_MODULE = 'laRazon.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laRazon (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laRazon.middlewares.LarazonSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laRazon.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'laRazon.pipelines.LarazonPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_por_dia/laRazon/laRazon/settings.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/laRazon/laRazon/spiders/__init__.py (new file, 0 → 100644)
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_dia/laRazon/laRazon/spiders/__init__.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/laRazon/laRazon/spiders/noticias.py (new file, 0 → 100644)
import scrapy, re

'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28
'''

TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    # Strip HTML tags from an extracted fragment.
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        # year, month and day are passed on the command line with -a.
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'https://www.razon.com.mx/' + year + '/' + month + '/' + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        # The total number of listing pages is taken from the "last" pagination link.
        pagination = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
        pagination = pagination.strip('/')
        pages = int(pagination[pagination.rfind('/') + 1:])

        for page in range(0, pages):
            if page == 0:
                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
            else:
                yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        # Follow every article link on the listing page.
        for link in response.xpath('//*[@class="td_module_1 td_module_wrap td-animation-stack"]/h3[@class="entry-title td-module-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        item['date'] = response.xpath('//span[@class="td-post-date td-post-date-no-dot"]/time/@datetime').extract_first()
        item['topic'] = response.xpath('//*[@class="entry-crumbs"]/span/a[@class="entry-crumb"]/text()').extract()[2]
        item['title'] = response.xpath('//*[@class="td-post-header"]/header/h1/text()').extract_first()

        for p in response.xpath('//*[@class="td-post-content"]/p').extract():
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
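A note on the pagination logic in parse(): the number of listing pages is recovered from the href of the "last" pagination link by stripping the trailing slash and taking everything after the final '/'. A minimal sketch with a hypothetical href:

# Hypothetical "last" pagination href, for illustration only.
pagination = 'https://www.razon.com.mx/2017/9/28/page/7/'
pagination = pagination.strip('/')
pages = int(pagination[pagination.rfind('/') + 1:])  # -> 7, so listing pages 1..7 are requested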
descarga_por_dia/laRazon/laRazon/spiders/noticias.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/laRazon/scrapy.cfg (new file, 0 → 100644)
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laRazon.settings
[deploy]
#url = http://localhost:6800/
project = laRazon
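The docstring in spiders/noticias.py documents the intended invocation: scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=28. An equivalent programmatic run, sketched here under the assumption that it is executed from descarga_por_dia/laRazon (so scrapy.cfg and the project settings resolve), would be:

# Hypothetical runner script, not part of this commit.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('FEED_FORMAT', 'json')        # mirrors "-t json"
settings.set('FEED_URI', 'noticias.json')  # mirrors "-o noticias.json"

process = CrawlerProcess(settings)
process.crawl('noticias', year='2017', month='9', day='28')
process.start()  # blocks until the crawl finishes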
descarga_por_dia/unoMasUno/scrapy.cfg (new file, 0 → 100644)
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = unoMasUno.settings
[deploy]
#url = http://localhost:6800/
project = unoMasUno
descarga_por_dia/unoMasUno/unoMasUno/__init__.py (new file, 0 → 100644; empty)

descarga_por_dia/unoMasUno/unoMasUno/__init__.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/unoMasUno/unoMasUno/items.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class UnomasunoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
descarga_por_dia/unoMasUno/unoMasUno/middlewares.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class UnomasunoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_por_dia/unoMasUno/unoMasUno/pipelines.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class UnomasunoPipeline(object):
    def process_item(self, item, spider):
        return item
descarga_por_dia/unoMasUno/unoMasUno/settings.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-
# Scrapy settings for unoMasUno project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'unoMasUno'

SPIDER_MODULES = ['unoMasUno.spiders']
NEWSPIDER_MODULE = 'unoMasUno.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'unoMasUno.middlewares.UnomasunoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'unoMasUno.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'unoMasUno.pipelines.UnomasunoPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_por_dia/unoMasUno/unoMasUno/settings.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/unoMasUno/unoMasUno/spiders/__init__.py (new file, 0 → 100644)
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_dia/unoMasUno/unoMasUno/spiders/__init__.pyc (new file, 0 → 100644; binary file added)

descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.py (new file, 0 → 100644)
import scrapy, re, datetime

'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
'''

TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    # Strip HTML tags from an extracted fragment.
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        # year, month and day are passed on the command line with -a.
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        # Map Spanish month names to month numbers for date parsing.
        self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                            'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                            'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
        self.baseURL = 'http://www.unomasuno.com.mx/index.php/' + year + '/' + month + '/' + day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        # Prefer the explicit "last" pagination link; otherwise fall back to the last
        # pagination link found, and to the current page alone if there is none.
        pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()

        if pagination is None:
            pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
            if len(pagination) > 0:
                pagination = pagination[-1].strip('/')
                pages = int(pagination[pagination.rfind('/') + 1:])

                for page in range(0, pages):
                    if page == 0:
                        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                    else:
                        yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)
            else:
                yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        else:
            pagination = pagination.strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])

            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        # Follow every article link on the listing page.
        for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        # Post metadata is a Spanish date string such as "Septiembre 22, 2017";
        # convert it to a datetime.date via the month map.
        d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
        d = d.replace(',', '').split(' ')
        item['date'] = datetime.date(int(d[2]), self.date_parser[d[0].lower()], int(d[1]))

        item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
        item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()

        for p in response.xpath('//*[@class="entry"]/p').extract():
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
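The unoMasUno spider derives item['date'] from a Spanish-language date string in the post metadata, using the date_parser month map. A minimal sketch with a hypothetical metadata string:

import datetime

date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
               'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

d = 'Septiembre 22, 2017'          # hypothetical '//p[@class="post-meta"]/span' text
d = d.replace(',', '').split(' ')  # -> ['Septiembre', '22', '2017']
print(datetime.date(int(d[2]), date_parser[d[0].lower()], int(d[1])))  # 2017-09-22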
descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.pyc (new file, 0 → 100644; binary file added)