Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
crawlersNoticias
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
4
Issues
4
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
m3
crawlersNoticias
Commits
9cf59fb3
Commit
9cf59fb3
authored
2 years ago
by
Mario Chirinos
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
modificacion el comentario
parent
673e21e0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
13 deletions
+13
-13
noticias.py
spiders/daily/elComentario/elComentario/spiders/noticias.py
+13
-13
No files found.
spiders/daily/elComentario/elComentario/spiders/noticias.py
View file @
9cf59fb3
...
...
@@ -12,11 +12,11 @@ class NoticiasSpider(scrapy.Spider):
start_urls
=
[
'http://elcomentario.ucol.mx/'
]
def
start_requests
(
self
):
year
=
getattr
(
self
,
"year"
,
None
)
month
=
getattr
(
self
,
"month"
,
None
)
day
=
getattr
(
self
,
"day"
,
None
)
self
.
year
=
getattr
(
self
,
"year"
,
None
)
self
.
month
=
getattr
(
self
,
"month"
,
None
)
self
.
day
=
getattr
(
self
,
"day"
,
None
)
baseURL
=
"https://elcomentario.ucol.mx/{0}/{1}/{2}/"
.
format
(
year
,
month
.
zfill
(
2
),
day
.
zfill
(
2
))
baseURL
=
"https://elcomentario.ucol.mx/{0}/{1}/{2}/"
.
format
(
self
.
year
,
self
.
month
.
zfill
(
2
),
self
.
day
.
zfill
(
2
))
yield
scrapy
.
Request
(
url
=
baseURL
,
callback
=
self
.
parse
)
...
...
@@ -24,26 +24,26 @@ class NoticiasSpider(scrapy.Spider):
def
parse
(
self
,
response
):
print
(
response
.
url
)
for
link
in
response
.
xpath
(
'//h
5[@class="mkd-pt-six
-title"]/a/@href'
)
.
extract
():
yield
scrapy
.
Request
(
url
=
link
,
callback
=
self
.
parse_item
)
for
link
in
response
.
xpath
(
'//h
2[@class="thumb
-title"]/a/@href'
)
.
extract
():
yield
scrapy
.
Request
(
url
=
"https://elcomentario.ucol.mx"
+
link
,
callback
=
self
.
parse_item
)
next_page
=
response
.
xpath
(
'//li[@class="
mkd-pagination-next
"]/a/@href'
)
.
extract_first
()
next_page
=
response
.
xpath
(
'//li[@class="
the-next-page
"]/a/@href'
)
.
extract_first
()
print
(
"next_page"
,
next_page
)
if
next_page
is
not
None
:
yield
scrapy
.
Request
(
url
=
next_page
,
callback
=
self
.
parse
)
#-----------------------------------------------------------------------
def
parse_item
(
self
,
response
):
print
(
response
.
url
)
#
print(response.url)
item
=
ElcomentarioItem
()
item
[
"date"
]
=
response
.
xpath
(
"//meta[@property='article:published_time']/@content"
)
.
extract_first
(
)
item
[
"title"
]
=
response
.
xpath
(
"//
meta[@property='og:title']/@content
"
)
.
extract_first
()
item
[
"topic"
]
=
""
item
[
"date"
]
=
self
.
year
+
"-"
+
self
.
month
.
zfill
(
2
)
+
"-"
+
self
.
day
.
zfill
(
2
)
item
[
"title"
]
=
response
.
xpath
(
"//
div[@class='entry-header']/h1/text()
"
)
.
extract_first
()
item
[
"topic"
]
=
response
.
xpath
(
"//div[@class='entry-header']/span/a/text()"
)
.
extract_first
()
text
=
""
for
p
in
response
.
xpath
(
'//div[@class="
pf-content
"]/p'
)
.
extract
():
for
p
in
response
.
xpath
(
'//div[@class="
entry-content entry clearfix
"]/p'
)
.
extract
():
text
+=
remove_tags
(
p
)
+
"
\n
"
item
[
"text"
]
=
text
item
[
"url"
]
=
response
.
url
print
(
item
[
"title"
])
print
(
self
.
allowed_domains
,
item
[
"title"
])
yield
(
item
)
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment