m3 / crawlersNoticias / Commits / 9fded341

Commit 9fded341 authored Jan 20, 2025 by umorales
changes to the cuestionDePolemica spider
parent 8b9f5b5f
Showing 1 changed file with 54 additions and 34 deletions

spiders/daily/cuestionDePolemica/cuestionDePolemica/spiders/noticias.py  +54 -34
@@ -7,7 +7,7 @@ from cuestionDePolemica.items import CuestiondepolemicaItem
 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
     """Removes HTML tags from the text."""
     if not isinstance(text, str):
         return text  # Return the original value if it is not a string
     return TAG_RE.sub('', text)
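For quick reference (this snippet is not part of the commit), a minimal, self-contained sketch of how the regex-based helper above behaves; the sample strings are invented for illustration:

# Illustrative sketch only: mirrors TAG_RE / remove_tags from the hunk above.
import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Strip HTML tags; non-string values pass through unchanged."""
    if not isinstance(text, str):
        return text
    return TAG_RE.sub('', text)

print(remove_tags('<p>Hola <b>mundo</b></p>'))  # -> Hola mundo
print(remove_tags(None))                        # -> None (returned as-is)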
@@ -15,50 +15,70 @@ def remove_tags(text):
 class NoticiasSpider(scrapy.Spider):
     name = "noticias"
     allowed_domains = ["www.cuestiondepolemica.com"]

     def __init__(self, year=None, month=None, day=None, *args, **kwargs):
         super(NoticiasSpider, self).__init__(*args, **kwargs)
         self.year = year
         self.month = month.zfill(2) if month else None
         self.day = day.zfill(2) if day else None

         if self.year and self.month and self.day:
             self.start_urls = [
-                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
+                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?"
+                f"after={self.year}-{self.month}-{self.day}T00:00:00&"
+                f"before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
             ]
         else:
             self.logger.error("Year, month, and day must be provided to generate start_urls.")
             self.start_urls = []

     def parse(self, response):
-        data = json.loads(response.text)
+        try:
+            data = json.loads(response.text)
+            self.logger.info(f"Received {len(data)} posts from API.")
+        except json.JSONDecodeError as e:
+            self.logger.error(f"Failed to parse JSON: {e}")
+            return

         for post in data:
-            # Validate that the content is not empty
-            content = post.get('content', {}).get('rendered', '').strip()
-            if not content:
-                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
-                continue
-            # Get the seventh element of class_list if it exists
-            class_list = post.get('class_list', [])
-            topic = class_list['7'] if len(class_list) > 7 else None
-            # Get the author link
-            author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
-            # Create an item with the required fields
-            item = CuestiondepolemicaItem()
-            item['date'] = post.get('date')
-            item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
-            item['text'] = remove_tags(content)
-            item['topic'] = topic.split("category-")[1]
-            item['url'] = post.get('link')
-            if author_link:
-                # Make an additional request to get the author's name
-                yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
-            else:
-                yield item  # If there is no author URL, yield the item without an author
+            try:
+                # Validate the content
+                content = post.get('content', {}).get('rendered', '').strip()
+                if not remove_tags(content):
+                    self.logger.warning(f"Skipped post {post.get('id')}: No meaningful content.")
+                    continue

-    def parse_author(self, response):
-        item = response.meta['item']  # Retrieve the item passed through meta
-        author_data = json.loads(response.text)
-        item['author'] = author_data.get('name', 'Unknown')  # Assign the author's name, or 'Unknown' if unavailable
-        print(item["title"])
-        yield item  # Yield the complete item with the author's name included
+                # Get the article's category
+                class_list = post.get('class_list', {})
+                topic = None
+                if isinstance(class_list, dict):
+                    topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
+
+                # Prepare the item
+                item = CuestiondepolemicaItem()
+                item['date'] = post.get('date')
+                item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
+                item['text'] = remove_tags(content)
+                item['topic'] = topic
+                item['url'] = post.get('link')
+
+                # Author link
+                author_link = post.get('_links', {}).get('author', [{}])[0].get('href', None)
+                if author_link:
+                    yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
+                else:
+                    item['author'] = 'Unknown'
+                    yield item
+            except Exception as e:
+                self.logger.error(f"Error processing post {post.get('id')}: {e}")
+                continue
+
+    def parse_author(self, response):
+        """Processes the author information for an article."""
+        try:
+            item = response.meta['item']  # Retrieve the item passed through meta
+            author_data = json.loads(response.text)
+            item['author'] = author_data.get('name', 'Unknown')
+            yield item  # Yield the complete item with the author's name included
+        except Exception as e:
+            self.logger.error(f"Failed to parse author data: {e}")
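Assuming the standard Scrapy entry points (nothing in this commit changes how the spider is launched), the date arguments handled in __init__ could be supplied from the command line, e.g. scrapy crawl noticias -a year=2025 -a month=1 -a day=20, or programmatically, roughly as sketched below. The import path and working directory are assumptions based on the file path shown above:

# Minimal sketch using Scrapy's CrawlerProcess; run from the project directory
# so that cuestionDePolemica is importable. Arguments are passed as strings so
# that the zfill(2) calls in __init__ work as written.
from scrapy.crawler import CrawlerProcess
from cuestionDePolemica.spiders.noticias import NoticiasSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(NoticiasSpider, year="2025", month="1", day="20")
process.start()  # blocks until the crawl finishes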