m3 / crawlersNoticias · Commits

Commit 9fded341
authored 3 months ago by umorales
Commit message: "Changes to cuestionDePolemica"
Parent: 8b9f5b5f

Showing 1 changed file with 54 additions and 34 deletions:

spiders/daily/cuestionDePolemica/cuestionDePolemica/spiders/noticias.py (+54, -34)
@@ -7,7 +7,7 @@ from cuestionDePolemica.items import CuestiondepolemicaItem
 
 TAG_RE = re.compile(r'<[^>]+>')
 
 def remove_tags(text):
     """Strip HTML tags from the text."""
     if not isinstance(text, str):
         return text  # Return the original value if it is not a string
     return TAG_RE.sub('', text)
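As a quick sanity check on the helper in the hunk above, the snippet below exercises TAG_RE and remove_tags in isolation; the sample strings are made up for illustration and are not taken from the site.

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Strip HTML tags; pass non-string values through unchanged."""
    if not isinstance(text, str):
        return text
    return TAG_RE.sub('', text)

print(remove_tags('<p>Breaking <b>news</b></p>'))  # -> 'Breaking news'
print(remove_tags(None))                           # -> None (non-strings are returned as-is)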
@@ -15,50 +15,70 @@ def remove_tags(text):
 
 class NoticiasSpider(scrapy.Spider):
     name = "noticias"
     allowed_domains = ["www.cuestiondepolemica.com"]
 
     def __init__(self, year=None, month=None, day=None, *args, **kwargs):
         super(NoticiasSpider, self).__init__(*args, **kwargs)
         self.year = year
         self.month = month.zfill(2) if month else None
         self.day = day.zfill(2) if day else None
 
         if self.year and self.month and self.day:
             self.start_urls = [
-                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
+                f"https://www.cuestiondepolemica.com/wp-json/wp/v2/posts?"
+                f"after={self.year}-{self.month}-{self.day}T00:00:00&"
+                f"before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
             ]
         else:
             self.logger.error("Year, month, and day must be provided to generate start_urls.")
             self.start_urls = []
 
     def parse(self, response):
-        data = json.loads(response.text)
+        try:
+            data = json.loads(response.text)
+            self.logger.info(f"Received {len(data)} posts from API.")
+        except json.JSONDecodeError as e:
+            self.logger.error(f"Failed to parse JSON: {e}")
+            return
 
         for post in data:
-            # Validate that the content is not empty
-            content = post.get('content', {}).get('rendered', '').strip()
-            if not content:
-                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
-                continue
-
-            # Take the seventh element of class_list if it exists
-            class_list = post.get('class_list', [])
-            topic = class_list['7'] if len(class_list) > 7 else None
-
-            # Get the author link
-            author_link = post.get('_links', {}).get('author', [{}])[0].get('href')
-
-            # Build an item with the required fields
-            item = CuestiondepolemicaItem()
-            item['date'] = post.get('date')
-            item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
-            item['text'] = remove_tags(content)
-            item['topic'] = topic.split("category-")[1]
-            item['url'] = post.get('link')
-
-            if author_link:
-                # Make an extra request to fetch the author's name
-                yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
-            else:
-                yield item  # If there is no author URL, yield the item without an author
-
-    def parse_author(self, response):
-        item = response.meta['item']  # Retrieve the item passed through meta
-        author_data = json.loads(response.text)
-        item['author'] = author_data.get('name', 'Unknown')  # Assign the author's name, or 'Unknown' if unavailable
-        print(item["title"])
-        yield item  # Yield the complete item with the author's name included
+            try:
+                # Validate the content
+                content = post.get('content', {}).get('rendered', '').strip()
+                if not remove_tags(content):
+                    self.logger.warning(f"Skipped post {post.get('id')}: No meaningful content.")
+                    continue
+
+                # Get the article category
+                class_list = post.get('class_list', {})
+                topic = None
+                if isinstance(class_list, dict):
+                    topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None
+
+                # Prepare the item
+                item = CuestiondepolemicaItem()
+                item['date'] = post.get('date')
+                item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
+                item['text'] = remove_tags(content)
+                item['topic'] = topic
+                item['url'] = post.get('link')
+
+                # Author link
+                author_link = post.get('_links', {}).get('author', [{}])[0].get('href', None)
+
+                if author_link:
+                    yield scrapy.Request(url=author_link, callback=self.parse_author, meta={'item': item})
+                else:
+                    item['author'] = 'Unknown'
+                    yield item
+            except Exception as e:
+                self.logger.error(f"Error processing post {post.get('id')}: {e}")
+                continue
+
+    def parse_author(self, response):
+        """Process the author information for an article."""
+        try:
+            item = response.meta['item']  # Retrieve the item passed through meta
+            author_data = json.loads(response.text)
+            item['author'] = author_data.get('name', 'Unknown')
+            yield item  # Yield the complete item with the author's name included
+        except Exception as e:
+            self.logger.error(f"Failed to parse author data: {e}")
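For context on how the spider's date arguments are supplied: they are normally passed on the command line (e.g. scrapy crawl noticias -a year=2025 -a month=3 -a day=7, with placeholder dates). The sketch below is the equivalent programmatic launch; it assumes it runs from inside the Scrapy project so get_project_settings() can locate the spider, and it is an illustration rather than part of the commit.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Placeholder date values; single-digit month/day are fine because the
# spider zero-pads them with zfill(2) before building the WP REST API URL.
process = CrawlerProcess(get_project_settings())
process.crawl("noticias", year="2025", month="3", day="7")
process.start()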