m3 / crawlersNoticias / Commits / c14dec2a

Commit c14dec2a authored Feb 17, 2025 by umorales
capitalEdomex

parent 1f690aab
Showing 1 changed file with 12 additions and 5 deletions
spiders/daily/capitalEstadoDeMexico/capitalEstadoDeMexico/spiders/noticias.py · view file @ c14dec2a
@@ -28,6 +28,7 @@ def remove_tags(text):
 class NoticiasSpider(scrapy.Spider):
     name = "noticias"
     allowed_domains = ["www.capitaledomex.com.mx"]
     def __init__(self, year=None, month=None, day=None, *args, **kwargs):
         super(NoticiasSpider, self).__init__(*args, **kwargs)
         self.year = year
@@ -35,7 +36,7 @@ class NoticiasSpider(scrapy.Spider):
         self.day = day.zfill(2) if day else None
         if self.year and self.month and self.day:
             self.start_urls = [
-                f"{allowed_domains[0]}/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
+                f"https://{self.allowed_domains[0]}/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
             ]

     def parse(self, response):
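The visible change in this hunk adds the https:// scheme (and the self. prefix on allowed_domains) to the generated start URL; without a scheme, Scrapy cannot turn the string into a valid request. As a worked example, with year=2025, month=02 and day=17 (illustrative values, not taken from the commit) the f-string above produces:

https://www.capitaledomex.com.mx/wp-json/wp/v2/posts?after=2025-02-17T00:00:00&before=2025-02-17T23:59:59&per_page=100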
@@ -43,14 +44,20 @@ class NoticiasSpider(scrapy.Spider):
         Procesa una respuesta de la API de WordPress y devuelve los posts que
         contienen contenido no vacío. Crea un item con los campos 'date', 'title',
         'text', 'author', 'topic' y 'url' y lo devuelve como un objeto de tipo
-        capitalestadodemexicoItem.
+        CapitalestadodemexicoItem.
         """
-        data = json.loads(response.text)
+        try:
+            data = json.loads(response.text)
+        except json.JSONDecodeError as e:
+            self.logger.error(f"Error al decodificar JSON: {e}")
+            self.logger.error(f"Contenido recibido: {response.text[:500]}")  # Muestra los primeros 500 caracteres
+            return

         for post in data:
             # Validar que el contenido no esté vacío
             content = post.get('content', {}).get('rendered', '').strip()
             if not content:
-                self.logger.info(f"Post {post.get('id')} skipped due to empty content.")
+                self.logger.info(f"Post {post.get('id')} omitido debido a contenido vacío.")
                 continue
             # Crear un item con los campos requeridos
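For reference, the /wp-json/wp/v2/posts endpoint queried above returns a JSON array of post objects, which is why parse() iterates over data directly. A minimal sketch of the access pattern, using a made-up post object (field values are illustrative, not taken from the site):

post = {
    "id": 12345,                                        # hypothetical post ID
    "date": "2025-02-17T10:30:00",
    "link": "https://www.capitaledomex.com.mx/ejemplo/",
    "title": {"rendered": "Example title"},
    "content": {"rendered": "<p>Example body</p>", "protected": False},
}

# Same chained lookups as the spider: the {} and '' defaults avoid a KeyError
# when 'content' or 'rendered' is missing, and strip() turns a whitespace-only
# body into an empty string so that post is skipped.
content = post.get("content", {}).get("rendered", "").strip()
print(bool(content))  # True here; an empty body would hit the `continue` branch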
@@ -84,4 +91,4 @@ class NoticiasSpider(scrapy.Spider):
         article_section = schema_graph[5].get('articleSection', [])
         if isinstance(article_section, list) and article_section:
             return article_section[0]  # Devuelve el primer elemento si existe
         return "Sin tema"
\ No newline at end of file
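For completeness, a hypothetical way to run the spider with the date arguments consumed by __init__ above. The import path is assumed from the file location shown in this commit and the output feed is illustrative; the project may use a different runner.

# Assumed to run from inside the Scrapy project; names mirror the file path above.
from scrapy.crawler import CrawlerProcess
from capitalEstadoDeMexico.spiders.noticias import NoticiasSpider

process = CrawlerProcess(settings={"FEEDS": {"noticias.json": {"format": "json"}}})
# year/month/day end up in the WordPress API query built in __init__
process.crawl(NoticiasSpider, year="2025", month="02", day="17")
process.start()

The same arguments can also be passed on the command line with: scrapy crawl noticias -a year=2025 -a month=02 -a day=17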