Project: m3 / crawlersNoticias
Commits
Commit a9862377
authored Mar 24, 2025 by Ulises Morales Ramírez
reviviendo unoMasuno
parent 753e5c57
Showing 11 changed files with 99 additions and 183 deletions.
spiders/daily/BROKEN/unoMasUno/unoMasUno/pipelines.py           +0   -75
spiders/daily/BROKEN/unoMasUno/unoMasUno/spiders/noticias.py    +0   -103
spiders/daily/unoMasUno/2025-03-18.json                         +16  -0
spiders/daily/unoMasUno/scrapy.cfg                              +0   -0
spiders/daily/unoMasUno/unoMasUno/__init__.py                   +0   -0
spiders/daily/unoMasUno/unoMasUno/items.py                      +4   -3
spiders/daily/unoMasUno/unoMasUno/middlewares.py                +0   -0
spiders/daily/unoMasUno/unoMasUno/pipelines.py                  +14  -0
spiders/daily/unoMasUno/unoMasUno/settings.py                   +3   -2
spiders/daily/unoMasUno/unoMasUno/spiders/__init__.py           +0   -0
spiders/daily/unoMasUno/unoMasUno/spiders/noticias.py           +62  -0
spiders/daily/BROKEN/unoMasUno/unoMasUno/pipelines.py (deleted, 100644 → 0)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
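This pipeline only did anything when it was registered in the project settings and handed a filename; per the usage string in the old spider below, the filename arrived via -s filename=... on the command line. A minimal sketch of that wiring, assuming a standard ITEM_PIPELINES entry for this project (the entry itself is not part of this diff):

# settings.py (sketch; the actual entry is not shown in this diff)
ITEM_PIPELINES = {
    'unoMasUno.pipelines.JsonWriterPipeline': 300,
}
# Invoked as in the old spider's usage string:
# scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22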
spiders/daily/BROKEN/unoMasUno/unoMasUno/spiders/noticias.py (deleted, 100644 → 0)
# -*- coding: utf-8 -*-
import scrapy, re, json
from unoMasUno.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

"""
OUTLET:
Uno Más Uno, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-09-22.json -a year=2017 -a month=9 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Time zone helper class."""

    def utcoffset(self, dt):
        # time zone for Hidalgo (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        # name of the time zone
        return 'UTC-6'


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)

        self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
                            'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                            'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

        self.baseURL = "http://www.unomasuno.com.mx/" + self.year + "/" + self.month + "/" + self.day

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.xpath('//*[@class="pagination"]/a[@class="last"]/@href').extract_first()
        if pagination is None:
            pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
            if len(pagination) > 0:
                pagination = pagination[-1].strip('/')
                pages = int(pagination[pagination.rfind('/') + 1:])
                for page in range(1, pages):
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
        else:
            pagination = pagination.strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])
            for page in range(1, pages):
                yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="post-box-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        try:
            jsonInfo = response.xpath('//script[@type="application/ld+json"]').extract_first()
            jsonInfo = json.loads(remove_tags(jsonInfo))
            dat = jsonInfo['datePublished']
        except:
            try:
                d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
                d = d.replace(',', '').split(' ')
                dat = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]),
                               tzinfo=self.tz).isoformat("T")
            except:
                dat = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        item['date'] = dat
        item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
        item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()

        for p in response.xpath('//*[@class="entry"]/p').extract():
            text += remove_tags(p) + '\n'
        item['text'] = text

        item['url'] = response.url

        # print item['title']
        yield item
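The fallback date handling above converts a Spanish post-meta string into an ISO-8601 timestamp at UTC-6. A self-contained sketch of that step, reusing the date_parser mapping and UTC class from the spider (the input string is hypothetical):

from datetime import datetime, timedelta, tzinfo

date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
               'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

class UTC(tzinfo):
    def utcoffset(self, dt):
        return timedelta(hours=-6)   # central Mexico: UTC-6
    def tzname(self, dt):
        return 'UTC-6'

d = "24 marzo, 2025".replace(',', '').split(' ')   # hypothetical post-meta text
dat = datetime(int(d[2]), date_parser[d[1].lower()], int(d[0]), tzinfo=UTC()).isoformat("T")
print(dat)   # 2025-03-24T00:00:00-06:00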
spiders/daily/unoMasUno/2025-03-18.json (new file, 0 → 100644; diff collapsed)
spiders/daily/BROKEN/unoMasUno/scrapy.cfg → spiders/daily/unoMasUno/scrapy.cfg (file moved)
spiders/daily/BROKEN/unoMasUno/unoMasUno/__init__.py → spiders/daily/unoMasUno/unoMasUno/__init__.py (file moved)
spiders/daily/BROKEN/unoMasUno/unoMasUno/items.py → spiders/daily/unoMasUno/unoMasUno/items.py (file moved, modified)
@@ -9,12 +9,13 @@ import scrapy
 class NoticiasItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    date = scrapy.Field()
     title = scrapy.Field()
     text = scrapy.Field()
-    date = scrapy.Field()
     location = scrapy.Field()
     author = scrapy.Field()
     topic = scrapy.Field()
     url = scrapy.Field()
-    # define the fields for your item here like:
-    # name = scrapy.Field()
+    pass
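NoticiasItem is a plain field container; every field is optional at scrape time, which is presumably why the old pipeline wraps each access in try/except and the new spider uses .get(). A tiny usage sketch (values are hypothetical):

from unoMasUno.items import NoticiasItem

item = NoticiasItem()
item['title'] = "Titular de ejemplo"              # hypothetical value
item['url'] = "https://unomasuno.com.mx/ejemplo"  # hypothetical value
print(dict(item))   # scrapy.Item supports dict() conversion: {'title': ..., 'url': ...}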
spiders/daily/BROKEN/unoMasUno/unoMasUno/middlewares.py → spiders/daily/unoMasUno/unoMasUno/middlewares.py (file moved)
spiders/daily/unoMasUno/unoMasUno/pipelines.py (new file, 0 → 100644)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def process_item(self, item, spider):
        return item
\ No newline at end of file
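The new pipeline is a pass-through and no longer writes JSON itself, and the diff does not show how spiders/daily/unoMasUno/2025-03-18.json was generated. One plausible route, assuming Scrapy's built-in feed exports rather than a custom pipeline, is sketched below:

# Hedged sketch (settings.py); not confirmed by this diff.
FEED_EXPORT_ENCODING = "utf-8"                    # already present in settings.py below
FEEDS = {"2025-03-18.json": {"format": "json"}}   # assumption; equivalent to passing -o 2025-03-18.json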
spiders/daily/BROKEN/unoMasUno/unoMasUno/settings.py → spiders/daily/unoMasUno/unoMasUno/settings.py (file moved, modified)
@@ -15,11 +15,12 @@ SPIDER_MODULES = ['unoMasUno.spiders']
 NEWSPIDER_MODULE = 'unoMasUno.spiders'

 FEED_EXPORT_ENCODING = "utf-8"

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'unoMasUno (+http://www.yourdomain.com)'
+#USER_AGENT = "planaMayor (+http://www.yourdomain.com)"

 # Obey robots.txt rules
-# ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
spiders/daily/BROKEN/unoMasUno/unoMasUno/spiders/__init__.py → spiders/daily/unoMasUno/unoMasUno/spiders/__init__.py (file moved)
spiders/daily/unoMasUno/unoMasUno/spiders/noticias.py (new file, 0 → 100644)
import scrapy
import json
import re

from unoMasUno.items import NoticiasItem

# Regular expression to strip HTML tags
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    if not isinstance(text, str):
        return text  # Return the original value if it is not a string
    return TAG_RE.sub('', text)


class NoticiasSpider(scrapy.Spider):
    name = "noticias"
    allowed_domains = ["unomasuno.com.mx"]
    start_urls = ["https://unomasuno.com.mx/"]

    def __init__(self, year=None, month=None, day=None, *args, **kwargs):
        super(NoticiasSpider, self).__init__(*args, **kwargs)
        self.year = year
        self.month = month.zfill(2) if month else None
        self.day = day.zfill(2) if day else None

        if self.year and self.month and self.day:
            self.start_urls = [
                f"https://unomasuno.com.mx/wp-json/wp/v2/posts?after={self.year}-{self.month}-{self.day}T00:00:00&before={self.year}-{self.month}-{self.day}T23:59:59&per_page=100"
            ]
            print(self.start_urls[0])

    def parse(self, response):
        try:
            # Try to decode the JSON response
            data = json.loads(response.text)
            self.logger.info(f"Received {len(data)} posts from API.")
        except json.JSONDecodeError as e:
            # If the JSON cannot be decoded, log the error and stop processing
            self.logger.error(f"Failed to parse JSON: {e}")
            self.logger.error(f"Response content: {response.text[:500]}...")  # Log the first 500 characters of the response
            return

        for post in data:
            try:
                content = post.get('content', {}).get('rendered', '').strip()

                if content:
                    class_list = post.get('class_list', {})
                    topic = None
                    if isinstance(class_list, dict):
                        topic = class_list.get('7', '').split("category-")[1] if '7' in class_list else None

                    # Build the item
                    item = NoticiasItem()
                    item['date'] = post.get('date')
                    item['title'] = remove_tags(post.get('title', {}).get('rendered', ''))
                    item['text'] = remove_tags(content)
                    item['topic'] = topic
                    item['url'] = post.get('link')
                    print(item['title'])
                    yield item
            except Exception as e:
                # If a post fails to process, log the error and continue with the next one
                self.logger.error(f"Error processing post {post.get('id')}: {e}")
                continue
\ No newline at end of file
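The rewritten spider queries the WordPress REST API for a single day instead of crawling archive pages. A quick, hedged way to sanity-check the URL it builds, using the requests library (not a dependency of this project):

import requests

year, month, day = "2025", "03", "18"
url = (f"https://unomasuno.com.mx/wp-json/wp/v2/posts"
       f"?after={year}-{month}-{day}T00:00:00&before={year}-{month}-{day}T23:59:59&per_page=100")

resp = requests.get(url, timeout=30)
posts = resp.json()   # the same JSON payload the spider's parse() decodes
print(len(posts), "posts")
if posts:
    print(posts[0]["title"]["rendered"])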