Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
crawlersNoticias
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
4
Issues
4
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
m3
crawlersNoticias
Commits
3ea10cf1
Commit
3ea10cf1
authored
5 months ago
by
Mario Chirinos
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
heraldo leon
parent
43fbb106
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
23 deletions
+25
-23
noticias.py
spiders/daily/adNoticas/adNoticas/spiders/noticias.py
+5
-5
settings.py
spiders/daily/heraldoLeon/heraldoLeon/settings.py
+6
-3
noticias.py
spiders/daily/heraldoLeon/heraldoLeon/spiders/noticias.py
+14
-15
No files found.
spiders/daily/adNoticas/adNoticas/spiders/noticias.py
View file @
3ea10cf1
...
...
@@ -20,12 +20,12 @@ class NoticiasSpider(scrapy.Spider):
start_urls
=
[
'http://adnoticias.mx/'
]
#-----------------------------------------------------------------------
def start_requests(self):
    """Request the adnoticias.mx archive page for one calendar day.

    Reads the ``year``, ``month`` and ``day`` attributes of the spider
    (presumably supplied as spider arguments -- confirm against how the
    crawl is launched), stores them back on the instance as strings and
    derives:

    * ``self.date``    -- ``YYYY-MM-DD``, month and day zero-padded
    * ``self.baseURL`` -- ``https://adnoticias.mx/YYYY/MM/DD/``

    Yields:
        One ``scrapy.Request`` for ``self.baseURL``; its response is
        handled by ``self.parseList``.

    Raises:
        ValueError: if any of ``year``, ``month`` or ``day`` is not set.
    """
    parts = {name: getattr(self, name, None) for name in ("year", "month", "day")}
    missing = [name for name in ("year", "month", "day") if parts[name] is None]
    if missing:
        # Fail with a readable message instead of "NoneType + str".
        raise ValueError("missing spider argument(s): " + ", ".join(missing))

    # str() keeps string input untouched and also accepts integers.
    self.year = str(parts["year"])
    self.month = str(parts["month"])
    self.day = str(parts["day"])

    padded = (self.year, self.month.zfill(2), self.day.zfill(2))
    self.date = "-".join(padded)
    self.baseURL = "https://adnoticias.mx/" + "/".join(padded) + "/"

    yield scrapy.Request(url=self.baseURL, callback=self.parseList)
#-----------------------------------------------------------------------
def
parseList
(
self
,
response
):
...
...
This diff is collapsed.
Click to expand it.
spiders/daily/heraldoLeon/heraldoLeon/settings.py
View file @
3ea10cf1
...
...
@@ -17,7 +17,7 @@ FEED_EXPORT_ENCODING="utf-8"
#USER_AGENT = 'heraldoLeon (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): robots.txt enforcement is switched off; the previous value
# was True. Confirm that crawling this site without it is acceptable.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
...
...
@@ -25,7 +25,7 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# Delay applied between requests to the same website (Scrapy's default is 0).
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
...
...
@@ -41,7 +41,10 @@ ROBOTSTXT_OBEY = True
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Default headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    # Identify as a desktop Chrome browser rather than the Scrapy default.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
...
...
This diff is collapsed.
Click to expand it.
spiders/daily/heraldoLeon/heraldoLeon/spiders/noticias.py
View file @
3ea10cf1
...
...
@@ -16,15 +16,14 @@ def remove_tags(text):
class
NoticiasSpider
(
scrapy
.
Spider
):
name
=
'noticias'
allowed_domains
=
[
'heraldoleon.mx'
]
start_urls
=
[
'http://heraldoleon.mx/'
]
start_urls
=
[
'http
s
://heraldoleon.mx/'
]
def start_requests(self):
    """Request the heraldoleon.mx archive page for one calendar day.

    Reads the ``year``, ``month`` and ``day`` attributes of the spider
    (presumably supplied as spider arguments -- confirm against how the
    crawl is launched), stores them back on the instance as strings and
    sets ``self.baseURL`` to ``https://www.heraldoleon.mx/YYYY/MM/DD/``
    with month and day zero-padded to two digits.

    Yields:
        One ``scrapy.Request`` for ``self.baseURL``; its response is
        handled by ``self.parse``.

    Raises:
        ValueError: if any of ``year``, ``month`` or ``day`` is not set.
    """
    parts = {name: getattr(self, name, None) for name in ("year", "month", "day")}
    missing = [name for name in ("year", "month", "day") if parts[name] is None]
    if missing:
        # Fail with a readable message instead of "NoneType + str".
        raise ValueError("missing spider argument(s): " + ", ".join(missing))

    # str() keeps string input untouched and also accepts integers.
    self.year = str(parts["year"])
    self.month = str(parts["month"])
    self.day = str(parts["day"])

    path = "/".join((self.year, self.month.zfill(2), self.day.zfill(2)))
    self.baseURL = "https://www.heraldoleon.mx/" + path + "/"

    yield scrapy.Request(url=self.baseURL, callback=self.parse)
#-----------------------------------------------------------------------
def
parse
(
self
,
response
):
...
...
@@ -32,8 +31,8 @@ class NoticiasSpider(scrapy.Spider):
for
link
in
response
.
xpath
(
'//h3[@class="entry-title td-module-title"]/a/@href'
)
.
extract
():
yield
scrapy
.
Request
(
url
=
link
,
callback
=
self
.
parse_item
)
next_page
=
response
.
xpath
(
'//div[@class="page-nav td-pb-padding-side"]/a/i[@class="td-icon-menu-right"]/../@href'
)
.
extract_first
()
next_page
=
response
.
xpath
(
'//div[contains(@class,"page-nav")]/a[@aria-label="next-page"]/@href'
)
.
extract_first
()
#
next_page = response.xpath('//div[@class="page-nav td-pb-padding-side"]/a/i[@class="td-icon-menu-right"]/../@href').extract_first()
print
(
"nextPage"
,
next_page
)
if
next_page
is
not
None
:
yield
scrapy
.
Request
(
url
=
next_page
,
callback
=
self
.
parse
)
...
...
@@ -42,18 +41,18 @@ class NoticiasSpider(scrapy.Spider):
# print(response.url)
item
=
HeraldoleonItem
()
item
[
'date'
]
=
response
.
xpath
(
"//meta[@property='article:published_time']/@content"
)
.
extract_first
()
item
[
'title'
]
=
response
.
xpath
(
'//h1[@class="entry-title"]/text()'
)
.
extract_first
()
# item['title'] = response.xpath("//meta[@property='og:title']/@content").extract_first()
item
[
'title'
]
=
response
.
xpath
(
'//h1[@class="tdb-title-text"]/text()'
)
.
extract_first
()
text
=
""
for
p
in
response
.
xpath
(
'//div[@class="td-post-content"]/p/text()'
)
.
extract
():
nt
=
remove_tags
(
p
)
.
replace
(
"
\n
"
,
""
)
.
replace
(
"
\r
"
,
""
)
.
strip
()
text
+=
nt
if
len
(
nt
)
>
0
:
text
+=
"
\n
"
for
p
in
response
.
xpath
(
'//div[contains(@class, "td-post-content")]/div/p'
)
.
extract
():
tt
=
remove_tags
(
p
)
+
"
\n
"
text
+=
tt
item
[
'text'
]
=
text
.
strip
()
item
[
'topic'
]
=
", "
.
join
(
response
.
xpath
(
'//ul[@class="td-tags td-post-small-box clearfix"]/li/a/text()'
)
.
extract
()
)
item
[
'topic'
]
=
response
.
xpath
(
'//ul[@class="tdb-tags"]/li/a/text()'
)
.
extract
(
)
item
[
'url'
]
=
response
.
url
item
[
"author"
]
=
", "
.
join
(
response
.
xpath
(
'//div[
@class="td-post-source-via "
]/div/a/text()'
)
.
extract
())
item
[
"location"
]
=
""
print
(
self
.
allowed_domains
,
item
[
"title"
])
item
[
"author"
]
=
", "
.
join
(
response
.
xpath
(
'//div[
contains(@class, "tdb_single_via ")
]/div/a/text()'
)
.
extract
())
#
item["location"]=""
print
(
item
[
"title"
])
yield
item
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment