m3 / crawlersNoticias / Commits / d895ebe3

Commit d895ebe3
Authored 7 years ago by Renán Sosa Guillen

    crawlers

Parent: 3885bd5c

Showing 5 changed files with 274 additions and 272 deletions (+274, -272)
descarga_por_dia/alChile/alChile/spiders/noticias.py            +62  -62
...a_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.py    +75  -73
..._por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.pyc    +0   -0
descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.py    +66  -66
descarga_por_dia/laVerdadYuc/laVerdadYuc/spiders/noticias.py    +71  -71
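All five spiders share the command-line contract shown in the docstring at the top of each file: the crawl date is passed with -a arguments. As a minimal sketch of the mechanism these files rely on (the spider name and URL here are placeholders, not from this commit): Scrapy sets each -a key=value pair as a string attribute on the spider instance, which start_requests() reads back with getattr.

    import scrapy

    class DateDemoSpider(scrapy.Spider):
        name = "date_demo"  # hypothetical name, for illustration only

        def start_requests(self):
            # These attributes exist only if passed on the command line, e.g.:
            #   scrapy crawl date_demo -a year=2017 -a month=3 -a day=22
            year = getattr(self, 'year', None)
            month = getattr(self, 'month', None)
            day = getattr(self, 'day', None)
            # The values arrive as strings, which is why the spiders below
            # concatenate them straight into the archive URL.
            url = 'http://example.com/' + year + '/' + month + '/' + day
            yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):
            self.logger.info('fetched %s', response.url)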
descarga_por_dia/alChile/alChile/spiders/noticias.py
import scrapy, re

"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://alchile.com.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        pagination = response.css('div.page-nav').css('a.last::attr(href)').extract()
        if (len(pagination) > 0):
            pagination = pagination[0].strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])
            for page in range(0, pages):
                if (page == 0):
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        for link in response.css('div.td-block-span6').css('h3.entry-title').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['title'] = response.css('header.td-post-title').css('h1.entry-title::text').extract_first()
        d = response.css('span.td-post-date').css('time.entry-date::attr(datetime)').extract_first()
        ## '-06:00' corresponds to UTC-6, the time zone of Yucatan (central Mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['topic'] = response.css('div.td-post-header').css('a::text').extract_first()
        for paragraph in response.css('div.td-post-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
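A quick illustration of the timestamp fix in parse_item above, with an invented WordPress-style datetime. Note it only relabels the offset suffix; it does not convert the clock time, which presumably suits a site that publishes local times under a wrong or missing offset.

    # Illustration only; the sample timestamp is invented.
    d = '2017-03-22T10:15:00+00:00'
    if d[-6:] != '-06:00':         # the last six chars hold the UTC offset
        d = d[:-6] + '-06:00'      # rewrite the offset, keep the clock time
    print(d)                       # -> 2017-03-22T10:15:00-06:00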
descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.py
import scrapy, re
from datetime import datetime, timedelta, tzinfo

"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Class for the 'time zone'."""

    def utcoffset(self, dt):
        # time zone for Yucatan (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        # name of the time zone
        return 'UTC-6'


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.baseURL = 'http://www.desdeelbalcon.com/' + self.year + '/' + self.month + '/' + self.day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])
            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        for post in response.xpath('//ul[@class="archivepost"]/li'):
            # create a fresh item per post; the original built a single item
            # outside the loop, so all pending callbacks shared (and
            # overwrote) the same date and topic
            item = NoticiasItem()
            # item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
            item['date'] = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat('T')
            item['topic'] = post.xpath('./p/a/text()').extract()
            request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        text = ''
        item = response.meta['item']
        item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
        for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
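The UTC helper above implements only utcoffset() and tzname(); that is enough for datetime.isoformat(), which asks the tzinfo for the offset but never for dst(). A standalone sketch of what parse_page builds for item['date'], with invented date values:

    from datetime import datetime, timedelta, tzinfo

    class UTC(tzinfo):
        def utcoffset(self, dt):
            return timedelta(hours=-6)
        def tzname(self, dt):
            return 'UTC-6'

    stamp = datetime(2017, 3, 22, tzinfo=UTC()).isoformat('T')
    print(stamp)  # -> 2017-03-22T00:00:00-06:00

On Python 3, the stdlib datetime.timezone(timedelta(hours=-6)) would serve the same purpose without a hand-rolled class.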
descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.pyc (binary; no preview for this file type)
descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.py
@@ -6,81 +6,81 @@ from datetime import datetime, timedelta, tzinfo

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """Class for the 'time zone'."""

    def utcoffset(self, dt):
        # time zone for Sonora (Pacific time): UTC-7
        return timedelta(hours=-7)

    def tzname(self, dt):
        # name of the time zone
        return 'UTC-7'


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        tz = UTC()
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
        self.baseURL = 'http://diariodelyaqui.mx/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
        if (len(pagination) > 0):
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])
            for page in range(0, pages):
                if (page == 0):
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        for link in response.xpath('//h2[@class="entry-title"]/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        item['date'] = self.date
        item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
        item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
        for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
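The page-count extraction in parse() above leans on the WordPress convention that pagination links end in .../page/N: strip the trailing slash, then read the number after the final '/'. A minimal sketch with an invented URL:

    # Invented example URL; the real one comes from the page-numbers links.
    pagination = 'http://diariodelyaqui.mx/2017/3/22/page/7/'
    pagination = pagination.strip('/')                   # drop the trailing slash
    pages = int(pagination[pagination.rfind('/') + 1:])  # text after the last '/'
    print(pages)  # -> 7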
descarga_por_dia/laVerdadYuc/laVerdadYuc/spiders/noticias.py
@@ -6,16 +6,16 @@ scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

# class QuotesSpider(scrapy.Spider):
#     name = "noticias"

@@ -65,66 +65,66 @@ class NoticiasItem(scrapy.Item):
#         yield item

class QuotesSpider(scrapy.Spider):
    # handle_httpstatus_list = [404]
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://laverdadnoticias.com/' + year + '/' + month + '/' + day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
        # while not self.stop:
        # #     for page in range(0, 50):
        #     if page == 0:
        #         yield scrapy.Request(url=self.baseURL, callback=self.parse, errback=self.errback_http)
        #     elif page > 0:
        #         yield scrapy.Request(url=self.baseURL+'/page/'+str(page), callback=self.parse, errback=self.errback_http)
        #     page += 1

    def parse(self, response):
        # if response.status == 404:
        #     print('**********hey, 404! TRUE!!!')
        #     self.stop = True
        # else:
        linkList = response.xpath('//*[@class="two_third post_header"]/h5/a/@href').extract()
        linkList.extend(response.xpath('//*[@class="post_header_title two_third last"]/h5/a/@href').extract())
        linkList.extend(response.xpath('//*[@class="post_header_title one"]/h5/a/@href').extract())
        for link in linkList:
            yield scrapy.Request(url=link, callback=self.parse_item)
        nextPage = response.xpath('//*[@class="pagination"]/a/@href').extract()[-1]
        yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        if d is None or d == '':
            d = response.xpath('//meta[@property="DC.date.issued"]/@content').extract_first()
        ## '-06:00' corresponds to UTC-6, the time zone of Yucatan (central Mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d
        item['title'] = response.xpath('//*[@class="page_title_inner"]/h1/text()').extract_first()
        item['topic'] = response.xpath('//*[@class="post_info_cat"]/a/text()').extract_first()
        paragraph = response.xpath('//*[@class="post_content_wrapper"]/p').extract()
        paragraph.extend(response.xpath('//*[@title="Page 1"]/div/p').extract())
        paragraph.extend(response.xpath('//*[@class="text_exposed_root text_exposed"]/p').extract())
        for p in paragraph:
            text += remove_tags(p) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
\ No newline at end of file
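Unlike the other spiders, this one takes its date from the page's meta tags with a fallback chain: article:published_time first, then DC.date.issued, then the same offset rewrite as alChile. A small standalone sketch of that logic with invented values (pick_date is a hypothetical helper, not from the commit):

    def pick_date(published_time, dc_issued):
        # Prefer article:published_time; fall back to DC.date.issued.
        d = published_time
        if d is None or d == '':
            d = dc_issued
        # Force the Yucatan offset, as parse_item above does.
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        return d

    print(pick_date(None, '2017-03-22T09:00:00+00:00'))  # -> 2017-03-22T09:00:00-06:00

If both meta tags are missing, d stays None and the slice d[-6:] raises a TypeError; the spider as committed does not guard against that.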