m3 / crawlersNoticias · Commits

Commit 11c4aa01
authored Aug 25, 2017 by Renán Sosa Guillen
parent f1dfa7e9

    crawlers

Showing 3 changed files with 348 additions and 165 deletions:

    noticias.py    (descarga_por_dia/laJornada/laJornada/spiders/noticias.py)    +348  -28
    noticias.pyc   (descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc)     +0   -0
    noticias.json  (descarga_por_dia/laJornada/noticias.json)                      +0  -137
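The spider in the diff below fills five fields on NoticiasItem: date, topic, title, text and url. The item class itself is defined above the first hunk and is untouched by this commit; a minimal definition consistent with the fields used would look like this (a sketch, not the committed code):

    import scrapy

    class NoticiasItem(scrapy.Item):
        # Fields populated by the spider callbacks in the diff below.
        date = scrapy.Field()
        title = scrapy.Field()
        text = scrapy.Field()
        topic = scrapy.Field()
        url = scrapy.Field()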
descarga_por_dia/laJornada/laJornada/spiders/noticias.py  (view file @ 11c4aa01)

...
@@ -24,27 +24,213 @@ class NoticiasItem(scrapy.Item):
 class QuotesSpider(scrapy.Spider):
     name = "noticias"

     def start_requests(self):
-        self.year = getattr(self, 'year', None)
-        self.month = getattr(self, 'month', None)
-        self.day = getattr(self, 'day', None)
-        self.baseURL = 'http://www.jornada.unam.mx/'+self.year+'/'+self.month.zfill(2)+'/'+self.day.zfill(2)+'/'
-        comparison_date = date(2009, 2, 15)
-        requested_date = date(int(self.year), int(self.month), int(self.day))
-        section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
-                        'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
-        for section in section_list:
-            # for 2009/02/15 and earlier the page has a specific structure;
-            # for dates after that the structure changes
-            if ( requested_date <= comparison_date ):
-                yield scrapy.Request(url=self.baseURL+section, callback=self.parse)
-            else:
-                yield scrapy.Request(url=self.baseURL+section, callback=self.parse_2)
+        year = getattr(self, 'year', None)
+        month = getattr(self, 'month', None)
+        day = getattr(self, 'day', None)
+        self.baseURL = 'http://www.jornada.unam.mx/'+year+'/'+month.zfill(2)+'/'+day.zfill(2)+'/'
+        self.comparison_date_1 = date(2001, 12, 7)
+        self.comparison_date_2 = date(2002, 1, 8)
+        self.comparison_date_3 = date(2003, 4, 25)
+        self.comparison_date_4 = date(2004, 11, 16)
+        self.comparison_date_5 = date(2004, 12, 12)
+        self.comparison_date_6 = date(2005, 1, 31)
+        self.comparison_date_7 = date(2009, 2, 15)
+        self.date = date(int(year), int(month), int(day))
+        # self.section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
+        #                      'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
+        # for section in section_list:
+            # for 2009/02/15 and earlier the page has a specific structure;
+            # for dates after that the structure changes
+            # if ( requested_date <= comparison_date_1 ):
+                # yield scrapy.Request(url=self.baseURL+section, callback=self.parse)
+            # else:
+                # yield scrapy.Request(url=self.baseURL+section, callback=self.parse_2)
+
+        if self.date <= self.comparison_date_2:
+            section_list = ['index.html', 'edito.html', 'opinion.html', 'correo.html',
+                            'politica.html', 'economia.html', 'cultura.html', 'espectaculos.html',
+                            'estados.html', 'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html']
+            parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
+                       'correo.html': 'Correo', 'politica.html': 'Politica', 'economia.html': 'Economia',
+                       'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos',
+                       'estados.html': 'Estados', 'capital.html': 'Capital', 'mundo.html': 'Mundo',
+                       'soc-jus.html': 'Sociedad', 'deportes.html': 'Deportes'}
+            for s in section_list:
+                item = NoticiasItem()
+                item['date'] = self.date
+                item['topic'] = parse_s[s]
+                if s == 'edito.html' or s == 'correo.html':
+                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
+                else:
+                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse)
+                request.meta['item'] = item
+                yield request
+
+        elif self.date > self.comparison_date_2 and self.date <= self.comparison_date_3:
+            section_list = ['index.html', 'edito.html', 'opinion.html', 'correo.html',
+                            'politica.html', 'economia.html', 'cultura.html', 'espectaculos.html',
+                            'estados.html', 'capital.html', 'mundo.html', 'soc-jus.html', 'deportes.html',
+                            'index.php', 'edito.php', 'opinion.php', 'correo.php',
+                            'politica.php', 'economia.php', 'cultura.php', 'espectaculos.php',
+                            'estados.php', 'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php']
+            parse_s = {'index.html': 'Portada', 'edito.html': 'Editorial', 'opinion.html': 'Opinion',
+                       'correo.html': 'Correo', 'politica.html': 'Politica', 'economia.html': 'Economia',
+                       'cultura.html': 'Cultura', 'espectaculos.html': 'Espectaculos',
+                       'estados.html': 'Estados', 'capital.html': 'Capital', 'mundo.html': 'Mundo',
+                       'soc-jus.html': 'Sociedad', 'deportes.html': 'Deportes',
+                       'index.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
+                       'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
+                       'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos',
+                       'estados.php': 'Estados', 'capital.php': 'Capital', 'mundo.php': 'Mundo',
+                       'soc-jus.php': 'Sociedad', 'deportes.php': 'Deportes'}
+            for s in section_list:
+                item = NoticiasItem()
+                item['date'] = self.date
+                item['topic'] = parse_s[s]
+                if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
+                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
+                else:
+                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_2)
+                request.meta['item'] = item
+                yield request
+
+        elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
+            section_list = ['indexfla.php', 'edito.php', 'opinion.php', 'correo.php',
+                            'politica.php', 'economia.php', 'cultura.php', 'espectaculos.php',
+                            'estados.php', 'capital.php', 'mundo.php', 'soc-jus.php', 'deportes.php',
+                            'index.php']
+            parse_s = {'indexfla.php': 'Portada', 'edito.php': 'Editorial', 'opinion.php': 'Opinion',
+                       'correo.php': 'Correo', 'politica.php': 'Politica', 'economia.php': 'Economia',
+                       'cultura.php': 'Cultura', 'espectaculos.php': 'Espectaculos',
+                       'estados.php': 'Estados', 'capital.php': 'Capital', 'mundo.php': 'Mundo',
+                       'soc-jus.php': 'Sociedad', 'deportes.php': 'Deportes', 'index.php': 'Portada'}
+            for s in section_list:
+                item = NoticiasItem()
+                item['date'] = self.date
+                item['topic'] = parse_s[s]
+                if s == 'edito.php' or s == 'correo.php':
+                    if self.date > self.comparison_date_3 and self.date <= self.comparison_date_5:
+                        request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
+                    elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
+                        request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item_2)
+                else:
+                    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_3)
+                request.meta['item'] = item
+                yield request
+
+        elif self.date > self.comparison_date_6:
+            section_list = ['opinion', 'politica', 'economia', 'mundo', 'estados', 'ciencias',
+                            'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
+            for s in section_list:
+                # for dates after 2005/01/31 and up to 2009/02/15 the page has a specific structure;
+                # for dates after that the structure changes
+                if self.date <= self.comparison_date_7:
+                    yield scrapy.Request(url=self.baseURL+s, callback=self.parse_5)
+                elif self.date > self.comparison_date_7:
+                    yield scrapy.Request(url=self.baseURL+s, callback=self.parse_6)

     def parse(self, response):
+        item = response.meta['item']
+        if self.date <= self.comparison_date_1:
+            if item['topic'] == 'Portada':
+                path = '//td[@rowspan="3"]'
+            else:
+                if len(response.xpath('//td[@align="center"]').css('a::attr(href)').extract()) > 0:
+                    path = '//td[@align="center"]'
+                else:
+                    path = '//td[@align="CENTER"]'
+        elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_2:
+            if item['topic'] == 'Portada':
+                path = '//empieza'
+            else:
+                path = '//table[@bordercolor="#CCCCCC"]'
+        for r in response.xpath(path).css('a::attr(href)').extract():
+            if r[-5:] == '.html':
+                request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
+                request.meta['item'] = item
+                yield request
+
+    def parse_2(self, response):
+        item = response.meta['item']
+        for r in response.xpath('//table[@bordercolor="#CCCCCC"]').css('a::attr(href)').extract():
+            if r[-5:] == '.html':
+                request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
+                request.meta['item'] = item
+                yield request
+
+    def parse_3(self, response):
+        item = response.meta['item']
+        link_list = []
+        link_list.extend(response.xpath('//td[@width="100%"]').css('a::attr(href)').extract())
+        link_list.extend(response.xpath('//td[@width="52%"]').css('a::attr(href)').extract())
+        link_list.extend(response.xpath('//td[@width="24%"]').css('a::attr(href)').extract())
+        link_list.extend(response.xpath('//td[@width="646"]').css('a::attr(href)').extract())
+        link_list.extend(response.xpath('//table[@width="100%"]').css('a::attr(href)').extract())
+        for r in link_list:
+            if r[-11:] == '.html&fly=1' or r[-9:] == '.php&fly=' or r[-4:] == '.php':
+                if self.date > self.comparison_date_3 and self.date <= self.comparison_date_6:
+                    if self.date <= self.comparison_date_4:
+                        request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item)
+                        request.meta['item'] = item
+                        yield request
+                    elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_6:
+                        if r[:4] == 'http' and r[-4:] == '.php':
+                            this_url = r.replace('\n', '')
+                            if self.date <= self.comparison_date_5:
+                                request = scrapy.Request(url=this_url, callback=self.parse_item)
+                            elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
+                                request = scrapy.Request(url=this_url, callback=self.parse_item_2)
+                            request.meta['item'] = item
+                            yield request
+                    # elif self.date > self.comparison_date_5 and self.date <= self.comparison_date_6:
+                    #     request = scrapy.Request(url=self.baseURL+r, callback=self.parse_item_2)
+                    #     request.meta['item'] = item
+                    #     yield request
+
+    def parse_4(self, response):
+        print response.url
+        for r in response.xpath('//td[@width="646"]').css('a::attr(href)').extract():
+            if r[-4:] == '.php':
+                print r.replace('\n', '')
+                # request = scrapy.Request(url=r.replace('\n',''), callback=self.parse_item)
+                # request.meta['item'] = item
+                # yield request
+
+    def parse_5(self, response):
         if (response.url[:response.url.rfind('/')+1] == self.baseURL):  # checks that the same base URL is preserved
             section = response.url[response.url.rfind('/')+1:]
             if (section == 'opinion'):  # the 'opinion' section is structured differently from the others
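Throughout the new start_requests, the topic is resolved from the section name up front and then carried along with the request rather than recomputed in the callback. This is Scrapy's standard item-passing idiom via request.meta, reduced here to its essentials (names as in the diff):

    # Pre-fill the item, attach it to the request, recover it in the callback.
    item = NoticiasItem()
    item['date'] = self.date
    item['topic'] = parse_s[s]          # e.g. 'politica.html' -> 'Politica'
    request = scrapy.Request(url=self.baseURL+s, callback=self.parse_item)
    request.meta['item'] = item         # travels with the request
    yield request

    # ...and at the top of the callback:
    # item = response.meta['item']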
...
@@ -56,49 +242,183 @@ class QuotesSpider(scrapy.Spider):
             for path in path_list:
                 for link in response.xpath(path).extract():
-                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item)
+                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_3)

-    def parse_2(self, response):
+    def parse_6(self, response):
         if (response.url[:response.url.rfind('/')+1] == self.baseURL):
             path_list = ['//*[@class="itemfirst"]/div/a/@href', '//*[@class="item start"]/div/a/@href',
                          '//*[@class="item"]/div/a/@href']
             for path in path_list:
                 for link in response.xpath(path).extract():
-                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_2)
+                    yield scrapy.Request(url=self.baseURL+link, callback=self.parse_item_4)

     def parse_item(self, response):
+        item = response.meta['item']
+        flag = True
+        text = ''
+        try:
+            title = response.xpath('//font[@size="5"]').extract_first()
+            item['title'] = remove_tags(title)
+        except:
+            try:
+                title = response.xpath('//p/font[@size="5"]').extract_first()
+                item['title'] = remove_tags(title)
+            except:
+                try:
+                    title = response.xpath('//p/font[@size="5"]').extract()[1]
+                    item['title'] = remove_tags(title)
+                except:
+                    try:
+                        title = response.xpath('//font[@size="4"]').extract_first()
+                        item['title'] = remove_tags(title)
+                    except:
+                        try:
+                            title = response.xpath('//p/font[@size="4"]').extract_first()
+                            item['title'] = remove_tags(title)
+                        except:
+                            try:
+                                title = response.xpath('//p/font[@size="4"][1]').extract()[1]
+                                item['title'] = remove_tags(title)
+                            except:
+                                try:
+                                    title = response.xpath('//font[@size="3"]').extract_first()
+                                    item['title'] = remove_tags(title)
+                                except:
+                                    try:
+                                        title = response.xpath('//p/font[@size="3"]').extract_first()
+                                        item['title'] = remove_tags(title)
+                                    except:
+                                        try:
+                                            title = response.xpath('//p/font[@size="3"][1]').extract()[1]
+                                            item['title'] = remove_tags(title)
+                                        except:
+                                            try:
+                                                title = response.xpath('//font[@size="+1"]').extract_first()
+                                                item['title'] = remove_tags(title)
+                                            except:
+                                                try:
+                                                    title = response.xpath('//font[@size="+0"]').extract_first()
+                                                    item['title'] = remove_tags(title)
+                                                except:
+                                                    if self.date <= date(1999, 10, 3):  # on this date the page html changes relative to the others
+                                                        try:
+                                                            title = remove_tags(response.xpath('//center').extract_first())
+                                                            item['title'] = title
+                                                            flag = False
+                                                        except:
+                                                            pass
+                                                    else:
+                                                        pass
+        if flag:
+            if self.date <= self.comparison_date_1:
+                for p in response.css('p').extract():
+                    text += remove_tags(p).replace('\r', '')
+                ## does not take into account the first indices, where the title is
+                text = text.replace('\t', '')
+            elif self.date > self.comparison_date_1 and self.date <= self.comparison_date_3:
+                for p in response.xpath('//table[@bordercolor="#CCCCCC"]').css('p').extract():
+                    text += remove_tags(p).replace('\r', '')
+                text = text.replace('\t', '')
+            elif self.date > self.comparison_date_3 and self.date <= self.comparison_date_4:
+                p = response.css('p').extract()
+                for i in range(0, len(p)):
+                    text += remove_tags(p[i]).replace('\r', '')
+                text = text.replace('\t', '')
+            elif self.date > self.comparison_date_4 and self.date <= self.comparison_date_5:
+                p = response.css('p').extract()
+                for i in range(3, len(p)):
+                    text += remove_tags(p[i]).replace('\r', '')
+                text = text.replace('\t', '')
+                if text == '':
+                    for i in range(0, len(p)):
+                        text += remove_tags(p[i]).replace('\r', '')
+                    text = text.replace('\t', '')
+        else:
+            text = remove_tags(response.body)
+            text = text[len(title):]
+        item['text'] = text
+        item['url'] = response.url
+        yield item
+
+    def parse_item_2(self, response):
+        item = response.meta['item']
+        text = ''
+        title_list = []
+        title_list.extend(response.xpath('//*[@id="contenido"]/h1/text()').extract())
+        title_list.extend(response.xpath('//h1/text()').extract())
+        for t in title_list:
+            if t is not None or t != '':
+                title = remove_tags(t).replace('\r', '')
+                title = title.replace('\t', '')
+                item['title'] = title
+        p = response.css('p').extract()
+        for i in range(4, len(p)):
+            text += remove_tags(p[i]).replace('\r', '')
+        text = text.replace('\t', '')
+        if text == '':
+            for i in range(0, len(p)):
+                text += remove_tags(p[i]).replace('\r', '')
+            text = text.replace('\t', '')
+        item['text'] = text
+        item['url'] = response.url
+        yield item
+
+    def parse_item_3(self, response):
         item = NoticiasItem()
         text = ''
-        item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+        item['date'] = self.date
         title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
         if (len(title) > 0):
             item['title'] = title[0]
         else:
             item['title'] = response.xpath('//*[@class="documentContent"]/h1/text()').extract_first()
-        item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract()
-        for paragraph in response.xpath('//*[@class="documentContent"]/p/text()').extract():
-            text += paragraph
+        item['topic'] = response.xpath('//*[@id="portal-breadcrumbs"]/a[2]/text()').extract_first()
+        for p in response.xpath('//*[@class="documentContent"]/p').extract():
+            text += remove_tags(p).replace('\r', '')
+        text = text.replace('\t', '')
         item['text'] = text
         item['url'] = response.url
         # print item['title']
         yield item

-    def parse_item_2(self, response):
+    def parse_item_4(self, response):
         item = NoticiasItem()
         text = ''
-        path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
+        # path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
+        path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
-        item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+        item['date'] = self.date
         item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
-        item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract()
+        item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
         for path in path_list:
-            for paragraph in response.xpath(path).extract():
-                text += remove_tags(paragraph)
+            for p in response.xpath(path).extract():
+                text += remove_tags(p).replace('\r', '')
+        text = text.replace('\t', '')
         item['text'] = text
         item['url'] = response.url
         # print item['title']
...
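For reference, year, month and day are ordinary Scrapy spider arguments (start_requests reads them with getattr), so a single day's archive is crawled with scrapy crawl noticias -a year=2017 -a month=8 -a day=25 from the laJornada project directory. A programmatic sketch of the same run follows; the feed settings here are illustrative assumptions, not taken from the project's settings.py:

    from scrapy.crawler import CrawlerProcess
    from laJornada.spiders.noticias import QuotesSpider

    # Pass the arguments as strings: start_requests calls .zfill() on them.
    # FEED_FORMAT/FEED_URI are assumed output settings, not from this repo.
    process = CrawlerProcess(settings={'FEED_FORMAT': 'json', 'FEED_URI': 'noticias.json'})
    process.crawl(QuotesSpider, year='2017', month='8', day='25')
    process.start()  # blocks until the crawl finishes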
descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc  (view file @ 11c4aa01)
No preview for this file type.

descarga_por_dia/laJornada/noticias.json  (deleted, 100644 → 0; view file @ f1dfa7e9)
This source diff could not be displayed because it is too large. You can view the blob instead.