m3 / crawlersNoticias / Commits

Commit 2a9d7bdc, authored May 02, 2018 by Renán Sosa Guillen

    crawlers

Parent: d60a1a0a

Showing 2 changed files with 133 additions and 82 deletions:

    descarga_hacia_atras/foraneos/elSalvador/elSalvador/spiders/noticias.py    +133  -82
    descarga_hacia_atras/foraneos/elSalvador/elSalvador/spiders/noticias.pyc     +0   -0

descarga_hacia_atras/foraneos/elSalvador/elSalvador/spiders/noticias.py @ 2a9d7bdc
 # -*- coding: utf-8 -*-
-import scrapy, re, json
+import scrapy, re, json, ast
+from scrapy.selector import Selector
 from datetime import datetime, date
 from elSalvador.items import NoticiasItem
@@ -48,26 +49,26 @@ class QuotesSpider(scrapy.Spider):
         baseURL = "http://www.elsalvador.com/category/noticias/"
         # sectionList = []
-        sectionList = ["nacional"]
+        sectionList = ["internacional"]

-        # if self.stopDate is None:
-        #     for s in sectionList:
-        #         info = ImportantData()
-        #         info['page'] = 1
-        #         request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
-        #         request.meta['item'] = info
-        #         yield request
-        #
-        # else:
-        #     for s in sectionList:
-        #         info = ImportantData()
-        #         info['page'] = 1
-        #         info['CONTINUE_SEARCHING'] = False
-        #         request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
-        #         request.meta['item'] = info
-        #         yield request
-
-        for s in sectionList:
-            yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
+        if self.stopDate is None:
+            for s in sectionList:
+                info = ImportantData()
+                info['page'] = 1
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
+                request.meta['item'] = info
+                yield request
+
+        else:
+            for s in sectionList:
+                info = ImportantData()
+                info['page'] = 0
+                info['CONTINUE_SEARCHING'] = False
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
+                request.meta['item'] = info
+                yield request
+
+        # for s in sectionList:
+        #     yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)

     def parse(self, response):
@@ -109,82 +110,132 @@ class QuotesSpider(scrapy.Spider):
         # yield request

         linkList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
         linkList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
-        # for link in linkList:
-        #     print link
+        for link in linkList:
+            url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+            url_peticion = "/category/noticias/nacional/"
+            frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "nacional", 'paged': "1", 'category_name': "Nacional", 'url_peticion': url_peticion}
+            yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
+
+    def after_post(self, response):
+        # from scrapy.shell import inspect_response
+        import ast
+        from scrapy.selector import Selector
+        print "This is response: "
+        unescaped = ast.literal_eval(response.body.strip())
+        body = Selector(text=unescaped)
+        # inspect_response(response, self)
+        newsList = []
+        for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
+            link = link.replace('\\', '')
+            if not link in newsList:
+                newsList.append(link)
+        for link in newsList:
+            print link
+
+        # url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+        # url_peticion = "/category/noticias/internacional/"
+        # frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': "4526", 'category_name': "Internacional", 'url_peticion': url_peticion}
+        #
+        # yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
+
+        # def after_post(self, response):
+        #     searchData = response.meta['item']
+        #     # from scrapy.shell import inspect_response
+        #     # print "This is response: "
+        #     unescaped = ast.literal_eval(response.body.strip())
+        #     body = Selector(text=unescaped)
+        #     # inspect_response(response, self)
+        #     newsList = []
+        #     linksObtained = body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract()
+        #     for link in linksObtained:
+        #         link = link.replace('\\', '')
+        #         if not link in newsList:
+        #             newsList.append(link)
+        #
+        #     # print len(newsList) checar length de newList para determinar el paro  [check the length of newsList to decide when to stop]
+        #     if len(newsList) > 0:
+        #         for link in newsList:
+        #             info = ImportantData()
+        #             info['url'] = searchData['url']
+        #             info['page'] = searchData['page']
+        #             info['section_url'] = searchData['section_url']
+        #             if link == linkList[-1]: info['LAST_LINK'] = True
+        #             else: info['LAST_LINK'] = False
+        #             reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+        #             reqst.meta['item'] = info
+        #             yield reqst
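A note on the new after_post above: the LoadMore endpoint's POST response is treated as a quoted, escaped string literal, which the commit unescapes with ast.literal_eval before handing it to a Selector. Below is a minimal sketch of just that step, under the assumption that the payload looks like the invented raw_body sample; json.loads is shown only as an assumed equivalent for a JSON-encoded string.

    # -*- coding: utf-8 -*-
    # Standalone sketch of the unescape-then-select step used in after_post.
    import ast, json
    from scrapy.selector import Selector

    # Invented sample payload: a quoted string containing escaped HTML,
    # shaped like what the wp-json LoadMore endpoint is assumed to return.
    raw_body = '"<div class=\\"row news\\"><div class=\\"subsection\\"><h2><a href=\\"http://example.com/nota\\">t</a></h2></div></div>"'

    unescaped = ast.literal_eval(raw_body)   # what this commit does
    # unescaped = json.loads(raw_body)       # assumed equivalent when the body is a JSON-encoded string

    body = Selector(text=unescaped)
    print(body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract())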
     def parse_with_stop_date(self, response):
         searchData = response.meta['item']
         CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']

         if not CONTINUE_SEARCHING:
-            if searchData['page'] == 1:
+            if searchData['page'] == 0:
                 searchData['section_url'] = response.url
-                linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
-                linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
-                linkList.remove(searchData['section_url'])
+                newsList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
+                # newsList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())

             else:
-                linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
-                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
-                try:
-                    linkList.remove(searchData['section_url'])
-                except KeyError:
-                    pass
-
-            newsList = []
-            for link in linkList:
-                if not link in newsList:
-                    newsList.append(link)
-
-            for link in newsList:
-                info = ImportantData()
-                info['url'] = response.url
-                info['page'] = searchData['page']
-                info['section_url'] = searchData['section_url']
-                if link == linkList[-1]: info['LAST_LINK'] = True
-                else: info['LAST_LINK'] = False
-                reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
-                reqst.meta['item'] = info
-                yield reqst
+                unescaped = ast.literal_eval(response.body.strip())
+                body = Selector(text=unescaped)
+                newsList = []
+                for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
+                    link = link.replace('\\', '')
+                    if not link in newsList:
+                        newsList.append(link)
+
+            if len(newsList) > 0:
+                for link in newsList:
+                    info = ImportantData()
+                    # info['url'] = response.url
+                    info['page'] = searchData['page']
+                    info['section_url'] = searchData['section_url']
+                    if link == newsList[-1]: info['LAST_LINK'] = True
+                    else: info['LAST_LINK'] = False
+                    reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+                    reqst.meta['item'] = info
+                    yield reqst
+
+            # if searchData['page'] == 1:
+            #     searchData['section_url'] = response.url
+            #     linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
+            #     linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+            #     linkList.remove(searchData['section_url'])
+            #
+            # else:
+            #     linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
+            #     linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+            #     try:
+            #         linkList.remove(searchData['section_url'])
+            #     except KeyError:
+            #         pass
+            #
+            # newsList = []
+            # for link in linkList:
+            #     if not link in newsList:
+            #         newsList.append(link)
+            #
+            # for link in newsList:
+            #     info = ImportantData()
+            #     info['url'] = response.url
+            #     info['page'] = searchData['page']
+            #     info['section_url'] = searchData['section_url']
+            #     if link == linkList[-1]: info['LAST_LINK'] = True
+            #     else: info['LAST_LINK'] = False
+            #     reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+            #     reqst.meta['item'] = info
+            #     yield reqst
         else:
             searchData['CONTINUE_SEARCHING'] = False

         searchData['page'] += 1
-        page = searchData['page']
-        url = searchData['section_url']
-        request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
+        page = str(searchData['page'])
+        url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+        url_peticion = "/category/noticias/internacional/"
+        frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': page, 'category_name': "Internacional", 'url_peticion': url_peticion}
+        request = scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.parse_with_stop_date)
         request.meta['item'] = searchData
         yield request
+
+        # searchData['CONTINUE_SEARCHING'] = False
+        # searchData['page'] += 1
+        # page = searchData['page']
+        # url = searchData['section_url']
+        # request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
+        # request.meta['item'] = searchData
+        # yield request
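A note on the pagination above: every follow-up request re-posts the same form to the LoadMore endpoint, varying only the page counter (sent as a string in 'paged') and the section fields. A small hypothetical helper like the one below could build that payload; the field names are copied from the frmdata dicts in this commit, and the url_peticion pattern is assumed from the two values that appear here.

    def build_loadmore_formdata(slug, category_name, page):
        # Hypothetical helper (not part of the commit): builds the POST payload
        # used with scrapy.http.FormRequest against the wp-json LoadMore endpoint.
        return {
            'pppDestacado': "5",
            'pppNoDestacado': "4",
            'slug': slug,
            'paged': str(page),
            'category_name': category_name,
            'url_peticion': "/category/noticias/" + slug + "/",
        }

    # e.g. build_loadmore_formdata("internacional", "Internacional", 2)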
     def parse_item(self, response):
         item = NoticiasItem()
@@ -217,8 +268,8 @@ class QuotesSpider(scrapy.Spider):
     def parse_item_with_stop_date(self, response):
-        d = response.xpath('//time/text()').extract_first()
-        dt = datetime.strptime(d, '%d.%m.%Y').date()
+        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        dt = datetime.strptime(d[:10], '%Y-%m-%d').date()

         if dt >= self.stopDate:
             info = response.meta['item']
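A note on the date check above: article:published_time carries an ISO-8601 timestamp, so the new code keeps only the first ten characters and parses them as a date before comparing against stopDate. A quick sketch with an assumed sample value:

    from datetime import datetime, date

    d = "2018-05-02T10:30:00-06:00"                     # assumed shape of article:published_time
    dt = datetime.strptime(d[:10], '%Y-%m-%d').date()   # keep just YYYY-MM-DD
    print(dt >= date(2018, 5, 1))                        # True: the article is newer than this stop date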
@@ -251,6 +302,6 @@ class QuotesSpider(scrapy.Spider):
         if info['LAST_LINK']:
             info['CONTINUE_SEARCHING'] = True
-            request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
+            request = scrapy.Request(url=info['section_url'], callback=self.parse_with_stop_date, dont_filter=True)
             request.meta['item'] = info
             yield request
descarga_hacia_atras/foraneos/elSalvador/elSalvador/spiders/noticias.pyc @ 2a9d7bdc

No preview for this file type.