m3 / crawlersNoticias — Commits

Commit 57f362cd
Authored Apr 15, 2020 by Mario Chirinos Colunga
Commit message: el sur
Parent commit: eac90030

Showing 5 changed files with 203 additions and 100 deletions (+203 / -100):
items.py (descarga_por_dia/elSur/elSur/items.py): +9 / -9
settings.py (descarga_por_dia/elSur/elSur/settings.py): +1 / -1
noticias.bk (descarga_por_dia/elSur/elSur/spiders/noticias.bk, new file): +90 / -0
noticias.py (descarga_por_dia/elSur/elSur/spiders/noticias.py): +58 / -90
out_test.json (descarga_por_dia/elSur/out_test.json, new file): +45 / -0
descarga_por_dia/elSur/elSur/items.py

@@ -9,12 +9,12 @@ import scrapy

class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

(Both diff sides of this hunk show identical code, so the hunk is listed once.)
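A Scrapy Item behaves like a dict with a fixed key set, which is what lets the spiders below assign item['title'], item['text'], and so on. A minimal sketch of that behavior (the values here are invented for illustration):

    from elSur.items import NoticiasItem

    item = NoticiasItem()
    item['title'] = 'hypothetical headline'   # any field declared in items.py
    item['url'] = 'https://suracapulco.mx/'
    print(dict(item))                         # items convert cleanly to plain dicts
    # item['summary'] = '...'                 # would raise KeyError: 'summary' is not a declared Field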
descarga_por_dia/elSur/elSur/settings.py

@@ -8,7 +8,7 @@
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

FEED_EXPORT_ENCODING = 'utf-8'

BOT_NAME = 'elSur'

SPIDER_MODULES = ['elSur.spiders']
...
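The FEED_EXPORT_ENCODING setting in this hunk matters for a Spanish-language source: by default, Scrapy's JSON feed exporter escapes non-ASCII characters as \uXXXX sequences, and setting it to 'utf-8' keeps accented text readable. The standard-library json module shows the same contrast (sample string invented):

    import json

    s = "periódico El Sur"
    print(json.dumps({"title": s}))
    # default (ensure_ascii=True): {"title": "peri\u00f3dico El Sur"}
    print(json.dumps({"title": s}, ensure_ascii=False))
    # the behavior FEED_EXPORT_ENCODING = 'utf-8' gives in feeds: {"title": "periódico El Sur"}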
descarga_por_dia/elSur/elSur/spiders/noticias.bk (new file, mode 100644)

"""
MEDIA:
El Sur, Guerrero

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem

TAG_RE = re.compile(r'<[^>]+>')  # uncommented: remove_tags() below requires it
#HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')

def remove_tags(text):
    return TAG_RE.sub('', text)

class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        print(response)
        ## for link in response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract():
        for link in set(response.css('#post-list').css("li > div > div > a::attr(href)").extract()):  # added the missing closing ')'
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)

        # pag_lst = response.css('div.dslc-pagination > ul > li')
        # if len(pag_lst) > 0:
        #     del pag_lst[0]
        #     del pag_lst[0]
        #     next_page = None
        #     for li_obj in pag_lst:
        #         li = remove_tags(li_obj.extract())
        #         if not li.isdigit():
        #             next_page = li_obj.xpath('./a/@href').extract_first()
        #             break
        #
        #     if next_page is not None: yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        print(response)
        item = NoticiasItem()
        # text = ''
        # news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        # title = response.css('div.dslc-tp-title > h1').extract_first()
        # if title is not None: title = remove_tags(title)
        # topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first()
        # if topic is not None: topic = remove_tags(topic)
        # for p in response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract():
        #     p = remove_tags(p)
        #     text += p + "\n"
        # dateline = response.css('span.dateline').extract_first()
        # if dateline is not None:
        #     dateline = remove_tags(dateline)
        #     text = text.replace(dateline, '')
        # text = text.replace(u'\u00a0', ' ')
        # text = HEAD_RE_1.sub('', text)
        # text = HEAD_RE_2.sub('', text)
        # ## News item info ##
        # item['date'] = news_date
        # item['title'] = title
        # item['topic'] = topic
        # item['text'] = text.strip()
        # item['url'] = response.url
        yield item
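The remove_tags() helper strips markup with a regex rather than an HTML parser, which is adequate for the small inline fragments these spiders feed it. A quick illustration of what TAG_RE matches (sample HTML invented):

    import re

    TAG_RE = re.compile(r'<[^>]+>')   # any run of characters between '<' and '>'

    def remove_tags(text):
        return TAG_RE.sub('', text)

    print(remove_tags('<h1>El Sur <b>de Acapulco</b></h1>'))  # -> El Sur de Acapulco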
descarga_por_dia/elSur/elSur/spiders/noticias.py
View file @
57f362cd
# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import
scrapy
,
re
from
elSur.items
import
NoticiasItem
TAG_RE
=
re
.
compile
(
r'<[^>]+>'
)
def
remove_tags
(
text
):
return
TAG_RE
.
sub
(
''
,
text
)
HEAD_RE_1
=
re
.
compile
(
r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?'
)
HEAD_RE_2
=
re
.
compile
(
r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?'
)
#from dateutil.parser import parse
from
dateparser
import
parse
from
datetime
import
datetime
class
QuotesSpider
(
scrapy
.
Spider
):
"""
Basic Scrapy Spider class
"""
name
=
"noticias"
def
start_requests
(
self
):
year
=
getattr
(
self
,
"year"
,
None
)
month
=
getattr
(
self
,
"month"
,
None
)
day
=
getattr
(
self
,
"day"
,
None
)
baseURL
=
"https://suracapulco.mx/{0}/{1}/{2}/"
.
format
(
year
,
month
.
zfill
(
2
),
day
.
zfill
(
2
))
yield
scrapy
.
Request
(
url
=
baseURL
,
callback
=
self
.
parse
)
def
parse
(
self
,
response
):
for
link
in
response
.
css
(
'div.dslc-blog-posts'
)
.
css
(
'div.dslc-blog-post-title > h2 > a::attr(href)'
)
.
extract
():
yield
scrapy
.
Request
(
url
=
link
,
callback
=
self
.
parse_item
)
pag_lst
=
response
.
css
(
'div.dslc-pagination > ul > li'
)
if
len
(
pag_lst
)
>
0
:
del
pag_lst
[
0
]
del
pag_lst
[
0
]
next_page
=
None
for
li_obj
in
pag_lst
:
li
=
remove_tags
(
li_obj
.
extract
())
if
not
li
.
isdigit
():
next_page
=
li_obj
.
xpath
(
'./a/@href'
)
.
extract_first
()
break
if
next_page
is
not
None
:
yield
scrapy
.
Request
(
url
=
next_page
,
callback
=
self
.
parse
)
def
parse_item
(
self
,
response
):
item
=
NoticiasItem
()
text
=
''
news_date
=
response
.
xpath
(
'//meta[@property="article:published_time"]/@content'
)
.
extract_first
()
title
=
response
.
css
(
'div.dslc-tp-title > h1'
)
.
extract_first
()
if
title
is
not
None
:
title
=
remove_tags
(
title
)
topic
=
response
.
css
(
'div.dslc-tp-meta'
)
.
xpath
(
'./ul/li[3]/a[1]'
)
.
extract_first
()
if
topic
is
not
None
:
topic
=
remove_tags
(
topic
)
for
p
in
response
.
xpath
(
'//div[@id="dslc-theme-content-inner"]'
)
.
css
(
'p'
)
.
extract
():
p
=
remove_tags
(
p
)
text
+=
p
+
"
\n
"
dateline
=
response
.
css
(
'span.dateline'
)
.
extract_first
()
if
dateline
is
not
None
:
dateline
=
remove_tags
(
dateline
)
text
=
text
.
replace
(
dateline
,
''
)
text
=
text
.
replace
(
u'
\u00a0
'
,
' '
)
text
=
HEAD_RE_1
.
sub
(
''
,
text
)
text
=
HEAD_RE_2
.
sub
(
''
,
text
)
## News item info ##
item
[
'date'
]
=
news_date
item
[
'title'
]
=
title
item
[
'topic'
]
=
topic
item
[
'text'
]
=
text
.
strip
()
item
[
'url'
]
=
response
.
url
yield
item
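HEAD_RE_1 and HEAD_RE_2 in the removed parse_item() strip the "Texto: ... / Foto: ..." byline header, optionally together with a following dateline ending in a Spanish long-form date. A quick check of the pattern against an invented header:

    import re

    HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')

    s = ("Texto: Juan Perez / Foto: Maria Lopez\n"
         "Acapulco, 5 de septiembre de 2018.\n"
         "Cuerpo de la nota...")
    print(HEAD_RE_1.sub('', s))  # byline and dateline are removed; the body remains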
"""
Basic Scrapy Spider class
"""
name
=
"noticias"
def
start_requests
(
self
):
year
=
getattr
(
self
,
"year"
,
None
)
month
=
getattr
(
self
,
"month"
,
None
)
day
=
getattr
(
self
,
"day"
,
None
)
self
.
baseURL
=
"https://suracapulco.mx/{0}/{1}/{2}/"
.
format
(
year
,
month
.
zfill
(
2
),
day
.
zfill
(
2
))
print
(
self
.
baseURL
)
yield
scrapy
.
Request
(
url
=
self
.
baseURL
,
callback
=
self
.
parse
)
def
parse
(
self
,
response
):
print
(
response
)
for
link
in
response
.
css
(
'#post-list'
)
.
css
(
"li > div > div > a::attr(href)"
)
.
extract
():
print
(
link
)
yield
scrapy
.
Request
(
url
=
link
,
callback
=
self
.
parse_item
)
if
len
(
response
.
css
(
".paging-navigation > a::attr(href)"
)
.
extract
())
>
0
:
next_page
=
response
.
css
(
".paging-navigation > a::attr(href)"
)
.
extract
()[
0
]
print
(
next_page
)
if
next_page
is
not
None
:
yield
scrapy
.
Request
(
url
=
next_page
,
callback
=
self
.
parse
)
def
parse_item
(
self
,
response
):
print
(
response
.
encoding
)
date
=
response
.
xpath
(
'//meta[@property="article:published_time"]/@content'
)
.
extract_first
()
title
=
response
.
xpath
(
'//meta[@property="og:title"]/@content'
)
.
extract_first
()
.
lower
()
topic
=
response
.
xpath
(
'//meta[@property="article:section"]/@content'
)
.
extract_first
()
.
lower
()
text
=
""
for
p
in
response
.
css
(
"div.xt-post-content > p::text"
)
.
extract
():
text
+=
p
.
replace
(
"
\n
"
,
""
)
+
"
\n
"
item
=
NoticiasItem
()
item
[
'date'
]
=
datetime
.
fromtimestamp
(
int
(
date
))
.
isoformat
()
item
[
'title'
]
=
title
item
[
'topic'
]
=
topic
item
[
'text'
]
=
text
item
[
'url'
]
=
response
.
url
if
item
[
"text"
]
!=
""
:
print
(
item
)
yield
item
else
:
yield
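One thing to note in the added parse_item(): item['date'] = datetime.fromtimestamp(int(date)).isoformat() assumes the article:published_time meta tag carries a Unix timestamp; if the site serves an ISO-8601 string instead, int(date) raises ValueError. The dateparser.parse imported at the top of the file would cover that case, so a more defensive conversion might look like this sketch (to_iso is a hypothetical helper, raw values invented):

    from datetime import datetime
    from dateparser import parse

    def to_iso(raw):
        """Normalize a published-time meta value to ISO-8601."""
        if raw.isdigit():                  # e.g. "1586926800" (Unix timestamp)
            return datetime.fromtimestamp(int(raw)).isoformat()
        return parse(raw).isoformat()      # e.g. "2020-04-15T00:00:00-05:00"

    print(to_iso("1586926800"))

(The bare yield in the else branch emits None, which Scrapy accepts and silently ignores.)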
descarga_por_dia/elSur/out_test.json (new file, mode 100644)

This source diff could not be displayed because it is too large. You can view the blob instead.