m3 / crawlersNoticias · Commits

Commit ca9d5998, authored 2 years ago by Mario Chirinos
Parent: f596888b

    new heraldo leon

Showing 6 changed files with 150 additions and 187 deletions (+150 -187)
  items.py        spiders/daily/heraldoLeon/heraldoLeon/items.py             +11  -13
  middlewares.py  spiders/daily/heraldoLeon/heraldoLeon/middlewares.py       +58  -11
  pipelines.py    spiders/daily/heraldoLeon/heraldoLeon/pipelines.py          +4  -66
  settings.py     spiders/daily/heraldoLeon/heraldoLeon/settings.py          +22  -20
  noticias.py     spiders/daily/heraldoLeon/heraldoLeon/spiders/noticias.py  +54  -76
  scrapy.cfg      spiders/daily/heraldoLeon/scrapy.cfg                        +1   -1
spiders/daily/heraldoLeon/heraldoLeon/items.py  (view file @ ca9d5998)

 # -*- coding: utf-8 -*-

 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://docs.scrapy.org/en/latest/topics/items.html

 import scrapy

-class NoticiasItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
+class HeraldoleonItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    date = scrapy.Field()
+    title = scrapy.Field()
+    text = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
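Since scrapy.Item subclasses only accept declared fields, a quick interactive check of the new item might look like the sketch below (all field values are invented for illustration):

    from heraldoLeon.items import HeraldoleonItem

    # Invented values, for illustration only.
    item = HeraldoleonItem(
        date="2017-04-23T12:00:00-05:00",
        title="Example headline",
        text="Example body",
        location="León",
        author="Redacción",
        topic="Local",
        url="http://heraldoleon.mx/example/",
    )
    print(dict(item))            # Item supports the dict API
    # item["summary"] = "..."    # would raise KeyError: field not declared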
spiders/daily/heraldoLeon/heraldoLeon/middlewares.py  (view file @ ca9d5998)

 # -*- coding: utf-8 -*-

 # Define here the models for your spider middleware
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

 from scrapy import signals

+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter

-class HeraldoleonSpiderMiddleware(object):
+class HeraldoleonSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
 ...
@@ -20,30 +21,29 @@ class HeraldoleonSpiderMiddleware(object):
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s

-    def process_spider_input(response, spider):
+    def process_spider_input(self, response, spider):
         # Called for each response that goes through the spider
         # middleware and into the spider.
         # Should return None or raise an exception.
         return None

-    def process_spider_output(response, result, spider):
+    def process_spider_output(self, response, result, spider):
         # Called with the results returned from the Spider, after
         # it has processed the response.
-        # Must return an iterable of Request, dict or Item objects.
+        # Must return an iterable of Request, or item objects.
         for i in result:
             yield i

-    def process_spider_exception(response, exception, spider):
+    def process_spider_exception(self, response, exception, spider):
         # Called when a spider or process_spider_input() method
         # (from other spider middleware) raises an exception.
-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
+        # Should return either None or an iterable of Request or item objects.
         pass

-    def process_start_requests(start_requests, spider):
+    def process_start_requests(self, start_requests, spider):
         # Called with the start requests of the spider, and works
         # similarly to the process_spider_output() method, except
         # that it doesn't have a response associated.
 ...
@@ -54,3 +54,50 @@ class HeraldoleonSpiderMiddleware(object):
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)

+class HeraldoleonDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
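Neither middleware class is active until it is registered in settings.py; this commit leaves both commented out there. A minimal sketch of that wiring, reusing the priority 543 from the commented template entries:

    # settings.py -- hypothetical activation; not enabled by this commit
    SPIDER_MIDDLEWARES = {
        'heraldoLeon.middlewares.HeraldoleonSpiderMiddleware': 543,
    }
    DOWNLOADER_MIDDLEWARES = {
        'heraldoLeon.middlewares.HeraldoleonDownloaderMiddleware': 543,
    }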
spiders/daily/heraldoLeon/heraldoLeon/pipelines.py  (view file @ ca9d5998)

 # -*- coding: utf-8 -*-

 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

-import json
-from collections import OrderedDict
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter

-class JsonWriterPipeline(object):
-    def __init__(self, filename):
-        self.filename = filename
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # Here you get whatever value was passed through the "filename" command line parameter
-        settings = crawler.settings
-        filename = settings.get('filename')
-        # Instantiate the pipeline with the file name
-        return cls(filename)
-
-    def open_spider(self, spider):
-        self.counter = 0
-        self.file = open(self.filename, 'w')
-        self.file.write("[")
-
-    def close_spider(self, spider):
-        self.file.write("]")
-        self.file.close()
-
-    def process_item(self, item, spider):
-        # print("this is my item", item)
-        row = []
-        try:
-            row.append(("date", item['date']))
-        except:
-            pass
-        try:
-            row.append(("topic", item['topic']))
-        except:
-            pass
-        try:
-            row.append(("title", item['title']))
-        except:
-            pass
-        try:
-            row.append(("author", item['author']))
-        except:
-            pass
-        try:
-            row.append(("location", item['location']))
-        except:
-            pass
-        try:
-            row.append(("text", item['text']))
-        except:
-            pass
-        try:
-            row.append(("url", item['url']))
-        except:
-            pass
-        line = OrderedDict(row)
-        self.counter += 1
-        if self.counter == 1:
-            self.file.write(json.dumps(line))
-        elif self.counter > 1:
-            self.file.write(",\n" + json.dumps(line))
-        return item
+class HeraldoleonPipeline:
+    def process_item(self, item, spider):
+        return item
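The deleted JsonWriterPipeline assembled a JSON array by hand; the new spider's usage line delegates that to Scrapy's built-in feed exports instead. Equivalent output can come from the -O flag (as in the new docstring) or, as a sketch, a persistent FEEDS entry; the output path below is hypothetical, and the FEED_EXPORT_ENCODING already set in settings.py keeps the export UTF-8:

    # Option 1: one-off export from the command line (as in the new docstring)
    #   scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23

    # Option 2 (hypothetical): a FEEDS entry in settings.py
    FEEDS = {
        "output/%(name)s-%(time)s.json": {"format": "json"},
    }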
spiders/daily/heraldoLeon/heraldoLeon/settings.py  (view file @ ca9d5998)

 # -*- coding: utf-8 -*-

 # Scrapy settings for heraldoLeon project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
-# http://doc.scrapy.org/en/latest/topics/settings.html
-# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

 BOT_NAME = 'heraldoLeon'

 SPIDER_MODULES = ['heraldoLeon.spiders']
 NEWSPIDER_MODULE = 'heraldoLeon.spiders'

 FEED_EXPORT_ENCODING = "utf-8"

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'heraldoLeon (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-#ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32

 # Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 0.5
+#DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-COOKIES_ENABLED = False
+#COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
 ...
@@ -45,31 +43,31 @@
 #}

 # Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
 #    'heraldoLeon.middlewares.HeraldoleonSpiderMiddleware': 543,
 #}

 # Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #DOWNLOADER_MIDDLEWARES = {
-#    'heraldoLeon.middlewares.MyCustomDownloaderMiddleware': 543,
+#    'heraldoLeon.middlewares.HeraldoleonDownloaderMiddleware': 543,
 #}

 # Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
 #EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
 #}

 # Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    'heraldoLeon.pipelines.JsonWriterPipeline': 300,
-}
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'heraldoLeon.pipelines.HeraldoleonPipeline': 300,
+#}

 # Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 #AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 #AUTOTHROTTLE_START_DELAY = 5
 ...
@@ -82,9 +80,13 @@
 #AUTOTHROTTLE_DEBUG = False

 # Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 #HTTPCACHE_ENABLED = True
 #HTTPCACHE_EXPIRATION_SECS = 0
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
+TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
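Any of these values can still be overridden per run with -s on the command line, the same mechanism the removed pipeline used to receive its filename setting. An illustrative invocation restoring the old crawl pacing (values hypothetical):

    scrapy crawl noticias -a year=2017 -a month=4 -a day=23 \
           -s DOWNLOAD_DELAY=0.5 -s COOKIES_ENABLED=False \
           -O 2017-04-23.json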
spiders/daily/heraldoLeon/heraldoLeon/spiders/noticias.py  (view file @ ca9d5998)

 # -*- coding: utf-8 -*-
-import scrapy, re
-from heraldoLeon.items import NoticiasItem
 """
-OUTLET:
-El Heraldo de León, Guanajuato
-USAGE:
-scrapy crawl noticias --nolog -s filename=2018-01-29.json -a year=2018 -a month=1 -a day=29
+Spider for heraldoleon.mx
+Author: Mario Chirinos Coluga
+Usage: scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
 """
+import scrapy
+import re
+from heraldoLeon.items import HeraldoleonItem

+#-------------------------------------------------------------------------------
 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
     return TAG_RE.sub('', text)

-LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
-DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
-TRASH_RE = re.compile(r'#[^\}]+\}')

-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
-
-    def start_requests(self):
-        year = getattr(self, "year", None)
-        month = getattr(self, "month", None)
-        day = getattr(self, "day", None)
-        self.baseURL = "http://www.heraldoleon.mx/" + year + "/" + month + "/" + day
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-
-    def parse(self, response):
-        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-        lastPage = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/@href').extract_first()
-        if lastPage is None:
-            lastPage = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a/@href').extract()[-1]
-        if lastPage is not None and lastPage != '':
-            lastPage = lastPage.strip('/')
-            lastPage = int(lastPage[lastPage.rfind('/') + 1:])
-            for page in range(1, lastPage):
-                yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
-
-    def parse_page(self, response):
-        for link in response.xpath('//*[@class="td-block-row"]').css('h3').css('a::attr(href)').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        # the date obtained already includes format and time zone
-        item['date'] = response.xpath('//time[@class="entry-date updated td-module-date"]/@datetime').extract_first()
-        item['title'] = remove_tags(response.xpath('//header[@class="td-post-title"]/h1').extract_first())
-        item['topic'] = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
-        author = response.xpath('//pre[@style="text-align: justify;"]/text()').extract_first()
-        if author is not None and author != '':
-            item['author'] = " ".join(author.split("\n")[0].split())
-        bodyText = response.xpath('//*[@class="td-post-content"]/p').extract()
-        for i in range(0, len(bodyText)):
-            p = remove_tags(bodyText[i])
-            if i <= 1:
-                p = p.lstrip()
-                result = LOC_RE.match(p)
-                if result:
-                    item['location'] = DAT_RE.sub('', result.group(0))
-                    p = LOC_RE.sub('', p)
-            text += p + "\n"
-        item['text'] = TRASH_RE.sub('', text).strip()
-        item['url'] = response.url
-        yield item

+#-------------------------------------------------------------------------------
+class NoticiasSpider(scrapy.Spider):
+    name = 'noticias'
+    allowed_domains = ['heraldoleon.mx']
+    start_urls = ['http://heraldoleon.mx/']
+
+    def start_requests(self):
+        self.year = getattr(self, "year", None)
+        self.month = getattr(self, "month", None)
+        self.day = getattr(self, "day", None)
+        self.baseURL = "http://www.heraldoleon.mx/" + self.year + "/" + self.month + "/" + self.day
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+
+    #-----------------------------------------------------------------------
+    def parse(self, response):
+        print(response.url)
+        for link in response.xpath('//h3[@class="entry-title td-module-title"]/a/@href').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+
+        next_page = response.xpath('//div[@class="page-nav td-pb-padding-side"]/a/i[@class="td-icon-menu-right"]/../@href').extract_first()
+        print("nextPage", next_page)
+        if next_page is not None:
+            yield scrapy.Request(url=next_page, callback=self.parse)
+
+    #-------------------------------------------------------------------------------
+    def parse_item(self, response):
+        # print(response.url)
+        item = HeraldoleonItem()
+        item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
+        item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
+
+        text = ""
+        for p in response.xpath('//div[@class="td-post-content"]/p/text()').extract():
+            nt = remove_tags(p).replace("\n", "").replace("\r", "").strip()
+            text += nt
+            if len(nt) > 0:
+                text += "\n"
+        item['text'] = text.strip()
+        item['topic'] = ", ".join(response.xpath('//ul[@class="td-tags td-post-small-box clearfix"]/li/a/text()').extract())
+        item['url'] = response.url
+        item["author"] = ", ".join(response.xpath('//div[@class="td-post-source-via "]/div/a/text()').extract())
+        item["location"] = ""
+        print(self.allowed_domains, item["title"])
+        yield item
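As background for the deleted LOC_RE/DAT_RE pair: the old parse_item used them to split a dateline such as "León, Guanajuato, 29-enero .-" off the lead paragraph. A self-contained sketch of that behavior (the sample paragraph is invented):

    import re

    LOC_RE = re.compile(r'.+?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')
    DAT_RE = re.compile(r'\s?(\d{1,2}-?[a-zA-Z]+)?\s?\.-\s?')

    p = "León, Guanajuato, 29-enero .- El ayuntamiento anunció nuevas obras."  # invented sample
    result = LOC_RE.match(p)
    if result:
        location = DAT_RE.sub('', result.group(0))  # -> "León, Guanajuato,"
        p = LOC_RE.sub('', p)                       # -> "El ayuntamiento anunció nuevas obras."
        print(location, "|", p)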
spiders/daily/heraldoLeon/scrapy.cfg  (view file @ ca9d5998)

 # Automatically created by: scrapy startproject
 #
 # For more information about the [deploy] section see:
-# https://scrapyd.readthedocs.org/en/latest/deploy.html
+# https://scrapyd.readthedocs.io/en/latest/deploy.html

 [settings]
 default = heraldoLeon.settings
 ...
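The commit only updates the docs URL here; the [deploy] section that link documents is not defined in this file. If one were added for a scrapyd setup, it would look roughly like this (the daemon URL and project name are assumptions, not part of this repo):

    [settings]
    default = heraldoLeon.settings

    [deploy]
    url = http://localhost:6800/
    project = heraldoLeon

With such a section in place, running scrapyd-deploy (from the scrapyd-client package) would push the project to that daemon.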