Commit 8dda8d2b authored 6 years ago by Renán Sosa Guillen
crawlers
parent 566f8f06
Showing 8 changed files with 475 additions and 0 deletions
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/__init__.py             +0   -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/items.py                +20  -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/middlewares.py          +103 -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/pipelines.py            +75  -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/settings.py             +90  -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/__init__.py     +4   -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/noticias.py     +172 -0
descarga_hacia_atras/diarioDeChiapas/scrapy.cfg                              +11  -0
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/__init__.py (new empty file, mode 100644)
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/items.py (new file, mode 100644)
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
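Once a spider populates it, a NoticiasItem behaves like a dictionary keyed by the fields declared above. A minimal sketch for reference (the values assigned here are made up for illustration):

from diarioDeChiapas.items import NoticiasItem

item = NoticiasItem()
item['title'] = "Example headline"    # made-up value
item['topic'] = "editorial"           # made-up value, matches one of the spider's sections
item['url'] = "http://www.diariodechiapas.com/landing/editorial"

# Only the fields that were actually assigned are present
print(dict(item))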
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/middlewares.py (new file, mode 100644)
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class DiariodechiapasSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DiariodechiapasDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/pipelines.py (new file, mode 100644)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)

        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))

        return item
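JsonWriterPipeline writes every scraped item into a single JSON array (an opening "[", one json.dumps object per item separated by ",\n", and a closing "]"), so the output file can be read back with the standard json module. A minimal sketch, assuming the spider was run with -s filename=noticias.json as described in spiders/noticias.py:

import json

# File name passed on the command line via `-s filename=noticias.json`
with open("noticias.json") as f:
    articles = json.load(f)    # one list with one dict per scraped item

for article in articles[:3]:
    # each dict only carries the fields that were present on the item
    print(article.get("date"), article.get("title"))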
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/settings.py (new file, mode 100644)
# -*- coding: utf-8 -*-
# Scrapy settings for diarioDeChiapas project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioDeChiapas'

SPIDER_MODULES = ['diarioDeChiapas.spiders']
NEWSPIDER_MODULE = 'diarioDeChiapas.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioDeChiapas (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'diarioDeChiapas.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/__init__.py (new file, mode 100644)
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_hacia_atras/diarioDeChiapas/diarioDeChiapas/spiders/noticias.py (new file, mode 100644)
# -*- coding: utf-8 -*-
"""
MEDIA:
Amandala, Belice
USAGE
$ cd diarioDeChiapas
// Si se quiere obtener todas las noticias desde las más actuales hasta las más antiguas. //
$ scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// Si se quiere obtener todas las noticias desde las más actuales hasta una fecha específica. //
$ scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=5
-------------------------------------------------------------------------------------------------
Después será necesario hacer uso del archivo parse_date_files.py para que las noticias contenidas
en noticias.json sean separadas en archivos por fecha.
"""
import scrapy, re, json
from datetime import datetime, date
from diarioDeChiapas.items import NoticiasItem


TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')


class ImportantData(scrapy.Item):
    continue_searching = scrapy.Field()
    last_link = scrapy.Field()
    section = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        if year is not None and month is not None and day is not None:
            self.stopDate = date(int(year), int(month), int(day))
        else:
            self.stopDate = None

        baseURL = "http://www.diariodechiapas.com/landing/"
        section_list = ["editorial", "portada", "metropoli", "region", "la-roja",
                        "deportes", "boga", "ae", "opinion-dia", "trascendio"]
        # section_list = ["editorial"]

        if self.stopDate is None:
            for s in section_list:
                yield scrapy.Request(url=baseURL + s, callback=self.parse)
        else:
            for s in section_list:
                flow_info = ImportantData()
                flow_info['continue_searching'] = False

                request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
                request.meta['item'] = flow_info
                yield request

    def parse(self, response):
        link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()
        section = response.xpath('//section[@class="wrapper"]/h1').extract_first()
        if section is not None:
            section = remove_tags(section)

        for link in link_list:
            flow_info = ImportantData()
            flow_info['section'] = section

            request = scrapy.Request(url=link, callback=self.parse_item)
            request.meta['item'] = flow_info
            yield request

        next_page = response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_with_stop_date(self, response):
        flow_info = response.meta['item']

        if not flow_info['continue_searching']:
            link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()

            for link in link_list:
                flow_info = ImportantData()
                flow_info['url'] = response.url
                if link == link_list[-1]:
                    flow_info['last_link'] = True
                else:
                    flow_info['last_link'] = False

                request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
                request.meta['item'] = flow_info
                yield request

        else:
            next_page = response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
            if next_page is not None:
                flow_info['continue_searching'] = False
                request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
                request.meta['item'] = flow_info
                yield request

    def parse_item(self, response):
        flow_info = response.meta['item']
        item = NoticiasItem()
        text = ''

        title = response.xpath('//section[@class="single__content"]/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        for p in response.xpath('//section[@class="single__content"]').css('p').extract():
            text += remove_tags(p) + "\n"

        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['topic'] = flow_info['section']
        item['title'] = title
        item['text'] = text.strip()
        item['url'] = response.url

        yield item

    def parse_item_with_stop_date(self, response):
        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        dt = datetime.strptime(d, '%Y-%m-%d').date()

        if dt >= self.stopDate:
            flow_info = response.meta['item']
            item = NoticiasItem()
            text = ''

            item['date'] = datetime.strptime(d, '%Y-%m-%d').isoformat("T")
            item['title'] = remove_tags(response.xpath('//div[@class="active"]/h1/a').extract_first())

            try:
                topic = response.css('div.date').css('span.date').css('a::text').extract()[0]
            except:
                topic = None
            item['topic'] = topic

            for p in response.css('div.content').css('p').extract():
                text += remove_tags(p) + "\n"

            item['text'] = text.strip()
            item['url'] = response.url

            yield item

            if flow_info['last_link']:
                flow_info['continue_searching'] = True
                request = scrapy.Request(url=flow_info['url'], callback=self.parse_with_stop_date, dont_filter=True)
                request.meta['item'] = flow_info
                yield request
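The docstring at the top of this file refers to a parse_date_files.py script that splits noticias.json into per-date files; that script is not included in this commit. A hypothetical sketch of the kind of split it describes, assuming the 'date' field carries an ISO-style timestamp as produced above:

import json
from collections import defaultdict

def split_by_date(src="noticias.json"):
    # Hypothetical helper; the real parse_date_files.py is not part of this commit.
    with open(src) as f:
        articles = json.load(f)

    groups = defaultdict(list)
    for article in articles:
        # keep only the YYYY-MM-DD portion of the ISO timestamp
        day = (article.get("date") or "unknown")[:10]
        groups[day].append(article)

    for day, items in groups.items():
        with open("noticias_{}.json".format(day), "w") as out:
            json.dump(items, out)

if __name__ == "__main__":
    split_by_date()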
descarga_hacia_atras/diarioDeChiapas/scrapy.cfg (new file, mode 100644)
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioDeChiapas.settings
[deploy]
#url = http://localhost:6800/
project = diarioDeChiapas