m3 / crawlersNoticias

Commit eaa37668
Authored 6 years ago by Renán Sosa Guillen
Parent: fa10af53

    crawlers

Showing 26 changed files with 1181 additions and 2 deletions (+1181 −2)
descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py        +7    -2
descarga_por_dia/diarioPuntual/2018-09-05.json                      +4    -0
descarga_por_dia/diarioPuntual/diarioPuntual/__init__.py            +0    -0
descarga_por_dia/diarioPuntual/diarioPuntual/items.py               +20   -0
descarga_por_dia/diarioPuntual/diarioPuntual/middlewares.py         +103  -0
descarga_por_dia/diarioPuntual/diarioPuntual/pipelines.py           +75   -0
descarga_por_dia/diarioPuntual/diarioPuntual/settings.py            +90   -0
descarga_por_dia/diarioPuntual/diarioPuntual/spiders/__init__.py    +4    -0
descarga_por_dia/diarioPuntual/diarioPuntual/spiders/noticias.py    +89   -0
descarga_por_dia/diarioPuntual/scrapy.cfg                           +11   -0
descarga_por_dia/elComentario/elComentario/__init__.py              +0    -0
descarga_por_dia/elComentario/elComentario/items.py                 +20   -0
descarga_por_dia/elComentario/elComentario/middlewares.py           +103  -0
descarga_por_dia/elComentario/elComentario/pipelines.py             +75   -0
descarga_por_dia/elComentario/elComentario/settings.py              +90   -0
descarga_por_dia/elComentario/elComentario/spiders/__init__.py      +4    -0
descarga_por_dia/elComentario/elComentario/spiders/noticias.py      +76   -0
descarga_por_dia/elSur/elSur/__init__.py                            +0    -0
descarga_por_dia/elSur/elSur/items.py                               +20   -0
descarga_por_dia/elSur/elSur/middlewares.py                         +103  -0
descarga_por_dia/elSur/elSur/pipelines.py                           +75   -0
descarga_por_dia/elSur/elSur/settings.py                            +90   -0
descarga_por_dia/elSur/elSur/spiders/__init__.py                    +4    -0
descarga_por_dia/elSur/elSur/spiders/noticias.py                    +96   -0
descarga_por_dia/elSur/scrapy.cfg                                   +11   -0
descarga_por_dia/cuartoPoder/cuartoPoder/spiders/noticias.py

@@ -99,6 +99,11 @@ class QuotesSpider(scrapy.Spider):
         item = NoticiasItem()
         text = ''
+        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
+        title = response.css('div.post-title').css('h1').extract_first()
+        if title is not None: title = remove_tags(title)
+        topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
+        if topic is not None: topic = remove_tags(topic)

@@ -107,8 +112,8 @@ class QuotesSpider(scrapy.Spider):
             text += p + "\n"

         ## News item info ##
-        item['date'] = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")
-        item['title'] = remove_tags(response.css('div.post-title').css('h1').extract_first())
+        item['date'] = news_date
+        item['title'] = title
         item['topic'] = topic
         item['text'] = text.strip()
         item['url'] = response.url
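The refactor above swaps the old inline pattern, item['title'] = remove_tags(response.css(...).extract_first()), for extract-then-guard assignments. The point of the guard is that extract_first() returns None when the selector matches nothing, and passing None into re.sub raises a TypeError that kills the item. A minimal sketch of the failure mode and the guarded form (the values are illustrative, not from the site):

import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

extracted = None  # what extract_first() returns when the selector matches nothing
# remove_tags(extracted)  # would raise TypeError: expected string or buffer

# The guarded form used in the diff keeps the item alive with a null title:
title = remove_tags(extracted) if extracted is not None else None
print(title)  # -> None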
descarga_por_dia/diarioPuntual/2018-09-05.json (new file, mode 100644; diff collapsed, content not shown)

descarga_por_dia/diarioPuntual/diarioPuntual/__init__.py (new file, mode 100644; empty)
descarga_por_dia/diarioPuntual/diarioPuntual/items.py
0 → 100644
View file @
eaa37668
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import
scrapy
class
NoticiasItem
(
scrapy
.
Item
):
# define the fields for your item here like:
# name = scrapy.Field()
title
=
scrapy
.
Field
()
text
=
scrapy
.
Field
()
date
=
scrapy
.
Field
()
location
=
scrapy
.
Field
()
author
=
scrapy
.
Field
()
topic
=
scrapy
.
Field
()
url
=
scrapy
.
Field
()
descarga_por_dia/diarioPuntual/diarioPuntual/middlewares.py (new file)

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class DiariopuntualSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DiariopuntualDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
descarga_por_dia/diarioPuntual/diarioPuntual/pipelines.py (new file)

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
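Because JsonWriterPipeline writes an opening "[" in open_spider, a comma plus newline before every record after the first, and a closing "]" in close_spider, the finished file is a single JSON array (assuming the crawl ran to completion so close_spider fired). A small sketch of reading one back, using the filename from the USAGE examples:

import json

# Produced by: scrapy crawl noticias --nolog -s filename=2018-09-05.json ...
with open('2018-09-05.json') as fh:
    articles = json.load(fh)  # list of dicts, keys in the pipeline's insertion order

print(len(articles))
print(articles[0]['url'] if articles else None)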
descarga_por_dia/diarioPuntual/diarioPuntual/settings.py (new file)

# -*- coding: utf-8 -*-

# Scrapy settings for diarioPuntual project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'diarioPuntual'

SPIDER_MODULES = ['diarioPuntual.spiders']
NEWSPIDER_MODULE = 'diarioPuntual.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioPuntual (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'diarioPuntual.middlewares.DiariopuntualSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'diarioPuntual.middlewares.DiariopuntualDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'diarioPuntual.pipelines.JsonWriterPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
descarga_por_dia/diarioPuntual/diarioPuntual/spiders/__init__.py (new file)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_dia/diarioPuntual/diarioPuntual/spiders/noticias.py (new file)

# -*- coding: utf-8 -*-
"""
MEDIA:
Puntual, EDOMEX

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd diarioPuntual/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""

import scrapy, re
from diarioPuntual.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class UTC(tzinfo):
    """
    Class for Time Zone
    """

    def utcoffset(self, dt):
        ## Time zone for EDOMEX: UTC-6 ##
        return timedelta(hours=-6)

    def tzname(self, dt):
        ## Time zone name ##
        return 'UTC-6'


class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        self.tz = UTC()
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        baseURL = "http://diario-puntual.com.mx/{0}/{1}/{2}/".format(self.year, self.month.zfill(2), self.day.zfill(2))

        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        for link in response.css('div.post-column').css('h2.posttitle > a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

        next_page = response.css('div.archive-pagination').xpath('./a[@class="next page-numbers"]/@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        title = response.css('div.post-container').css('h1.post-title').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = None

        for p in response.css('div.post-column > article').css('p').extract():
            p = remove_tags(p)
            text += p + "\n"

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url
        yield item
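This spider has no article:published_time metadata to lean on, so it stamps every item with the requested crawl date, and the UTC tzinfo class pins the offset at UTC-6 for EDOMEX so that isoformat("T") yields an offset-aware string. A quick check of the format, using the date from the USAGE example:

from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    def utcoffset(self, dt):
        return timedelta(hours=-6)  # EDOMEX: UTC-6
    def tzname(self, dt):
        return 'UTC-6'

print(datetime(2018, 9, 5, tzinfo=UTC()).isoformat("T"))
# -> 2018-09-05T00:00:00-06:00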
descarga_por_dia/diarioPuntual/scrapy.cfg (new file)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = diarioPuntual.settings

[deploy]
#url = http://localhost:6800/
project = diarioPuntual
descarga_por_dia/elComentario/elComentario/__init__.py (new file, mode 100644; empty)
descarga_por_dia/elComentario/elComentario/items.py (new file)

# Identical to diarioPuntual/items.py above: a NoticiasItem(scrapy.Item) with the
# fields title, text, date, location, author, topic and url.
descarga_por_dia/elComentario/elComentario/middlewares.py (new file)

# The stock Scrapy middleware template, identical to diarioPuntual/middlewares.py
# above except for the class names: ElcomentarioSpiderMiddleware and
# ElcomentarioDownloaderMiddleware.
descarga_por_dia/elComentario/elComentario/pipelines.py (new file)

# Identical to diarioPuntual/pipelines.py above: the same JsonWriterPipeline,
# line for line.
descarga_por_dia/elComentario/elComentario/settings.py (new file)

# The same Scrapy settings template as diarioPuntual/settings.py above, with the
# project-specific values swapped in. The active (uncommented) settings:

BOT_NAME = 'elComentario'

SPIDER_MODULES = ['elComentario.spiders']
NEWSPIDER_MODULE = 'elComentario.spiders'

DOWNLOAD_DELAY = 0.5

COOKIES_ENABLED = False

ITEM_PIPELINES = {
    'elComentario.pipelines.JsonWriterPipeline': 300,
}
descarga_por_dia/elComentario/elComentario/spiders/__init__.py (new file)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_dia/elComentario/elComentario/spiders/noticias.py (new file)

# -*- coding: utf-8 -*-
"""
MEDIA:
El Comentario, Colima

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elComentario/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""

import scrapy, re
from elComentario.items import NoticiasItem

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))

        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        for link in response.css('div.articles').xpath('./article/div[@class="cnt"]/h3/a/@href').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

        next_page = response.css('div.post-pagination').xpath('./a[@title="Next page"]/@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        title = response.xpath('//header/h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('a.theme').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        for p in response.css('div.pf-content').css('p').extract():
            p = remove_tags(p)
            text += p + "\n"
        text = text.strip()

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text
        item['url'] = response.url
        yield item
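Unlike diarioPuntual, this spider takes the date straight from the page's article:published_time meta tag, which is already in ISO 8601 form, so no tzinfo class is needed. A short sketch of that extraction plus remove_tags on a made-up snippet (only the selectors come from the spider; the markup is invented for the demo):

import re
from scrapy.selector import Selector

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

html = ('<html><head><meta property="article:published_time" '
        'content="2018-09-05T10:30:00-06:00"></head>'
        '<body><header><h1>Some <em>headline</em></h1></header></body></html>')
sel = Selector(text=html)

print(sel.xpath('//meta[@property="article:published_time"]/@content').extract_first())
# -> 2018-09-05T10:30:00-06:00
print(remove_tags(sel.xpath('//header/h1').extract_first()))
# -> Some headline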
descarga_por_dia/elComentario/scrapy.cfg (new file)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = elComentario.settings

[deploy]
#url = http://localhost:6800/
project = elComentario
descarga_por_dia/elSur/elSur/__init__.py (new file, mode 100644; empty)
descarga_por_dia/elSur/elSur/items.py (new file)

# Identical to diarioPuntual/items.py above: a NoticiasItem(scrapy.Item) with the
# fields title, text, date, location, author, topic and url.
descarga_por_dia/elSur/elSur/middlewares.py (new file)

# The stock Scrapy middleware template, identical to diarioPuntual/middlewares.py
# above except for the class names: ElsurSpiderMiddleware and
# ElsurDownloaderMiddleware.
descarga_por_dia/elSur/elSur/pipelines.py (new file)

# Identical to diarioPuntual/pipelines.py above: the same JsonWriterPipeline,
# line for line.
descarga_por_dia/elSur/elSur/settings.py (new file)

# The same Scrapy settings template as diarioPuntual/settings.py above, with the
# project-specific values swapped in. The active (uncommented) settings:

BOT_NAME = 'elSur'

SPIDER_MODULES = ['elSur.spiders']
NEWSPIDER_MODULE = 'elSur.spiders'

DOWNLOAD_DELAY = 0.5

COOKIES_ENABLED = False

ITEM_PIPELINES = {
    'elSur.pipelines.JsonWriterPipeline': 300,
}
descarga_por_dia/elSur/elSur/spiders/__init__.py (new file)

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
descarga_por_dia/elSur/elSur/spiders/noticias.py (new file)

# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""

import scrapy, re
from elSur.items import NoticiasItem

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')


class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)

        baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))

        yield scrapy.Request(url=baseURL, callback=self.parse)

    def parse(self, response):
        for link in response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

        pag_lst = response.css('div.dslc-pagination > ul > li')
        if len(pag_lst) > 0:
            del pag_lst[0]
            del pag_lst[0]
            next_page = None
            for li_obj in pag_lst:
                li = remove_tags(li_obj.extract())
                if not li.isdigit():
                    next_page = li_obj.xpath('./a/@href').extract_first()
                    break
            if next_page is not None:
                yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()

        title = response.css('div.dslc-tp-title > h1').extract_first()
        if title is not None:
            title = remove_tags(title)

        topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first()
        if topic is not None:
            topic = remove_tags(topic)

        for p in response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract():
            p = remove_tags(p)
            text += p + "\n"

        dateline = response.css('span.dateline').extract_first()
        if dateline is not None:
            dateline = remove_tags(dateline)
            text = text.replace(dateline, '')

        text = text.replace(u'\u00a0', ' ')
        text = HEAD_RE_1.sub('', text)
        text = HEAD_RE_2.sub('', text)

        ## News item info ##
        item['date'] = news_date
        item['title'] = title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url
        yield item
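The two HEAD_RE patterns strip El Sur's byline header from the top of the body text: a "Texto: ... / Foto: ..." or "Texto y foto: ..." credit line, optionally followed by a dateline ending in a long-form Spanish date. A worked example against HEAD_RE_1, with an invented byline:

import re

HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')

text = ('Texto: Fulano de Tal / Foto: Mengano Lopez\n'
        'Acapulco, Guerrero, 5 de septiembre de 2018. El resto de la nota sigue aqui.')
print(HEAD_RE_1.sub('', text))
# -> ' El resto de la nota sigue aqui.'
# (the credit line and the dateline sentence are gone; the leading space survives)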
descarga_por_dia/elSur/scrapy.cfg (new file)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = elSur.settings

[deploy]
#url = http://localhost:6800/
project = elSur