m3 / crawlersNoticias · Commits

Commit 12e7cc76 authored 3 years ago by Mario Chirinos Colunga

update

parent 1699ec10

Showing 26 changed files with 0 additions and 1648 deletions (+0 -1648)

__init__.py       ...cated_versions/laJornada_deprecated/laJornada/__init__.py   +0 -0
items.py          deprecated_versions/laJornada_deprecated/laJornada/items.py    +0 -20
middlewares.py    ...ed_versions/laJornada_deprecated/laJornada/middlewares.py   +0 -56
pipelines.py      ...ated_versions/laJornada_deprecated/laJornada/pipelines.py   +0 -75
settings.py       ...cated_versions/laJornada_deprecated/laJornada/settings.py   +0 -90
__init__.py       ...rsions/laJornada_deprecated/laJornada/spiders/__init__.py   +0 -4
noticias.py       ...rsions/laJornada_deprecated/laJornada/spiders/noticias.py   +0 -682
scrapy.cfg        deprecated_versions/laJornada_deprecated/scrapy.cfg            +0 -11
scrapy.cfg        deprecated_versions/tribunaHn_deprecated/scrapy.cfg            +0 -11
__init__.py       ...cated_versions/tribunaHn_deprecated/tribunaHn/__init__.py   +0 -0
items.py          deprecated_versions/tribunaHn_deprecated/tribunaHn/items.py    +0 -20
middlewares.py    ...ed_versions/tribunaHn_deprecated/tribunaHn/middlewares.py   +0 -56
pipelines.py      ...ated_versions/tribunaHn_deprecated/tribunaHn/pipelines.py   +0 -75
settings.py       ...cated_versions/tribunaHn_deprecated/tribunaHn/settings.py   +0 -90
__init__.py       ...rsions/tribunaHn_deprecated/tribunaHn/spiders/__init__.py   +0 -4
noticias.py       ...rsions/tribunaHn_deprecated/tribunaHn/spiders/noticias.py   +0 -119
2017-03-22.json   deprecated_versions/yucatanALaMano_20190918/2017-03-22.json    +0 -1
2019-09-17.json   deprecated_versions/yucatanALaMano_20190918/2019-09-17.json    +0 -1
scrapy.cfg        deprecated_versions/yucatanALaMano_20190918/scrapy.cfg         +0 -11
__init__.py       ...rsions/yucatanALaMano_20190918/yucatanALaMano/__init__.py   +0 -0
items.py          ..._versions/yucatanALaMano_20190918/yucatanALaMano/items.py   +0 -20
middlewares.py    ...ons/yucatanALaMano_20190918/yucatanALaMano/middlewares.py   +0 -56
pipelines.py      ...sions/yucatanALaMano_20190918/yucatanALaMano/pipelines.py   +0 -75
settings.py       ...rsions/yucatanALaMano_20190918/yucatanALaMano/settings.py   +0 -90
__init__.py       ...ucatanALaMano_20190918/yucatanALaMano/spiders/__init__.py   +0 -4
noticias.py       ...ucatanALaMano_20190918/yucatanALaMano/spiders/noticias.py   +0 -77

deprecated_versions/laJornada_deprecated/laJornada/__init__.py   deleted 100644 → 0

deprecated_versions/laJornada_deprecated/laJornada/items.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

deprecated_versions/laJornada_deprecated/laJornada/middlewares.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LajornadaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
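
The class above is the unmodified spider-middleware template that scrapy startproject generates, and nothing in this project turns it on: the SPIDER_MIDDLEWARES entry in the settings.py below stays commented out. For reference, a minimal sketch of what enabling it would look like (hypothetical; it simply uncomments the entry shown further down):

SPIDER_MIDDLEWARES = {
    'laJornada.middlewares.LajornadaSpiderMiddleware': 543,  # lower numbers sit closer to the engine
}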

deprecated_versions/laJornada_deprecated/laJornada/pipelines.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []

        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1

        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))

        return item
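
The pipeline above assembles one large JSON array by hand: open_spider writes "[", the first item is written as-is, every later item is prefixed with ",\n", and close_spider writes "]". A standalone sketch of that pattern (not repository code; the file name and records are made up) showing that the output parses back as a list:

import json
from collections import OrderedDict

records = [OrderedDict([("title", "first")]), OrderedDict([("title", "second")])]

with open("example.json", "w") as f:           # stands in for the 'filename' setting
    f.write("[")                               # open_spider
    for counter, line in enumerate(records, start=1):
        if counter == 1:
            f.write(json.dumps(line))          # first item: no separator
        else:
            f.write(",\n" + json.dumps(line))  # later items: separator goes in front
    f.write("]")                               # close_spider

with open("example.json") as f:
    assert json.load(f) == [{"title": "first"}, {"title": "second"}]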

deprecated_versions/laJornada_deprecated/laJornada/settings.py   deleted 100644 → 0

# -*- coding: utf-8 -*-
# Scrapy settings for laJornada project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornada'

SPIDER_MODULES = ['laJornada.spiders']
NEWSPIDER_MODULE = 'laJornada.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornada.middlewares.LajornadaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
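
Since JsonWriterPipeline reads its output path from the custom filename setting, which has no default here, a crawl has to pass it with -s. Assuming the laJornada spider is registered as noticias like the other spiders in this commit, the invocation would follow the form the spider docstrings use:

scrapy crawl noticias --nolog -s filename=noticias.json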

deprecated_versions/laJornada_deprecated/laJornada/spiders/__init__.py   deleted 100644 → 0

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

deprecated_versions/laJornada_deprecated/laJornada/spiders/noticias.py   deleted 100644 → 0
(diff collapsed; 682 deleted lines not shown)

deprecated_versions/laJornada_deprecated/scrapy.cfg   deleted 100644 → 0

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornada.settings
[deploy]
#url = http://localhost:6800/
project = laJornada

deprecated_versions/tribunaHn_deprecated/scrapy.cfg   deleted 100644 → 0

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn

deprecated_versions/tribunaHn_deprecated/tribunaHn/__init__.py   deleted 100644 → 0

deprecated_versions/tribunaHn_deprecated/tribunaHn/items.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

deprecated_versions/tribunaHn_deprecated/tribunaHn/middlewares.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TribunahnSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

deprecated_versions/tribunaHn_deprecated/tribunaHn/pipelines.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []

        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1

        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))

        return item

deprecated_versions/tribunaHn_deprecated/tribunaHn/settings.py   deleted 100644 → 0

# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaHn'

SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

deprecated_versions/tribunaHn_deprecated/tribunaHn/spiders/__init__.py   deleted 100644 → 0

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

deprecated_versions/tribunaHn_deprecated/tribunaHn/spiders/noticias.py   deleted 100644 → 0

# -*- coding: utf-8 -*-
"""
===============================================================================
THIS VERSION OF La Tribuna Honduras IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
CHANGED ITS ACCESS TO ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia/foraneos FOLDER.
===============================================================================
"""

import scrapy, re, json
from datetime import date
from tribunaHn.items import NoticiasItem

"""
MEDIA OUTLET:
La Tribuna, Honduras
USAGE:
// To download every news item, from the most recent back to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// To download every news item, from the most recent back to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=29
-------------------------------------------------------------------------------------------------
Afterwards, parse_date_files.py has to be used so that the news items contained in noticias.json
are split into one file per date.
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')


class ImportantData(scrapy.Item):
    section = scrapy.Field()
    page = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        if self.year is not None and self.month is not None and self.day is not None:
            self.stopDate = date(int(self.year), int(self.month), int(self.day))
        else:
            self.stopDate = None

        baseURL = "http://www.latribuna.hn/"

        sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles",
                       "desde-usa", "ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea",
                       "enfoques", "pecadillos-idiomaticos", "pildoritas", "columnistas",
                       "editorial", "tribuna-del-pueblo", "anales-historicos", "cine",
                       "dejando-huellas", "dia-7", "dominicales", "done-un-aula", "especiales-lt",
                       "la-cobra-pregunta", "la-tribuna-agropecuaria", "la-tribuna-cultural",
                       "nuestro-orgullo", "turismo"]
        # sectionList = ["noticias"]

        for s in sectionList:
            yield scrapy.Request(url=baseURL + s, callback=self.parse)

    def parse(self, response):
        CONTINUE_SEARCHING = True

        linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
        linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())

        if self.stopDate is None:
            for link in linkList:
                yield scrapy.Request(url=link, callback=self.parse_item)

        else:
            for link in linkList:
                res = DAT_RE.search(link)
                if res:
                    dat = map(int, res.group(0).split("/"))
                    newsDate = date(dat[0], dat[1], dat[2])
                    if newsDate >= self.stopDate:
                        yield scrapy.Request(url=link, callback=self.parse_item)
                    else:
                        CONTINUE_SEARCHING = False
                        break

        if CONTINUE_SEARCHING:
            nextPage = response.xpath('//span[@class="next"]/a/@href').extract_first()
            if nextPage is not None:
                yield scrapy.Request(url=nextPage, callback=self.parse)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        # The date obtained already comes formatted and carries its time zone.
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = remove_tags(response.xpath('//header/h1[@itemprop="name"]').extract_first())

        try:
            topic = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]
        except:
            topic = None
        item['topic'] = topic

        for p in response.css('div.article-post-content').css('p').extract():
            text += remove_tags(p) + "\n"
        item['text'] = text.strip()

        item['url'] = response.url

        yield item
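
The stop-date branch above depends on La Tribuna article URLs carrying a yyyy/mm/dd segment that DAT_RE can extract and compare against stopDate. A standalone sketch of just that check, with a made-up URL (list(map(...)) is added so the sketch also runs on Python 3; the original file is Python 2, where map() already returns an indexable list):

import re
from datetime import date

DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
stopDate = date(2018, 2, 20)

link = "http://www.latribuna.hn/2018/02/22/example-article/"  # hypothetical URL
res = DAT_RE.search(link)
if res:
    dat = list(map(int, res.group(0).split("/")))
    newsDate = date(dat[0], dat[1], dat[2])
    print(newsDate >= stopDate)  # True: the article is newer than the cutoff, so it would be requested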

deprecated_versions/yucatanALaMano_20190918/2017-03-22.json   deleted 100644 → 0

[]
\ No newline at end of file

deprecated_versions/yucatanALaMano_20190918/2019-09-17.json   deleted 100644 → 0

[]
\ No newline at end of file

deprecated_versions/yucatanALaMano_20190918/scrapy.cfg   deleted 100644 → 0

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = yucatanALaMano.settings
[deploy]
#url = http://localhost:6800/
project = yucatanALaMano

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/__init__.py   deleted 100644 → 0

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/items.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/middlewares.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class YucatanalamanoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/pipelines.py   deleted 100644 → 0

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []

        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1

        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))

        return item

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/settings.py   deleted 100644 → 0

# -*- coding: utf-8 -*-
# Scrapy settings for yucatanALaMano project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'yucatanALaMano'

SPIDER_MODULES = ['yucatanALaMano.spiders']
NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yucatanALaMano.middlewares.YucatanalamanoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yucatanALaMano.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/spiders/__init__.py   deleted 100644 → 0

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

deprecated_versions/yucatanALaMano_20190918/yucatanALaMano/spiders/noticias.py   deleted 100644 → 0

# -*- coding: utf-8 -*-
import scrapy, re
from yucatanALaMano.items import NoticiasItem

"""
MEDIA OUTLET:
Yucatán a la Mano, Yuc.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)

        # self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
        # self.baseURL = "http://yucatanalamano.com/" + year + "/" + month + "/" + day
        urlList = ["http://www.yucatanalamano.com/" + year + "/" + month + "/" + day,
                   "http://yucatanalamano.com/" + year + "/" + month + "/" + day]

        for url in urlList:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        pagination = response.css('div.pagination').css('a::attr(href)').extract()
        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])

            for page in range(1, pages):
                yield scrapy.Request(url=response.url + "/page/" + str(page + 1), callback=self.parse_page)

    def parse_page(self, response):
        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''

        title = response.xpath('//div[@class="main-col"]/div[@itemprop="name"]/text()').extract_first()
        if title is None:
            title = response.xpath('//div[@class="main-col"]').css('h1').extract_first()

        if title is not None:
            item['title'] = remove_tags(title)
        else:
            item['title'] = title

        d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
        ## '-06:00' corresponds to UTC-6, the Yucatán time zone (central Mexico)
        if d[-6:] != '-06:00':
            d = d[:-6] + '-06:00'
        item['date'] = d

        item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()

        for paragraph in response.css('div.entry-content').css('p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text

        item['url'] = response.url
        # print item['title']
        yield item
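
The date handling above never parses the timestamp; it only looks at the last six characters of the datetime attribute and pins them to the fixed UTC-6 offset used for Yucatán. A standalone sketch of that normalization with made-up input strings:

def force_utc_minus_6(d):
    # Rewrite whatever offset the page reported to '-06:00', as the spider does.
    if d[-6:] != '-06:00':
        d = d[:-6] + '-06:00'
    return d

print(force_utc_minus_6("2017-03-22T10:15:00+00:00"))   # 2017-03-22T10:15:00-06:00
print(force_utc_minus_6("2017-03-22T10:15:00-06:00"))   # unchanged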