m3 / crawlersNoticias / Commits

Commit 9518b445
Authored 1 year ago by Mario Chirinos

la jornada san luis

Parent 5d702f68

Showing 5 changed files with 137 additions and 173 deletions (+137 -173)
items.py        spiders/daily/laJornadaSanLuis/laJornadaSanLuis/items.py         +11 -13
middlewares.py  ...rs/daily/laJornadaSanLuis/laJornadaSanLuis/middlewares.py     +58 -11
pipelines.py    spiders/daily/laJornadaSanLuis/laJornadaSanLuis/pipelines.py     +4 -66
settings.py     spiders/daily/laJornadaSanLuis/laJornadaSanLuis/settings.py      +22 -20
noticias.py     ...ily/laJornadaSanLuis/laJornadaSanLuis/spiders/noticias.py     +42 -63
spiders/daily/laJornadaSanLuis/laJornadaSanLuis/items.py (view file @ 9518b445)
 # -*- coding: utf-8 -*-

 # Define here the models for your scraped items
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/items.html
+# https://docs.scrapy.org/en/latest/topics/items.html

 import scrapy


-class NoticiasItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
+class LajornadasanluisItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    date = scrapy.Field()
+    title = scrapy.Field()
+    text = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
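An illustrative usage sketch (not part of the commit): the renamed item behaves like a dict and can be wrapped with the itemadapter package that the updated middlewares.py and pipelines.py now import. The field values below are placeholders.

from itemadapter import ItemAdapter
from laJornadaSanLuis.items import LajornadasanluisItem

# construct the item with keyword arguments for its declared fields
item = LajornadasanluisItem(title="Ejemplo", url="https://lajornadasanluis.com.mx/")
adapter = ItemAdapter(item)
print(adapter.get("title"))   # "Ejemplo"
print(adapter.asdict())       # only the fields actually set are returned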
spiders/daily/laJornadaSanLuis/laJornadaSanLuis/middlewares.py (view file @ 9518b445)
 # -*- coding: utf-8 -*-

 # Define here the models for your spider middleware
 #
 # See documentation in:
-# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

 from scrapy import signals

+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter

-class LajornadasanluisSpiderMiddleware(object):
+
+class LajornadasanluisSpiderMiddleware:
     # Not all methods need to be defined. If a method is not defined,
     # scrapy acts as if the spider middleware does not modify the
     # passed objects.
...
@@ -20,30 +21,29 @@ class LajornadasanluisSpiderMiddleware(object):
         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
         return s

-    def process_spider_input(response, spider):
+    def process_spider_input(self, response, spider):
         # Called for each response that goes through the spider
         # middleware and into the spider.

         # Should return None or raise an exception.
         return None

-    def process_spider_output(response, result, spider):
+    def process_spider_output(self, response, result, spider):
         # Called with the results returned from the Spider, after
         # it has processed the response.

-        # Must return an iterable of Request, dict or Item objects.
+        # Must return an iterable of Request, or item objects.
         for i in result:
             yield i

-    def process_spider_exception(response, exception, spider):
+    def process_spider_exception(self, response, exception, spider):
         # Called when a spider or process_spider_input() method
         # (from other spider middleware) raises an exception.

-        # Should return either None or an iterable of Response, dict
-        # or Item objects.
+        # Should return either None or an iterable of Request or item objects.
         pass

-    def process_start_requests(start_requests, spider):
+    def process_start_requests(self, start_requests, spider):
         # Called with the start requests of the spider, and works
         # similarly to the process_spider_output() method, except
         # that it doesn’t have a response associated.
...
@@ -54,3 +54,50 @@ class LajornadasanluisSpiderMiddleware(object):
     def spider_opened(self, spider):
         spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class LajornadasanluisDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
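A hedged sketch (not part of this commit): neither middleware class is active until it is registered in settings.py. The commented-out blocks in settings.py below correspond to something like the following; 543 is simply the priority used by the Scrapy project template.

# enable both middlewares in laJornadaSanLuis/settings.py
SPIDER_MIDDLEWARES = {
    'laJornadaSanLuis.middlewares.LajornadasanluisSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'laJornadaSanLuis.middlewares.LajornadasanluisDownloaderMiddleware': 543,
}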
spiders/daily/laJornadaSanLuis/laJornadaSanLuis/pipelines.py (view file @ 9518b445)
 # -*- coding: utf-8 -*-

 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
-import json
-from collections import OrderedDict
-
-
-class JsonWriterPipeline(object):
-
-    def __init__(self, filename):
-        self.filename = filename
-
-    @classmethod
-    def from_crawler(cls, crawler):
-        # Here you get whatever value was passed through the "filename" command line parameter
-        settings = crawler.settings
-        filename = settings.get('filename')
-
-        # Instantiate the pipeline with the file name
-        return cls(filename)
-
-    def open_spider(self, spider):
-        self.counter = 0
-        self.file = open(self.filename, 'w')
-        self.file.write("[")
-
-    def close_spider(self, spider):
-        self.file.write("]")
-        self.file.close()
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter

+
+class LajornadasanluisPipeline:
     def process_item(self, item, spider):
-        # print("this is my item", item)
-        row = []
-        try:
-            row.append(("date", item['date']))
-        except:
-            pass
-        try:
-            row.append(("topic", item['topic']))
-        except:
-            pass
-        try:
-            row.append(("title", item['title']))
-        except:
-            pass
-        try:
-            row.append(("author", item['author']))
-        except:
-            pass
-        try:
-            row.append(("location", item['location']))
-        except:
-            pass
-        try:
-            row.append(("text", item['text']))
-        except:
-            pass
-        try:
-            row.append(("url", item['url']))
-        except:
-            pass
-        line = OrderedDict(row)
-        self.counter += 1
-        if self.counter == 1:
-            self.file.write(json.dumps(line))
-        elif self.counter > 1:
-            self.file.write(",\n" + json.dumps(line))
         return item
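A hedged sketch (not part of the commit): if JSON output were still handled in a pipeline, the ItemAdapter import added here could replace the removed per-field try/except blocks. The class name, the 'filename' setting, and the JSON-lines output format below are assumptions mirroring the removed JsonWriterPipeline, not code from this repository.

import json
from itemadapter import ItemAdapter

class JsonWriterPipeline:
    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # read the output path from the 'filename' setting, as the removed code did
        return cls(crawler.settings.get('filename'))

    def open_spider(self, spider):
        self.file = open(self.filename, 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # ItemAdapter.asdict() serializes whichever fields the item actually carries,
        # replacing the OrderedDict built field by field in the removed code
        self.file.write(json.dumps(ItemAdapter(item).asdict(), ensure_ascii=False) + "\n")
        return item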
spiders/daily/laJornadaSanLuis/laJornadaSanLuis/settings.py (view file @ 9518b445)
 # -*- coding: utf-8 -*-

 # Scrapy settings for laJornadaSanLuis project
 #
 # For simplicity, this file contains only settings considered important or
 # commonly used. You can find more settings consulting the documentation:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
-#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
-#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

 BOT_NAME = 'laJornadaSanLuis'

 SPIDER_MODULES = ['laJornadaSanLuis.spiders']
 NEWSPIDER_MODULE = 'laJornadaSanLuis.spiders'

 FEED_EXPORT_ENCODING = "utf-8"

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'laJornadaSanLuis (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-# ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32

 # Configure a delay for requests for the same website (default: 0)
-# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 0.5
+DOWNLOAD_DELAY = 2
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-COOKIES_ENABLED = False
+#COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
...
@@ -45,31 +43,31 @@ COOKIES_ENABLED = False
 #}

 # Enable or disable spider middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 #SPIDER_MIDDLEWARES = {
 #    'laJornadaSanLuis.middlewares.LajornadasanluisSpiderMiddleware': 543,
 #}

 # Enable or disable downloader middlewares
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #DOWNLOADER_MIDDLEWARES = {
-#    'laJornadaSanLuis.middlewares.MyCustomDownloaderMiddleware': 543,
+#    'laJornadaSanLuis.middlewares.LajornadasanluisDownloaderMiddleware': 543,
 #}

 # Enable or disable extensions
-# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
 #EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
 #}

 # Configure item pipelines
-# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    'laJornadaSanLuis.pipelines.JsonWriterPipeline': 300,
-}
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'laJornadaSanLuis.pipelines.LajornadasanluisPipeline': 300,
+#}

 # Enable and configure the AutoThrottle extension (disabled by default)
-# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 #AUTOTHROTTLE_ENABLED = True
 # The initial download delay
 #AUTOTHROTTLE_START_DELAY = 5
...
@@ -82,9 +80,13 @@ ITEM_PIPELINES = {
 #AUTOTHROTTLE_DEBUG = False

 # Enable and configure HTTP caching (disabled by default)
-# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 #HTTPCACHE_ENABLED = True
 #HTTPCACHE_EXPIRATION_SECS = 0
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
+TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
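A hedged sketch (not part of the commit): with ITEM_PIPELINES commented out in this revision, nothing inside the project writes scraped items to disk anymore. Assuming a Scrapy version new enough for the FEEDS setting (2.1+, consistent with the REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7' line above), the built-in feed exports could take over; the file name pattern below is only an example.

# possible addition to settings.py: export items via feed exports instead of a pipeline
FEEDS = {
    '%(name)s.json': {          # expands to noticias.json for this spider
        'format': 'json',
        'encoding': 'utf8',
    },
}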
spiders/daily/laJornadaSanLuis/laJornadaSanLuis/spiders/noticias.py (view file @ 9518b445)
 # -*- coding: utf-8 -*-
-import scrapy, re
-from laJornadaSanLuis.items import NoticiasItem
-
 """
 MEDIO:
 La Jornada de San Luis, San Luis Potosi
 Uso:
 scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
 """
+import scrapy
+import re
+from laJornadaSanLuis.items import LajornadasanluisItem
+
+#-------------------------------------------------------------------------------
 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
     return TAG_RE.sub('', text)

-class QuotesSpider(scrapy.Spider):
-    name = "noticias"
-
-    def start_requests(self):
-        year = getattr(self, 'year', None)
-        month = getattr(self, 'month', None)
-        day = getattr(self, 'day', None)
-        self.baseURL = 'http://lajornadasanluis.com.mx/' + year + '/' + month + '/' + day
-        yield scrapy.Request(url=self.baseURL, callback=self.parse)
-
-    def parse(self, response):
-        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
-        pagination = response.xpath('//div[@class="pages"]/a/@href').extract()
-        if len(pagination) > 0:
-            pagination = pagination[-1].strip('/')
-            pages = int(pagination[pagination.rfind('/') + 1:])
-            for page in range(1, pages):
-                yield scrapy.Request(url=self.baseURL + '/page/' + str(page + 1), callback=self.parse_page)
-
-    def parse_page(self, response):
-        for link in response.xpath('//*[@class="post-title"]/h2/a/@href').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
-
-    def parse_item(self, response):
-        item = NoticiasItem()
-        text = ''
-        ## la fecha de la noticia ya incluye la zona horaria
-        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        if d is None:
-            d = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
-        item['date'] = d
-        item['title'] = response.css('h1.entry-title::text').extract_first()
-        item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract_first()
-        for paragraph in response.xpath('//p[@style="text-align: justify;"]/text()').extract():
-            text += remove_tags(paragraph) + '\n'
-        item['text'] = text
-        item['url'] = response.url
-        # print item['title']
-        yield item
+
+class NoticiasSpider(scrapy.Spider):
+    name = 'noticias'
+    allowed_domains = ['lajornadasanluis.com.mx']
+    start_urls = ['https://lajornadasanluis.com.mx/']
+
+    def start_requests(self):
+        self.year = getattr(self, "year", None)
+        self.month = getattr(self, "month", None)
+        self.day = getattr(self, "day", None)
+        self.baseURL = "https://lajornadasanluis.com.mx/{0}/{1}/{2}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
+        yield scrapy.Request(url=self.baseURL, callback=self.parse)
+
+    def parse(self, response):
+        print(response.url)
+        next_page = response.xpath('//div[@class="pager"]/a[@class="next_page"]/@href').extract_first()
+        print("nextPage", next_page)
+        if next_page is not None:
+            yield scrapy.Request(url=next_page, callback=self.parse)
+        for link in response.xpath('//h2[@class="entry-title"]/a/@href').extract():
+            yield scrapy.Request(url=link, callback=self.parse_item)
+
+    #-------------------------------------------------------------------------------
+    def parse_item(self, response):
+        # print(response.url)
+        item = LajornadasanluisItem()
+        item["date"] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        item["title"] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
+        text = ""
+        for p in response.xpath('//div[@class="the_content_wrapper "]/p').extract():
+            text += remove_tags(p) + "\n"
+        item["text"] = text
+        item['topic'] = ", ".join(response.xpath('//div[@class="cat-wrapper"]/ul/li/a/text()').extract())
+        print(self.allowed_domains, item["title"])
+        yield(item)
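A hedged usage sketch (not part of the commit): one way to run the new NoticiasSpider for a given date from a script rather than the command line shown in the file's docstring. It assumes execution from the laJornadaSanLuis project directory so the project settings and spider are discoverable; the date values are examples only.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# year/month/day arrive as strings, which is what the .zfill(2) calls in
# start_requests() expect
process.crawl('noticias', year='2017', month='3', day='22')
process.start()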