settings.py 3.74 KB
Newer Older
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# -*- coding: utf-8 -*-

# Scrapy settings for proceso project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'proceso'

SPIDER_MODULES = ['proceso.spiders']
NEWSPIDER_MODULE = 'proceso.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
19
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
36 37 38
COOKIES_ENABLED = True
COOKIES_DEBUG = True
SPLASH_COOKIES_DEBUG = True
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
39 40 41 42 43

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
44 45 46 47 48 49 50
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'User-Agent': USER_AGENT,
#     'Connection': 'Keep-Alive',
#     # 'Accept-Encoding': 'gzip, deflate',
#     'Accept-Language': 'en',
# }
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
51 52 53 54 55 56 57 58 59 60 61

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   # 'proceso.middlewares.ProcesoSpiderMiddleware': 543,
   'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
62 63 64 65 66
    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    # 'proceso.middlewares.ThreatDefenceRedirectMiddleware': 820,
Renán Sosa Guillen's avatar
Renán Sosa Guillen committed
67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'proceso.pipelines.JsonWriterPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

SPLASH_URL = 'http://localhost:8050/'

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'