SPIDER_MODULES = ['Amazon.spiders']
NEWSPIDER_MODULE = 'Amazon.spiders'html
USER_AGENT = 'Amazon (+http://www.yourdomain.com)'python
ROBOTSTXT_OBEY = Falsecookie
COOKIES_ENABLED = True并发
TELNETCONSOLE_ENABLED = False
TELNETCONSOLE_HOST = '127.0.0.1'
TELNETCONSOLE_PORT = [6023,]app
DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', }
#爬虫程序类中设置 custom_settings = { 'DEFAULT_REQUEST_HEADERS' : { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', "User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' } }
CONCURRENT_REQUESTS = 32默认值16
dom
CONCURRENT_REQUESTS_PER_DOMAIN = 16默认值8
ide
默认值0,表明无限制,须要注意两点
调试
这个值就表明一个规定死的值,表明对同一网址延迟请求的秒数
code
DOWNLOAD_DELAY = 3xml
开启True,默认False
AUTOTHROTTLE_ENABLED = True
起始的延迟
AUTOTHROTTLE_START_DELAY = 5
最小延迟
DOWNLOAD_DELAY = 3
最大延迟
AUTOTHROTTLE_MAX_DELAY = 10
每秒并发请求数的平均值
AUTOTHROTTLE_TARGET_CONCURRENCY = 16.0不能高于 CONCURRENT_REQUESTS_PER_DOMAIN或CONCURRENT_REQUESTS_PER_IP
AUTOTHROTTLE_DEBUG = True CONCURRENT_REQUESTS_PER_DOMAIN = 16 CONCURRENT_REQUESTS_PER_IP = 16