维护一组浏览器,实现每分钟1000次查询。DriverPool使用变化版的、只初始化一次的单例模式,并维护每一个浏览器当前是否正在被使用的状态。
不需要等请求来了再临时开浏览器,因为开一个浏览器会耽误6秒钟。
可以在程序启动后,随便使用命令杀死selenium,不怕被别人杀死,不需要重启程序就能保证长久正常运行。
主要使用了mixin继承、变化版单例模式、鸭子类型、桥接模式、上下文管理器,引入了资源池的概念,自动选择一个当前未被使用的浏览器。
使用池固定了浏览器的最大数量,避免了直接开孤立的selenium driver;否则当并发大的时候,代码会突然启动几百上千个浏览器,导致系统性能突然衰竭。
# coding=utf8
"""
Browser resource pool.

Browsers are started ahead of time instead of on demand, because (per the
author's measurements) starting a new browser costs about 6 seconds.

The pool is meant to survive its browsers being killed (manually or by the
OOM killer) after the program has started: when a dead browser is used
through ``DriverContext`` the pool is rebuilt automatically.
"""
import os
import time
from pathlib import Path
from threading import Lock
from urllib.error import URLError

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.support.wait import WebDriverWait

from app.utils_ydf import LoggerMixin, BoundedThreadPoolExecutor, decorators, LogManager

# Mobile user agent shared by both browser builders (matches the 390x713 window size).
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/63.0.3239.132 Mobile Safari/537.36")


class NoAvailableDriverError(Exception):
    """Raised by ``DriverPool.borrow_a_driver_item`` when every browser is in use."""


class DriverItem:
    """A pooled browser together with its bookkeeping state."""

    def __init__(self, driver, ):
        self.driver = driver
        self.create_time = time.time()  # used by the pool to recycle old browsers
        self.is_using = False  # True while the item is lent out
        self.last_use_time = time.time()  # refreshed when the item is given back

    @staticmethod
    def _format_timestamp(timestamp):
        """Render an epoch timestamp as local ``YYYY-mm-dd HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp))

    def __str__(self):
        return (f"{self._format_timestamp(self.create_time)} {self.is_using} "
                f"{self._format_timestamp(self.last_use_time)} {self.driver}")


class PhantomjsItemBuilder(LoggerMixin):
    """Builds ``DriverItem`` objects backed by PhantomJS."""

    # noinspection PyBroadException
    def create_a_driver_item(self):
        """
        Start one PhantomJS browser and wrap it in a ``DriverItem``.

        On posix the executable is first looked up the default way and, if that
        fails, ``/usr/local/bin/phantomjs`` is tried.

        :return: a new, unused ``DriverItem``
        :raises Exception: whatever the last launch attempt raised
        """
        t0 = time.time()
        capabilities = DesiredCapabilities.PHANTOMJS.copy()
        capabilities['platform'] = "WINDOWS"
        capabilities['version'] = "10"
        capabilities['phantomjs.page.settings.loadImages'] = False
        capabilities['phantomjs.page.settings.userAgent'] = MOBILE_USER_AGENT
        service_args = ['--load-images=no', '--disk-cache=yes', '--ignore-ssl-errors=true']
        self.logger_with_file.info('建立一个driver中。。。。。。')
        if os.name == 'posix':
            try:
                driver = webdriver.PhantomJS(desired_capabilities=capabilities, service_args=service_args)
            except Exception as e:
                self.logger.exception(f'从环境变量获取driver路径失败,改成从/usr/local/bin文件夹获取 {e}')
                try:
                    driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs',
                                                 desired_capabilities=capabilities, service_args=service_args)
                except Exception as e:
                    self.logger.exception(f'从/usr/local/bin/phantomjs启动失败 {e}')
                    # Surface the real launch error instead of failing later with
                    # an AttributeError on a ``None`` driver.
                    raise
        else:
            driver = webdriver.PhantomJS(desired_capabilities=capabilities, service_args=service_args)
        driver.set_window_size(390, 713)
        driver.set_page_load_timeout(10)
        self.logger.info(f'建立一个浏览器耗时{time.time() - t0}')
        return DriverItem(driver)


class ChromeItemBuilder(LoggerMixin):
    """Builds ``DriverItem`` objects backed by headless Chrome."""

    # Explicit Chrome binary; only applied when the file actually exists so the
    # builder also works on machines where Chrome is installed elsewhere.
    chrome_binary_location = r'C:\Users\Administrator\AppData\Local\Google\Chrome\Application\chrome.exe'

    def create_a_driver_item(self):
        """
        Start one headless Chrome browser and wrap it in a ``DriverItem``.

        :return: a new, unused ``DriverItem``
        """
        self.logger.info('建立一个driver中。。。。。。')
        t0 = time.time()
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-images')
        # The user agent has to be passed as a command-line switch; placing it in
        # the content-settings prefs (as before) does not change the UA.
        chrome_options.add_argument(f'--user-agent={MOBILE_USER_AGENT}')
        if os.path.isfile(self.chrome_binary_location):
            chrome_options.binary_location = self.chrome_binary_location
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,  # 2 = do not load images
                'javascript': 1,  # 2 would disable JavaScript
            }
        }
        chrome_options.add_experimental_option("prefs", prefs)
        # Per the original author, this is the switch that actually disables
        # images; the two settings above did not take effect on their own.
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(chrome_options=chrome_options)
        driver.set_window_size(390, 713)
        driver.set_page_load_timeout(100)
        driver.implicitly_wait(100)
        self.logger.info(f'建立一个浏览器耗时{time.time() - t0}')
        return DriverItem(driver)


class DriverPool(LoggerMixin):
    """
    Process-wide pool of pre-started browsers.

    The class is a singleton that is initialised exactly once: the arguments of
    the first ``DriverPool(...)`` call configure the pool, later calls return the
    same instance and ignore their arguments.
    """
    lock = Lock()  # guards borrowing and the re-initialisation debounce
    _instance_lock = Lock()  # guards creation of the singleton

    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, '_instance'):
            with cls._instance_lock:
                # Re-check under the lock: another thread may have finished
                # building the pool while this one was waiting.
                if not hasattr(cls, '_instance'):
                    self = super().__new__(cls, )
                    self.__custom_init__(*args, **kwargs)
                    # Publish only after a successful init so nobody can see a
                    # half-built pool, and a failed init can be retried.
                    cls._instance = self
        return cls._instance

    def __custom_init__(self, driver_item_num=10, driver_name=1):
        """
        One-time initialisation, called from ``__new__`` for the first instance only.

        :param driver_item_num: number of browsers kept in the pool
        :param driver_name: browser kind, 1 for PhantomJS, anything else for Chrome
        :return:
        """
        self.driver_item_list = list()
        self._driver_item_num = driver_item_num
        self.driver_item_builder = PhantomjsItemBuilder() if driver_name == 1 else ChromeItemBuilder()
        self.logger_with_file.info(f'准备初始化{driver_item_num}个浏览器')
        self._has_init_all_driver_item = False
        self._init_time = 0
        self._init_all_driver_item()

    def _init_all_driver_item(self):
        """
        Kill leftover PhantomJS processes and (re)create every browser of the pool.

        Calls made within 60 seconds of the previous (re)initialisation are
        ignored, so many threads hitting a dead browser at the same moment
        trigger only one rebuild.
        """
        with self.lock:
            if time.time() - self._init_time <= 60:
                return
            self._init_time = time.time()
        self.logger.warning('杀死残留的phantomjs进程')
        # Killing PhantomJS processes that belong to someone else is tolerated by
        # design: users of a killed browser go through the context manager,
        # which rebuilds the pool.
        if os.name == 'posix':
            os.system('ps -aux|grep phantomjs|grep -v grep|cut -c 9-15|xargs kill -9')
        else:
            os.system('taskkill /F /im phantomjs.exe')
        t0 = time.time()
        self.driver_item_list.clear()  # the old (dead) items must be dropped first

        def _inner(this: DriverPool):
            driver_item = this.driver_item_builder.create_a_driver_item()
            this.driver_item_list.append(driver_item)

        # Per the original author, creating the browsers in parallel is much
        # faster than creating them one after another.
        thread_pool = BoundedThreadPoolExecutor(self._driver_item_num)
        for _ in range(self._driver_item_num):
            thread_pool.submit(_inner, self)
        thread_pool.shutdown()
        self._has_init_all_driver_item = True
        self.logger.info(f'全部浏览器初始化建立成功,耗时 {time.time() - t0}秒 {len(self.driver_item_list)} {self.driver_item_list}')

    def borrow_a_driver_item(self):
        """
        Hand out the first idle browser and mark it as in use.

        An idle browser older than one hour is replaced by a fresh one before it
        is handed out (guards against PhantomJS memory leaks).

        :return: a ``DriverItem`` whose ``is_using`` flag is now True
        :raises NoAvailableDriverError: when every browser is currently in use
        """
        with self.lock:
            current_using_number = sum(1 for item in self.driver_item_list if item.is_using)
            current_not_using_number = len(self.driver_item_list) - current_using_number
            self.logger.debug(f'当前正在使用的浏览器数量是{current_using_number},闲置的浏览器数量是{current_not_using_number}')
            for index, driver_item in enumerate(self.driver_item_list):
                if driver_item.is_using is not False:
                    continue
                if time.time() - driver_item.create_time > 3600:
                    self.logger.debug('防止phantomjs内存泄漏,关闭并从新建立一个浏览器')
                    # noinspection PyBroadException
                    try:
                        driver_item.driver.quit()
                    except Exception:
                        # The old browser may already be dead; that must not
                        # cost the pool one of its slots.
                        self.logger.exception('关闭旧浏览器失败,忽略并继续创建新的浏览器')
                    driver_item = self.driver_item_builder.create_a_driver_item()
                    # Replace in place only after the new browser exists, so a
                    # failed creation leaves the pool size unchanged.
                    self.driver_item_list[index] = driver_item
                driver_item.is_using = True
                return driver_item
            raise NoAvailableDriverError('当前没有可用的浏览器。。。。。。。。。。。。')

    @staticmethod
    def give_back_a_driver_item(driver_item: DriverItem):
        """Mark *driver_item* as idle again and record when it was last used."""
        driver_item.is_using = False
        driver_item.last_use_time = time.time()


class DriverContext:
    """
    Context manager that borrows a browser from the pool and always returns it.

    ``with DriverContext() as driver:`` yields the raw selenium driver.
    Exceptions raised inside the block are logged where relevant and then
    propagate to the caller.
    """

    def __init__(self):
        self.driver_pool = DriverPool()
        self.driver_item = None
        self.start_using_time = time.time()

    def __enter__(self):
        self.driver_item = self.driver_pool.borrow_a_driver_item()
        # Measure the time the browser is actually held, not the time since
        # this context object was constructed.
        self.start_using_time = time.time()
        self.driver_pool.logger_with_file.debug(f'当前使用的浏览器是 {self.driver_item}')
        return self.driver_item.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.driver_pool.logger.info(f'此浏览器 {self.driver_item} 占用时间为 {time.time() - self.start_using_time}秒')
        self.driver_pool.give_back_a_driver_item(self.driver_item)
        if exc_type is not None and issubclass(exc_type, URLError):
            # Using a PhantomJS that was killed (manually or by OOM) raises
            # URLError; rebuild the whole pool in that case.
            self.driver_pool._init_all_driver_item()
        if exc_type is not None and issubclass(exc_type, WebDriverException):
            self.driver_pool.logger.error(f'selenium发生错误 ,错误类型--> {exc_type} 错误缘由--> {exc_val}')
        # Nothing truthy is returned: the exception is not suppressed.


if __name__ == '__main__':
    logger = LogManager('driver_pool_test').get_logger_and_add_handlers()
    DriverPool(50)
    Path('/picture').mkdir(exist_ok=True)


    @decorators.tomorrow_threads(40)
    def f():
        # Browsers should be used through the ``with`` statement; otherwise the
        # caller has to maintain the in-use state and error handling by hand.
        with DriverContext() as driver:
            logger.debug(f'使用的浏览器是--> {driver}')
            driver.get('http://m.elong.com/ihotel/283904/?inDate=2018-12-12&outDate=2018-12-13&roomPerson=1|2')
            driver.save_screenshot(f'/picture/{time.time()}.png')
            WebDriverWait(driver, 10, 0.2).until(
                lambda driverx: driverx.find_element_by_css_selector(
                    '#detail-mapping-box > li:nth-child(1) > div.prodjh_list_box.clearfix > div.detail-mrooom-mapping-product > div.dprodtname'))
            logger.info(f'页面内容长度是: {len(driver.page_source)}')
            driver.save_screenshot(f'/picture/{time.time()}.png')


    for _ in range(50000):
        time.sleep(0.1)
        f()
使用效果如图:因为不需要对每次请求都频繁创建和销毁浏览器,所以打开网页速度很快。