selenium:
-- 概念:一个基于浏览器自动化的模块
-- 基本使用流程:
-- pip install selenium
-- 下载对应驱动程序:http://chromedriver.storage.googleapis.com/index.html
-- 实例化一个浏览器对象,将浏览器的驱动程序加载到该对象中
from selenium import webdriver
from lxml import etree
import time

# Instantiate a browser object; executable_path is the path to chromedriver.exe.
# NOTE(review): newer Selenium 4 releases removed executable_path in favour of
# a Service object -- confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # page_source is the page as the browser currently holds it, so content
    # filled in by JavaScript can be read from it as well.
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Text of the link in the first <li> under the element with id "gzlist".
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always shut the browser down, even when a step above raised
    # (e.g. IndexError from an empty XPath result).
    bro.quit()
打开淘宝并搜索相关内容:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# NOTE(review): newer Selenium 4 releases removed executable_path in favour of
# a Service object -- confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.taobao.com')
    # Element lookup: find_element(By.<strategy>, value) is used instead of the
    # find_element_by_* helpers, which newer Selenium releases no longer ship.
    input_text = bro.find_element(By.ID, 'q')
    input_text.send_keys('mac')
    time.sleep(2)
    # Run JavaScript in the page: scroll to the bottom of the document.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    btn = bro.find_element(By.CSS_SELECTOR, '.btn-search')
    btn.click()
    time.sleep(3)
finally:
    # Always shut the browser down, even when a lookup or click above raised.
    bro.quit()
常用方法:
get(url)
find系列函数进行标签定位
send_keys('key')
click()
execute_script('js_code')
page_source
switch_to.frame('iframe_ID')
quit()
save_screenshot()
a = ActionChains(bro); a.click_and_hold(tag)
a.move_by_offset(x, y).perform()
from selenium import webdriver
from lxml import etree
from selenium.webdriver import ChromeOptions
import time

# Used to evade automation detection: drop the "enable-automation" switch
# from the switches Chrome is started with.
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])

# NOTE(review): newer Selenium 4 releases removed executable_path in favour of
# a Service object -- confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # page_source is the page as the browser currently holds it, so content
    # filled in by JavaScript can be read from it as well.
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Text of the link in the first <li> under the element with id "gzlist".
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
设置为在浏览器不可见下进行爬取:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import time

chrome_options = Options()
# Run Chrome without a visible window.
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# NOTE(review): newer Selenium 4 releases removed executable_path in favour of
# a Service object -- confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Have the browser request the target URL.
    bro.get('http://125.35.6.84:81/xk/')
    # Wait BEFORE reading the source so that dynamically loaded content has
    # time to arrive; sleeping after the read cannot affect page_text.
    time.sleep(2)
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # Text of the link in the first <li> under the element with id "gzlist".
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()