selenium

selenium:

  -- 概念:一个基于浏览器自动化的模块

  -- 基本使用流程:

    -- pip install selenium

    -- 下载对应驱动程序:http://chromedriver.storage.googleapis.com/index.html

    -- 实例化一个浏览器对象,将浏览器的驱动程序加载到该对象中

  

1.简单示例

from selenium import webdriver
from lxml import etree
import time

# Instantiate a browser object; executable_path is the path to chromedriver.exe.
# NOTE(review): executable_path is the Selenium 3 style argument; newer
# Selenium releases expect a Service object instead - confirm against the
# installed version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # Have the browser request the target URL
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as currently rendered by the browser
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # The rendered source also contains dynamically loaded data
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always shut the browser down, even if a step above raised (e.g.
    # IndexError when the XPath matches nothing), so no Chrome/chromedriver
    # process is left running.
    bro.quit()

 

2.相关行为定制

打开淘宝并搜索相关内容:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.taobao.com')
    # Locate elements with find_element(By.<strategy>, value). The older
    # find_element_by_* helpers were removed in newer Selenium releases,
    # while this form works on both old and new versions.
    input_text = bro.find_element(By.ID, 'q')
    input_text.send_keys('mac')
    time.sleep(2)
    # Run JavaScript inside the page: scroll to the bottom
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    btn = bro.find_element(By.CSS_SELECTOR, '.btn-search')
    btn.click()
    time.sleep(3)
finally:
    # Close the browser even if locating or clicking an element failed
    bro.quit()

 常用方法:

    get(url)

    find系列函数进行标签定位

    send_keys('key')

    click()

    execute_script('js_code')

    page_source

    switch_to.frame('iframe_ID')

    quit()

    save_screenshot()

    a = ActionChains(bro)   a.click_and_hold(tag)

    a.move_by_offset(x,y).perform()

 

3.规避检测

from selenium import webdriver
from lxml import etree
from selenium.webdriver import ChromeOptions
import time

# Options used to make the automated browser harder to detect:
# drop the 'enable-automation' switch Chrome is normally started with
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
try:
    # Have the browser request the target URL
    bro.get('http://125.35.6.84:81/xk/')
    # Grab the page source as currently rendered by the browser
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # The rendered source also contains dynamically loaded data
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always shut the browser down, even if a step above raised (e.g.
    # IndexError when the XPath matches nothing), so no Chrome/chromedriver
    # process is left running.
    bro.quit()

 

4.无头浏览器

设置为在浏览器不可见下进行爬取:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import etree
import time

chrome_options = Options()
# Run Chrome without a visible window
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    # Have the browser request the target URL
    bro.get('http://125.35.6.84:81/xk/')
    # Wait BEFORE taking the snapshot: page_source reflects the DOM at the
    # moment it is read, so sleeping afterwards cannot help dynamically
    # loaded content show up in page_text.
    time.sleep(2)
    # Grab the page source as currently rendered by the browser
    page_text = bro.page_source
    tree = etree.HTML(page_text)
    # The rendered source also contains dynamically loaded data
    name = tree.xpath('//*[@id="gzlist"]/li[1]/dl/a/text()')[0]
    print(name)
    time.sleep(2)
finally:
    # Always shut the browser down, even if a step above raised; a leaked
    # headless Chrome has no window, so it is easy to miss.
    bro.quit()
相关文章
相关标签/搜索