1.进入淘宝,主页:https://www.taobao.com/css
2.搜索:美食,点击搜索html
3.获得当前搜索结果商品的:price(价格),location(销售地),shop(商店名称),image(图片),title(商品名称),product_link商品链接web
4切换点击到最后一页中止chrome
代码:浏览器
from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import re from bs4 import BeautifulSoup SERVICE_ARGS = ['--load-images=false', '--disk-cache=false'] # 全局变量 # 无界面的浏览器 options = Options() options.add_argument("--headless") driver = webdriver.Chrome(service_args=SERVICE_ARGS, options=options) # 设置浏览器的窗口大小 # driver.set_window_size(width=1360, height=768) # 第一个参数是传入driver,第二个参数是等待时间 wait = WebDriverWait(driver, 5) def next_page(page): print("正在切换===", page, "页") input = driver.find_element_by_css_selector("#mainsrp-pager div div div div.form input") # 清空内容 input.clear() # 输入传入的页面 input.send_keys(page) submit = driver.find_element_by_css_selector("#mainsrp-pager div div div div.form span.btn.J_Submit") submit.click() # 判断是否切换成功,等待校验是否成功 wait.until( EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active"), str(page))) # 解析对应页面的数据 get_product_info(page) def get_product_info(page): print("当前正在解析========", page, "页") # 判断页面是否加载完成 wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-itemlist .items .item"))) # 当前页面的数据 html = driver.page_source soup = BeautifulSoup(html, "lxml") # 获得当前页面的因此的商品标签item product_lists = soup.select("#mainsrp-itemlist .items .item") for product in product_lists: print("---" * 100) item = {} # 出售地点 location = product.select(".location")[0].text # 店铺名称 shopname = product.select(".shopname")[0].text # 商品名称 title = product.select(".title .J_ClickStat")[0].text # 商品的图片 iamge = product.select("img")[0]["data-src"] data_link = product.select(".pic-link.J_ClickStat.J_ItemPicA")[0]["href"] # print(location,shopname,title,iamge,data_link) item["location"] = location item["shopname"] = shopname item["title"] = title item["iamge"] = iamge item["data_link"] = data_link print(item) # 获得总页数 def get_total_page(): driver.get("https://www.taobao.com/") # 等待搜索宽出现 input = wait.until(EC.presence_of_element_located((By.ID, "q"))) input.clear() input.send_keys("美食") # 按点击按钮 driver.find_element_by_css_selector(".btn-search").click() # 第1页 # 获得总页数 total = driver.find_element_by_class_name("total").text # print(total) total_num = re.compile(r'\d+').search(total).group() # 写一个函数获取当前页(第一页的数据) get_product_info(1) return total_num if __name__ == "__main__": toto_page = get_total_page() print("总页数===", toto_page) for page in range(2, int(toto_page) + 1): print(page) next_page(page) # 退出浏览器 driver.quit()
运行效果:less