selenium对51job进行职位爬虫

selenium 爬虫流程以下:
  一、对某职位进行爬虫 ---如:自动化测试
  二、用到IDE为 pycharm
  三、爬虫职位导入到MongoDB数据库中
  四、在线安装 pip install pymongo
  五、本次使用到脚本化无头浏览器 --- PhantomJS

MongoDB安装说明链接:https://www.twblogs.net/a/5c27009bbd9eee16b3dba7bc/zh-cn
PhantomJS 下载地址和API链接:http://phantomjs.org/download.htmlhttp://phantomjs.org/api/

下载后添加path中 --- CMD窗口输入 PhantomJS 按回车 --- 出现 phantomjs> 说明配置成功

以下为 51job.py 截图:

 

 

 

config配置文件以下:css

 

pycharm 运行结果:html

 

MongoDB 数据库截图:web

 

 以下为config 配置文件:mongodb


MONGO_URL = "mongodb://127.0.0.1:27017/"

MONGO_DB = "自动化测试"
MONGO_TABLE = "自动化测试工程师"

KEYWORD = "自动化测试工程师"

SERVICE_ARGS = ["--load-images=false","--disk-cache=true"] #忽略缓存和图片加载


爬虫源码以下:
from selenium import webdriverfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.support.wait import WebDriverWaitfrom selenium.webdriver.support import expected_conditions as ECimport time as tfrom pyquery import PyQuery as pqimport re#config -- 上面已展现from Selenium_test.config import *import pymongo#对MonGoDB进行实例化操做client = pymongo.MongoClient(MONGO_URL)#链接MongoDB数据库db = client[MONGO_DB]#浏览器实例化driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)# driver = webdriver.Chrome()driver.set_window_size(1400,900)driver.maximize_window()driver.implicitly_wait(10)#显示等待wait = WebDriverWait(driver,10)def search():    print("正在搜索")    try:        driver.get("https://www.51job.com/")        element = wait.until(            EC.presence_of_element_located((By.ID,"kwdselectid"))        )        element.send_keys(KEYWORD)        # 取消选中城市        driver.find_element_by_id("work_position_input").click()        t.sleep(2)        selectedCityEles = driver.find_elements_by_css_selector(            "#work_position_click_center_right_list_000000 em[class=on]"        )        for one in selectedCityEles:            # print(one.text)            one.click()        # 选中城市        t.sleep(2)        driver.find_element_by_id("work_position_click_center_right_list_category_000000_040000").click()        t.sleep(2)        driver.find_element_by_id("work_position_click_bottom_save").click()        #点击搜索        t.sleep(2)        driver.find_element_by_css_selector("body > div.content > div > div.fltr.radius_5 > div > button").click()        # submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,"body > div.dw_wp > form > div > div.dw_search_in > button")))        # submit.click()        #总计页数        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#resultList > div.dw_page > div > div > div > span:nth-child(2)")))        # get_products()        return total.text    except Exception:        return search()def next_page(page_number):    print("正在翻页",page_number)    try:        element = wait.until(            EC.presence_of_element_located((By.CSS_SELECTOR, "#jump_page"))        )        element.clear()        element.send_keys(page_number)        submit = wait.until(            EC.element_to_be_clickable((By.CSS_SELECTOR, "#resultList > div.dw_page > div > div > div > span.og_but")))        submit.click()        wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,"#resultList > div.dw_page > div > div > div > ul > li.on"),str(page_number)))        get_products()    except Exception:        next_page(page_number)def get_products():    # jobs = driver.find_elements_by_css_selector("#resultList div[class=el]")    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,"#resultList div[class=el]")))    html = driver.page_source    doc = pq(html)    items = doc("#resultList div[class=el]").items()    for item in items:        product = {            "职位":item.find(".t1").text(),            "公司":item.find(".t2").text(),            "工做点":item.find(".t3").text(),            "薪资":item.find(".t4").text(),            "发布时间":item.find(".t5").text()        }        print(product)        save_to_mongo(product)def save_to_mongo(result):    try:        if db[MONGO_TABLE].insert(result):            print("存储到MONGODB成功",result)    except Exception:        print("存储到MONGODB失败",result)def main():    try:        total = search()        total = int(re.compile("(\d+)").search(total).group(1))        for i in range(2,total + 1):            next_page(i)    except Exception:        print("出错")    finally:        driver.quit()if __name__ == '__main__':    main()
相关文章
相关标签/搜索