python+selenium爬取链家网房源信息并保存至csv
抓取的信息有:'房源'、'详细信息'、'价格'、'楼层'、'有无电梯'
"""Scrape Lianjia (sh.lianjia.com) rental listings with Selenium.

For every listing on the search-result pages the script records the title,
the detail line, the price, and - from the listing's own detail page - the
floor and elevator entries.  Each record is appended to ``租房.txt`` and
``data.csv``.
"""
import csv
import time
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Search keyword; it is percent-encoded before being placed in a URL.
SEARCH_KEYWORD = '松江大学城'

# Positions of the floor / elevator entries inside the detail page's
# ``div.content__article__info > ul > li`` list.
FLOOR_INDEX = 7
ELEVATOR_INDEX = 8


def write2txt(line):
    """Append *line*, followed by a newline, to ``租房.txt`` (UTF-8)."""
    with open('租房.txt', 'a', encoding='utf-8') as f:
        f.write(line + '\n')


def write_to_csv(row_data):
    """Append *row_data* (a sequence of cell values) as one row of ``data.csv``."""
    with open('data.csv', 'a+', newline="", encoding='utf-8') as f:
        csv.writer(f).writerow(row_data)


def _list_page_url(page):
    """Return the search-result URL for 1-based result page *page*."""
    keyword = quote(SEARCH_KEYWORD)
    if page == 1:
        return 'https://sh.lianjia.com/zufang/rs' + keyword + '/#contentList'
    return ('https://sh.lianjia.com/zufang/pg' + str(page)
            + 'rs' + keyword + '/#contentList')


def _read_listing_summary(div):
    """Extract the list-page fields from one candidate container.

    Returns ``(title_link, title, detail_text, price)``, or ``None`` when
    *div* lacks the pieces a listing needs (two ``<p>`` elements, a title
    link and a ``span > em`` price node).
    """
    paragraphs = div.find_elements(By.CSS_SELECTOR, 'p')
    if len(paragraphs) < 2:
        return None

    title_links = paragraphs[0].find_elements(By.TAG_NAME, 'a')
    price_nodes = div.find_elements(By.CSS_SELECTOR, 'span > em')
    if not title_links or not price_nodes:
        return None

    title_link = title_links[0]
    detail_paragraph = paragraphs[1]
    detail_links = detail_paragraph.find_elements(By.TAG_NAME, 'a')
    # Text of the first three links followed by the paragraph's full text;
    # slicing tolerates paragraphs that carry fewer than three links.
    detail_text = ''.join(link.text for link in detail_links[:3])
    detail_text += detail_paragraph.text
    return title_link, title_link.text, detail_text, price_nodes[0].text


def _read_floor_and_elevator(browser):
    """Return ``(floor, elevator)`` text from the current detail page.

    A field is returned as ``''`` when the info list is too short to hold it.
    """
    items = browser.find_elements(
        By.CSS_SELECTOR, 'div.content__article__info > ul > li')
    floor = items[FLOOR_INDEX].text if len(items) > FLOOR_INDEX else ''
    elevator = items[ELEVATOR_INDEX].text if len(items) > ELEVATOR_INDEX else ''
    return floor, elevator


def _scrape_list_page(browser, wait):
    """Scrape every listing found on the currently loaded result page."""
    container = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.content__list')))
    # NOTE(review): this matches every descendant <div>, not only the
    # top-level listing cards; containers that do not look like a listing
    # are skipped below.  Confirm against the live markup whether nested
    # containers can still yield duplicate rows.
    div_list = container.find_elements(By.TAG_NAME, 'div')
    print(len(div_list))
    list_page_handle = browser.current_window_handle

    for n, div in enumerate(div_list):
        print(n + 1)
        summary = _read_listing_summary(div)
        if summary is None:
            continue
        title_link, title, detail_text, price = summary
        print('房源:', title)
        print('详细信息:', detail_text)
        print('价格:', price)

        # Scroll further down for each listing before clicking its title.
        browser.execute_script(
            'window.scrollTo(0, arguments[0]);', (n + 1) * 1000)

        # The title link is expected to open the detail page in a new
        # window/tab; identify it by comparing handles rather than assuming
        # it is the last entry of window_handles.
        handles_before = browser.window_handles
        title_link.click()
        wait.until(EC.new_window_is_opened(handles_before))
        new_handles = [h for h in browser.window_handles
                       if h not in handles_before]
        browser.switch_to.window(new_handles[-1])
        try:
            # Fixed pause kept from the original so the detail page has time
            # to start rendering before it is queried.
            time.sleep(1)
            louceng, dianti = _read_floor_and_elevator(browser)
            print(louceng + dianti)
            write2txt(title + ',' + detail_text + ',' + price + ','
                      + louceng + ',' + dianti)
            write_to_csv([title, detail_text, price, louceng, dianti])
        finally:
            # Always close the detail tab and return to the list page so a
            # failure on one listing cannot leave the driver on the wrong tab.
            browser.close()
            browser.switch_to.window(list_page_handle)


def process():
    """Crawl result pages 1-13 and append every listing to the output files.

    Starts Chrome through the chromedriver binary at ``D:\\chromedriver.exe``,
    writes the CSV header row, scrapes each result page in turn and always
    quits the browser afterwards, even when scraping raises.
    """
    driver_path = r"D:\chromedriver.exe"
    # NOTE(review): ``executable_path`` is the keyword the original script
    # used; recent Selenium releases expect a ``Service`` object instead -
    # confirm against the installed Selenium version.
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        browser.implicitly_wait(1)
        browser.maximize_window()
        wait = WebDriverWait(browser, 3)
        write_to_csv(['房源', '详细信息', '价格', '楼层', '有无电梯'])
        for page in range(1, 14):
            browser.get(_list_page_url(page))
            _scrape_list_page(browser, wait)
    finally:
        browser.quit()


if __name__ == '__main__':
    s = time.time()
    process()
    e = time.time()
    print('用时:' + str(e - s))
欢迎关注个人微信公众号~
python