# Fields kept after scraping: "标题" (title), "楼盘名称" (community name), "地址" (address).
# List-page URL template: https://suzhou.anjuke.com/sale/p{}
import csv

import requests
from lxml import etree


class Anjuke:
    """Scrape title, community name and address from Anjuke Suzhou resale list pages."""

    def __init__(self):
        # Page-number URL template for Suzhou second-hand listing pages.
        self.url_temp = "https://suzhou.anjuke.com/sale/p{}"
        # Desktop Chrome User-Agent so the site serves the regular HTML page.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
            )
        }

    def get_url_list(self, first_page=1, last_page=2):
        """Return list-page URLs for pages first_page..last_page inclusive.

        Defaults reproduce the original behavior: pages 1 and 2
        (the original comment said "1-3" but range(1, 3) stops at 2).
        """
        return [self.url_temp.format(i) for i in range(first_page, last_page + 1)]

    def pase_url(self, url):
        """Fetch *url* and return the decoded HTML body.

        Name kept (typo and all) for backward compatibility with callers.
        Raises requests.HTTPError on non-2xx responses instead of silently
        parsing an error page.
        """
        response = requests.get(url, headers=self.headers, timeout=10)
        response.raise_for_status()
        return response.content.decode()

    def get_content_list(self, html_str):
        """Parse one list page; return a list of {标题, 楼盘名称, 地址} dicts."""
        html = etree.HTML(html_str)
        content_list = []
        for li in html.xpath('//ul[@id="houselist-mod-new"]/li'):
            title = li.xpath('.//div[@class="house-title"]/a/text()')
            # The comm-address span holds "community\xa0...\xa0address"; query it
            # once and split, instead of running the same xpath twice.
            address = li.xpath(
                './/div[@class="details-item"]/span[@class="comm-address"]/text()'
            )
            if not title or not address:
                # Ad/injected <li> entries lack these nodes; skip instead of
                # raising IndexError on [0].
                continue
            parts = address[0].split("\xa0")
            content_list.append(
                {
                    "标题": title[0].strip(),
                    "楼盘名称": parts[0].strip(),
                    "地址": parts[-1].strip(),
                }
            )
        return content_list

    def save_content_list(self, content_list):
        """Write all rows to 信息.csv (overwrites the file; header included)."""
        headers = ["标题", "楼盘名称", "地址"]
        # utf-8-sig so Excel detects the encoding; newline="" per csv docs.
        with open("信息.csv", "w", encoding="utf-8-sig", newline="") as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows(content_list)

    def run(self):
        """Fetch all pages, then save once.

        The original saved inside the loop with mode "w", so each page
        overwrote the previous one and only the last page survived.
        Accumulate everything first, then write a single file.
        """
        all_items = []
        for url in self.get_url_list():
            html_str = self.pase_url(url)
            all_items.extend(self.get_content_list(html_str))
        self.save_content_list(all_items)


if __name__ == "__main__":
    # Use a distinct variable name: the original `Anjuke = Anjuke()` shadowed
    # the class with its instance.
    spider = Anjuke()
    spider.run()