"""Scrape text jokes from qiushibaike.com and append them to a text file.

Requires the third-party packages ``requests``, ``beautifulsoup4`` and
``lxml``.  They are imported inside the functions that use them so that this
module (and ``save_text`` in particular) can be imported without them.
"""

# Listing-page URL template; ``{}`` is replaced by the page number.
PAGE_URL = "https://qiushibaike.com/text/page/{}"

# File that formatted entries are appended to by default.
OUTPUT_PATH = r"D:\python\qiushibaike.txt"

# Seconds to wait for the server before giving up; without a timeout
# ``requests`` may block indefinitely.
REQUEST_TIMEOUT = 10

# Browser-like request headers so the request looks like a regular browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/72.0.3626.121 Safari/537.36"
    ),
}

# Final output format of one entry:
# page, author, gender, age, up-votes, comment count, then the joke text.
ENTRY_TEMPLATE = "第{}页 做者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n"


def download_page(url, proxies=None):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        proxies: Optional ``requests``-style proxy mapping.  Useful when
            crawling a lot of pages, e.g. with an address obtained from a
            local proxy-pool service:
            ``{"http": "http://" + requests.get("http://localhost:5555/random").text}``.

    Returns:
        The decoded HTML of the page.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    import requests

    response = requests.get(
        url,
        headers=HEADERS,
        proxies=proxies,
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    return response.text


def _find_chain(node, *steps):
    """Follow successive ``find`` calls; return None as soon as one misses.

    Each step is a ``(tag_name, css_class)`` pair; ``css_class`` may be None
    to match on the tag name only.
    """
    for tag_name, css_class in steps:
        if node is None:
            return None
        if css_class is None:
            node = node.find(tag_name)
        else:
            node = node.find(tag_name, class_=css_class)
    return node


def _tag_text(tag):
    """Return the text of *tag* without surrounding whitespace ('' if missing)."""
    if tag is None:
        return ""
    return tag.get_text().strip()


def _parse_article(article):
    """Extract (author, gender, age, votes, comments, text) from one article tag."""
    author = _tag_text(article.find("h2"))
    text = _tag_text(_find_chain(article, ("div", "content"), ("span", None)))

    # Up-vote and comment counters both live inside the "stats" container.
    stats = article.find("div", class_="stats")
    votes = _tag_text(_find_chain(stats, ("span", "stats-vote"), ("i", None)))
    comments = _tag_text(
        _find_chain(stats, ("span", "stats-comments"), ("a", None), ("i", None))
    )

    # Gender and age share one tag; it is absent for anonymous authors.
    gender = ""
    age = ""
    author_info = article.find("div", class_="articleGender")
    if author_info is not None:
        css_classes = author_info.get("class") or []
        # The icon class on the tag tells the author's gender.
        if "womenIcon" in css_classes:
            gender = "女"
        elif "manIcon" in css_classes:
            gender = "男"
        age = _tag_text(author_info)

    return author, gender, age, votes, comments, text


def get_content(html, page):
    """Parse one listing page and append every joke on it to the output file.

    Args:
        html: HTML source of the listing page.
        page: Page number, written at the start of every entry.

    Nothing is written when the page lacks the expected ``id="content"``
    container or contains no articles.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")

    # All jokes of a page sit inside the element with id="content".
    container = soup.find(id="content")
    if container is None:
        return

    entries = [
        ENTRY_TEMPLATE.format(page, *_parse_article(article))
        for article in container.find_all("div", class_="article")
    ]
    # Write the whole page in one go so the file is opened only once.
    save_text(*entries)


def save_text(*args, path=None):
    """Append every string in *args* to the output file, in order.

    Args:
        *args: Text chunks to append.
        path: Target file; defaults to ``OUTPUT_PATH``.

    The file is opened once per call (UTF-8, append mode).  With no chunks
    the file is not touched at all.
    """
    if not args:
        return
    target = OUTPUT_PATH if path is None else path
    with open(target, "a", encoding="utf-8") as f:
        f.writelines(args)


def main(first_page=1, last_page=1):
    """Download and store the listing pages *first_page* .. *last_page* (inclusive).

    The defaults fetch only page 1.
    """
    for page in range(first_page, last_page + 1):
        html = download_page(PAGE_URL.format(page))
        get_content(html, page)


if __name__ == "__main__":
    main()