功能:爬取百度热搜的实时排行榜信息
程序设计:
代码:
# Scrape the real-time hot-search ranking from Baidu.
# Technical route: requests -> bs4
import requests
from bs4 import BeautifulSoup


def getHTML(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the script forever.

    Returns:
        The decoded page text, or an empty string if the request fails
        (connection error, timeout, invalid URL, or non-2xx status).
    """
    try:
        r = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        r.raise_for_status()
        # Let requests guess the charset from the body rather than trusting
        # the HTTP header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "nothing to parse".
        return ""


def parseHTML(demo, file_path):
    """Extract rank/title pairs from the page and write them to a file.

    Each pair is written on its own line as the ``str()`` of a dict with the
    keys '排名' (rank) and '标题' (title). The file is created or truncated
    and is written as UTF-8 so the Chinese text does not depend on the
    platform's default encoding.

    Args:
        demo: HTML text of the ranking page (may be empty).
        file_path: Destination path of the output text file.
    """
    soup = BeautifulSoup(demo, "html.parser")
    # Rank cells are <td class="first">; titles are <a class="list-title">.
    num_list = soup.find_all('td', 'first')
    title_list = soup.find_all('a', 'list-title')

    with open(file_path, "w", encoding="utf-8") as f:
        # zip() stops at the shorter list, so a rank without a matching
        # title (or vice versa) is skipped instead of raising IndexError.
        for num_cell, title_link in zip(num_list, title_list):
            rank_span = num_cell.find('span')
            if rank_span is None:
                # Rank cell without the expected <span>: skip this row.
                continue
            info_dict = {
                '排名': rank_span.string,
                '标题': title_link.string,
            }
            f.write(str(info_dict) + '\n')
    print("爬取完毕!")


def main():
    """Fetch the Baidu hot-search page and save the ranking to disk."""
    url = 'http://top.baidu.com/buzz?b=1&fr=20811'
    file_path = "D://百度实时热搜排行.txt"
    demo = getHTML(url)
    parseHTML(demo, file_path)


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
结果: