import re

# Page whose links are listed when the script is run directly.
LINK_PAGE_URL = 'http://duanziwang.com/'

# `.` matches any character except a newline and `*` repeats it zero or more
# times; the trailing `?` makes the repeat non-greedy so each capture stops
# at the first closing quote.
HREF_PATTERN = re.compile(r'href="(.*?)"')


def extract_links(html):
    """Return every ``href="..."`` attribute value found in *html*.

    Args:
        html: Page markup as text.

    Returns:
        The captured attribute values in document order; an empty list when
        the markup contains no double-quoted ``href`` attribute.
    """
    return HREF_PATTERN.findall(html)


def show_links(url=LINK_PAGE_URL, timeout=10):
    """Download *url* and print the list of link targets it contains.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The page could not be fetched.
    """
    # Imported here so extract_links() stays usable on machines that do not
    # have the third-party HTTP client installed.
    import requests

    response = requests.get(url, timeout=timeout)
    print(extract_links(response.text))


if __name__ == '__main__':
    show_links()
import re

# Page the joke titles and bodies are scraped from.
JOKE_PAGE_URL = 'http://duanziwang.com/'

# `.` matches any character except a newline and `*` repeats it zero or more
# times; the trailing `?` keeps each capture as short as possible.
# Because `.` stops at newlines, a block that spans several lines is not
# captured by either pattern.
CONTENT_PATTERN = re.compile(r'<div class="content">(.*?)</div>')
# NOTE(review): this only matches anchors whose href is exactly "/subject/";
# confirm against the live markup that this is the intended selector.
TITLE_PATTERN = re.compile(r'<a href="/subject/">(.*?)</a>')

# Titles used to locate the joke entries in the title list. The original
# notes record them at positions 9 and 60, which is where the default slice
# bounds of pair_titles_with_contents() come from.
FIRST_MARKER_TITLE = '活得糊涂的人,容易幸福'
LAST_MARKER_TITLE = '购买银行理财产品亏损后如何起诉'


def extract_contents(html):
    """Return the text captured from each ``<div class="content">`` block."""
    return CONTENT_PATTERN.findall(html)


def extract_titles(html):
    """Return the text of each ``<a href="/subject/">`` anchor in *html*."""
    return TITLE_PATTERN.findall(html)


def pair_titles_with_contents(titles, contents, start=10, stop=60):
    """Map the titles in ``titles[start:stop]`` to *contents*, position by position.

    Args:
        titles: All titles scraped from the page.
        contents: All content blocks scraped from the page.
        start: Index of the first title to use.
        stop: Index one past the last title to use.

    Returns:
        A dict from title to content. Pairing stops at the shorter of the
        two sequences, and a repeated title keeps its last content.
    """
    return dict(zip(titles[start:stop], contents))


def show_jokes(url=JOKE_PAGE_URL, timeout=10):
    """Download the joke page and print each title next to its content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The page could not be fetched.
    """
    # Imported here so the parsing helpers stay usable on machines that do
    # not have the third-party HTTP client installed.
    import requests

    response = requests.get(url, timeout=timeout)
    print(response.status_code)
    print(response.encoding)
    html = response.text
    print(html)

    contents = extract_contents(html)
    titles = extract_titles(html)

    # Show where the marker titles currently sit so a changed page layout is
    # easy to spot; a missing marker is reported instead of aborting the run.
    for marker in (FIRST_MARKER_TITLE, LAST_MARKER_TITLE):
        if marker in titles:
            print(titles.index(marker))
        else:
            print(f'marker title not found: {marker}')

    title_content_map = pair_titles_with_contents(titles, contents)
    for title, content in title_content_map.items():
        print(f'{title:<40} {content:<1000}')


if __name__ == '__main__':
    show_jokes()
# 3. 昵图网爬虫 (Nipic image crawler)
import posixpath
import re
from urllib.parse import urljoin, urlsplit

# Listing page the images are collected from.
NIPIC_PAGE_URL = 'http://www.nipic.com/design/acg/renwu/index.html?page=1'

# `.` matches any character except a newline and `*` repeats it zero or more
# times; the trailing `?` makes each capture stop at the first closing quote.
DATA_SRC_PATTERN = re.compile(r'data-src="(.*?)"')


def extract_image_urls(html, base_url=NIPIC_PAGE_URL):
    """Return the absolute address of every ``data-src="..."`` value in *html*.

    Args:
        html: Page markup as text.
        base_url: Address the markup was loaded from; relative and
            protocol-relative values are resolved against it.

    Returns:
        Absolute URLs in document order. Empty attribute values are left
        out because they would resolve to the page itself.
    """
    return [
        urljoin(base_url, src)
        for src in DATA_SRC_PATTERN.findall(html)
        if src
    ]


def image_filename(url):
    """Return the last path segment of *url*, ignoring any query string.

    Returns an empty string when the path ends with a slash.
    """
    return posixpath.basename(urlsplit(url).path)


def download_images(page_url=NIPIC_PAGE_URL, timeout=10):
    """Save every image referenced by *page_url* into the current directory.

    Each image address is printed before it is fetched. An image that cannot
    be downloaded, or whose address yields no file name, is skipped so the
    remaining images are still saved.

    Args:
        page_url: Address of the listing page.
        timeout: Seconds to wait for each server response.

    Raises:
        requests.RequestException: The listing page could not be fetched.
    """
    # Imported here so the parsing helpers stay usable on machines that do
    # not have the third-party HTTP client installed.
    import requests

    page = requests.get(page_url, timeout=timeout)
    for image_url in extract_image_urls(page.text, page_url):
        print(image_url)
        filename = image_filename(image_url)
        if not filename:
            continue
        try:
            image = requests.get(image_url, timeout=timeout)
            # Avoid saving an HTML error page under an image file name.
            image.raise_for_status()
        except requests.RequestException as exc:
            print(f'skipped {image_url}: {exc}')
            continue
        with open(filename, 'wb') as image_file:
            image_file.write(image.content)


if __name__ == '__main__':
    download_images()
# 4. 视频爬虫 (video crawler)
import posixpath
import re
from urllib.parse import urljoin, urlsplit

# Index page that links to the individual video pages.
VIDEO_INDEX_URL = 'http://www.mod.gov.cn/v/index.htm'

# File name used when a video address has no usable last path segment.
FALLBACK_VIDEO_NAME = 'test.mp4'

# Captures the href of anchors written exactly as <a href="...">.
ANCHOR_HREF_PATTERN = re.compile(r'<a href="(.*?)">')
# Captures the href up to and including its first "htm".
HTM_PREFIX_PATTERN = re.compile(r'(.*?htm)')
# Captures the text between a "//Video" marker and the next ".mp4".
# NOTE(review): assumes the detail page carries the video address right after
# that marker; the sample address recorded in the original notes is
# http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
VIDEO_URL_PATTERN = re.compile(r'//Video(.*?\.mp4)')


def detail_page_urls(index_html, base_url=VIDEO_INDEX_URL):
    """Return the absolute addresses of the video pages linked from *index_html*.

    Args:
        index_html: Markup of the index page.
        base_url: Address the markup was loaded from; relative links are
            resolved against it.

    Returns:
        Unique URLs in first-seen order. Anchors whose href contains no
        "htm" are ignored.
    """
    unique_urls = {}
    for href in ANCHOR_HREF_PATTERN.findall(index_html):
        match = HTM_PREFIX_PATTERN.search(href)
        if match is None:
            continue
        unique_urls[urljoin(base_url, match.group(1))] = None
    return list(unique_urls)


def find_video_url(detail_html):
    """Return the first video address in *detail_html*, or None when absent."""
    match = VIDEO_URL_PATTERN.search(detail_html)
    if match is None:
        return None
    # The capture starts right after the marker, so drop any separating
    # whitespace.
    return match.group(1).strip()


def video_filename(video_url):
    """Return the last path segment of *video_url*, or the fallback name."""
    return posixpath.basename(urlsplit(video_url).path) or FALLBACK_VIDEO_NAME


def download_videos(index_url=VIDEO_INDEX_URL, timeout=30):
    """Save every video reachable from *index_url* into the current directory.

    Each video is stored under the file name taken from its own address, so
    one download does not overwrite another. A page without a video is
    skipped, and a failed request is reported and skipped so the remaining
    videos are still fetched. A transfer that fails part-way leaves a partial
    file behind.

    Args:
        index_url: Address of the index page.
        timeout: Seconds to wait for each server response.

    Raises:
        requests.RequestException: The index page could not be fetched.
    """
    # Imported here so the parsing helpers stay usable on machines that do
    # not have the third-party HTTP client installed.
    import requests

    index_page = requests.get(index_url, timeout=timeout)
    for detail_url in detail_page_urls(index_page.text, index_url):
        try:
            detail_page = requests.get(detail_url, timeout=timeout)
            video_url = find_video_url(detail_page.text)
            if video_url is None:
                continue
            # Stream to disk in chunks instead of holding the whole video in
            # memory.
            with requests.get(video_url, stream=True, timeout=timeout) as video:
                video.raise_for_status()
                with open(video_filename(video_url), 'wb') as video_file:
                    for chunk in video.iter_content(chunk_size=65536):
                        video_file.write(chunk)
        except requests.RequestException as exc:
            print(f'skipped {detail_url}: {exc}')


if __name__ == '__main__':
    download_videos()