什么是爬虫?
爬取的数据去哪了
需要的软件和环境
浏览器的请求
认识HTTP、HTTPS
HTTP协议之请求
HTTP协议之响应
requests模块的学习
使用之前先安装requests模块
pip install requests
发送get,post请求,获取响应
response=requests.get(url) # 发送get请求,请求url地址 response=requests.post(url,data={请求体的字典}) # 发送post请求,请求url地址对应的响应
response.encoding="utf-8" response.content.decode()
response.request.url # 发送请求的url地址 response.url # response响应的url地址 response.request.headers # 请求头 response.headers # 响应头
headers = { "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1", "Referer": "https://fanyi.baidu.com/" } response=requests.get(url,headers=headers)
# 3秒内必须返回响应,否则会报错 requests.get(url,headers=headers,timeout=3)
pip install retrying
from retrying import retry @retry(stop_max_attempt_number=3) def fun1(): print("this is func1") raise ValueError("this is test error")
处理cookie相关的请求
headers={"User-Agent":"……","Cookie":"cookie字符串"}
requests.get(url,cookies=cookie_dict)
先发送post请求,获取cookie,带上cookie请求登录后的页面
# 1.session具有的方法和requests一样 session = requests.session() # 2.服务器设置在本地的cookie会保存在session session.post(url,data,headers) # 3.会带上之前保存在session中的cookie,能够请求成功 session.get(url)
数据提取方法
json.loads
把json字符串转化为Python类型
json.loads(json字符串)
json.dumps
把Python类型转化为json字符串
json.dumps({"a":"a","b":"2"})
json.dumps(ret1,ensure_ascii=False,indent=2)
ensure_ascii:让中文显示成中文
indent:能够让下一行在上一行的基础上空格
xpath和lxml
xpath
一门从html中提取数据的语言
xpath语法
Xpath helper插件:帮助我们从elements中定位数据
from lxml import etree element = etree.HTML(“html字符串”) element.xpath(“”)
# coding=utf-8
"""Crawler for qiushibaike.com hot pages.

Fetches each listing page, extracts per-item fields with XPath and
appends every item as one JSON line to ``qiubai.txt``.
"""
from lxml import etree
import requests
import json


class QiuBaiSpider:
    """Scrapes author, content, vote/comment stats and image URL per joke item."""

    def __init__(self):
        # {} is filled with the 1-based page number.
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
        }

    def get_url_list(self, page_count=13):
        """Build the page URLs.

        :param page_count: number of listing pages to crawl; defaults to the
            original hard-coded 13 so existing callers are unaffected.
        :return: list of page URL strings.
        """
        return [self.url_temp.format(i) for i in range(1, page_count + 1)]

    def parse_url(self, url):
        """GET ``url`` and return the decoded HTML body.

        A timeout is set so a dead connection fails instead of hanging the
        whole crawl forever (the original request had no timeout).
        """
        print("now parsing:", url)
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    @staticmethod
    def _first_or_none(values):
        """Return the first element of an XPath result list, or None if empty."""
        return values[0] if values else None

    def get_content_list(self, html_str):
        """Parse one page of HTML and return a list of item dicts.

        Every field degrades to None (or an empty list for ``content``)
        when the corresponding node is missing from the page.
        """
        html = etree.HTML(html_str)
        # 1. group: each joke item is one div under #content-left
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            item = {}
            author = self._first_or_none(div.xpath(".//h2/text()"))
            item["author_name"] = author.strip() if author is not None else None
            item["content"] = [
                text.strip()
                for text in div.xpath(".//div[@class='content']/span/text()")
            ]
            item["stats_vote"] = self._first_or_none(
                div.xpath(".//span[@class='stats-vote']/i/text()")
            )
            item["stats_comments"] = self._first_or_none(
                div.xpath(".//span[@class='stats-comments']//i/text()")
            )
            img = self._first_or_none(div.xpath(".//div[@class='thumb']//img/@src"))
            # src attributes are protocol-relative; prefix the scheme.
            item["img"] = "https:" + img if img is not None else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        """Append each item as one JSON line (UTF-8, Chinese kept readable)."""
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Main flow: build URLs, fetch each page, extract items, save them."""
        # 1. build url list from the page-number pattern
        url_list = self.get_url_list()
        for url in url_list:
            # 2. request the page
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. persist it
            self.save_content_list(content_list)


if __name__ == '__main__':
    qiubai = QiuBaiSpider()
    qiubai.run()