爬虫基础01

时间 2019-11-11

原文原文链接

写在前面

　　　　逆水行舟html

  1 爬虫
  2     
  3     - 基本操做
  4         概要：
  5             - 发送Http请求，Python Http请求，requests
  6             - 提取指定信息，Python 正则表达式，beautifulsoup
  7             - 数据持久化，
  8         
  9         Python两个模块
 10             - requests
 11             - beautifulsoup
 12         
 13         Http请求相关知识
 14             - 请求：
 15                 请求头
 16                     - cookie
 17                 请求体
 18                     - 发送内容
 19                     
 20             - 响应：
 21                 响应头
 22                     - 浏览器读取
 23                 响应体
 24                     - 看到的内容
 25             
 26             特殊：
 27                 - cookie
 28                 - csrftoken
 29                 - content-type:
 30                 
 31                     content-type:application/x-www-form-urlencoded
 32                     name=alex&age=18
 33                     
 34                     content-type:application/json
 35                     {"name": "alex", "age": 18}
 36     - 性能相关
 37         - 串行： 1个人，一个任务一个任务，空余时间，玩。
 38         - 线程： 10个人，一个任务一个任务，空余时间，玩。
 39         - 进程： 10个家庭，一个任务一个任务，空余时间，玩。
 40         - 【协程】异步非阻塞：1个人，充分利用时间。
 41     
 42     - scrapy框架
 43         - 规则
 44         
 45     - redis-scrapy组件
 46     
 47     
 48     
 49 内容详细：
 50     - 基本操作，Python伪造浏览器发送请求并获取指定内容
 51     
 52         pip3 install requests
 53         response = requests.get('http://www.baidu.com')
 54         response.text
 55         
 56         
 57         pip3 install beautifulsoup4
 58         from bs4 import BeautifulSoup
 59         
 60         soup = BeautifulSoup(response.text,'html.parser')
 61         soup.find(name='h3',attrs={'class':'t'})
 62         soup.find_all(name='h3')
 63         
 64         示例：爬取汽车之家新闻
 65         
 66         
 67     - 模块
 68     
 69         requests
 70             GET:
 71                 requests.get(url="http://www.oldboyedu.com")
 72                 # data="http GET / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 73                 
 74                 requests.get(url="http://www.oldboyedu.com/index.html?p=1")
 75                 # data="http GET /index.html?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 76                 
 77                 requests.get(url="http://www.oldboyedu.com/index.html",params={'p':1})
 78                 # data="http GET /index.html?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n"
 79             
 80             POST:
 81                 requests.post(url="http://www.oldboyedu.com",data={'name':'alex','age':18}) # 默认请求头：application/x-www-form-urlencoded
 82                 data="http POST / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\nname=alex&age=18"
 83                 
 84                 
 85                 requests.post(url="http://www.oldboyedu.com",json={'name':'alex','age':18}) # 默认请求头：application/json
 86                 data="http POST / http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n{"name": "alex", "age": 18}"
 87 
 88                 
 89                 requests.post(
 90                     url="http://www.oldboyedu.com",
 91                     params={'p':1},
 92                     json={'name':'alex','age':18}
 93                 ) # 默认请求头：application/json
 94                 
 95                 data="http POST /?p=1 http1.1\r\nhost:oldboyedu.com\r\n....\r\n\r\n{"name": "alex", "age": 18}"
 96                 
 97                 
 98                 补充：
 99                     request.body,永远有值
100                     request.POST，可能没有值
101                     
102                 
103         beautifulsoup
104             soup = BeautifulSoup('HTML格式字符串','html.parser')
105             
106             tag = soup.find(name='div',attrs={})
107             tags = soup.find_all(name='div',attrs={})
108             
109             
110             tag.find('h3').text
111             tag.find('h3').get('属性名称')
112             tag.find('h3').attrs
113     
114     
115         HTTP请求：
116             GET请求：
117                 data="http GET /index?page=1 http1.1\r\nhost:baidu.com\r\n....\r\n\r\n"
118                 
119                 
120             POST请求：
121                 data="http POST /index?page=1 http1.1\r\nhost:baidu.com\r\n....\r\n\r\nname=alex&age=18"
122                 
123                 
124             socket.sendall(data)
125     
126     
127         示例【github和抽屉】：任何一个不用验证码的网站，通过代码自动登录
128             
129             1. 按理说
130                 r1 = requests.get(url='https://github.com/login')
131                 s1 = beautifulsoup(r1.text,'html.parser')
132                 val = s1.find(attrs={'name':'authenticity_token'}).get('value')
133                 
134                 r2 = requests.post(
135                         url= 'https://github.com/session',
136                         data={
137                             'commit': 'Sign in',
138                             'utf8': '✓',
139                             'authenticity_token': val,
140                             'login':'xxxxx',
141                             'password': 'xxxx',
142                             
143                         }
144                     )
145                     
146                 r2_cookie_dict = r2.cookies.get_dict() # {'session_id':'asdfasdfksdfoiuljksdf'}
147         
148                 保存登陆状态，查看任意URL
149                 
150                 r3 = requests.get(
151                     url='xxxxxxxx',
152                     cookies=r2_cookie_dict
153                 )
154         
155                 print(r3.text) # 登陆成功以后，能够查看的页面
156                 
157             2. 不按理说
158                 r1 = requests.get(url='https://github.com/login')
159                 s1 = beautifulsoup(r1.text,'html.parser')
160                 val = s1.find(attrs={'name':'authenticity_token'}).get('value')
161                 # cookie返回给你
162                 r1_cookie_dict = r1.cookies.get_dict()
163                 
164                 
165                 r2 = requests.post(
166                         url= 'https://github.com/session',
167                         data={
168                             'commit': 'Sign in',
169                             'utf8': '✓',
170                             'authenticity_token': val,
171                             'login':'xxxxx',
172                             'password': 'xxxx',
173                             
174                         },
175                         cookies=r1_cookie_dict
176                     )
177                 # 受权
178                 r2_cookie_dict = r2.cookies.get_dict() # {}
179         
180         
181         
182                 保存登陆状态，查看任意URL
183                 
184                 r3 = requests.get(
185                     url='xxxxxxxx',
186                     cookies=r1_cookie_dict
187                 )
188         
189                 print(r3.text) # 登陆成功以后，能够查看的页面
190             
191             
192     - requests
193         """
194         1. method
195         2. url
196         3. params
197         4. data
198         5. json
199         6. headers
200         7. cookies
201         8. files
202         9. auth
203         10. timeout
204         11. allow_redirects
205         12. proxies
206         13. stream
207         14. cert
208         ================ session,保存请求相关信息（不推荐）===================
209         import requests
210 
211         session = requests.Session()
212 
213         i1 = session.get(url="http://dig.chouti.com/help/service")
214         i2 = session.post(
215             url="http://dig.chouti.com/login",
216             data={
217                 'phone': "8615131255089",
218                 'password': "xxooxxoo",
219                 'oneMonth': ""
220             }
221         )
222         i3 = session.post(
223             url="http://dig.chouti.com/link/vote?linksId=8589523"
224         )
225         print(i3.text)
226 
227         """
228     - beautifulsoup
229         - find()
230         - find_all()
231         - get()
232         - attrs
233         - text
234         
235 内容：
236     1. 示例：汽车之家
237     2. 示例：github和chouti
238     3. requests和beautifulsoup
239     4. 轮询和长轮询
240     5. Django
241         request.POST
242         request.body
243         
244         # content-type:xxxx
245         
246 作业：web微信
247       功能：
248         1. 二维码显示
249         2. 长轮询：check_login
250         3. 
251             - 检测是否已经扫码
252             - 扫码以后201，头像： base64:.....
253             - 点击确认200，response.text     redirect_ur=....
254         4. 可选，获取最近联系人信息
255         
256 安装：
257     twisted
258     scrapy框架
259     
260     
261

武Sir - 笔记

参考：http://www.cnblogs.com/wupeiqi/articles/6283017.html

爬虫相关
	- 基本操做
		- 概要
			- 发送http请求	requests模块
			- 提取指定信息 	正则	Beautifulsoup模块
			- 数据持久化

		- Python的2个模块
			- requests
			- Beautifulsoup

		- Http请求相关知识
			- 请求
				- 请求头 
					- cookie
				- 请求体 
					- 发送的内容
			- 响应 
				- 响应头 
					- 浏览器读取
				- 响应体
					- 看到的内容

			- 特殊
				- cookie
				- csrf_token
				- content-type 用来指定客户端按照哪一种格式进行解析


	- 性能相关
		- 进程
		- 线程
		- 协程

		- 【协程】异步非阻塞：充分利用系统资源


	- scrapy框架
		- 学习scrapy的规则


	- redis&scrapy组件：完成一个简单的分布式爬虫



内容详细
	- 基本操做	Python伪造浏览器发送请求

		pip3 install requests
		pip3 install beautifulsoup4

		import requests
		from bs4 import BeautifulSoup


		response = requests.get("http://www.baidu.com")
		response.text  ->  网页内容

		soup = BeautifulSoup(response.text,'html.parser')

		# 从上到下第一个 <h3 class='t'> 标签
		soup.find(name='h3',attrs={'class':'t'})
		# 查找所有 <h3>标签
		soup.find_all(name='h3')

		...

	模块
		requests
			response = requests.get(url='url路径')
			# 解决乱码问题
			response.encoding = response.apparent_encoding

			GET请求：
				requests.get(url='http://www.baidu.com')
				data = "http GET / ...."
				requests.get(url='http://www.baidu.com/?page=1')
				data = "http GET /?page=1 ...."
				requests.get(url='http://www.baidu.com',params={'page':1})


			POST请求：
				requests.post(url='http://www.baidu.com',data={'name':'alex','age':18}) # 默认携带请求头类型：application/x-www-form-urlencoded

				requests.post(url='http://www.baidu.com',json={'name':'alex','age':18}) # 默认携带请求头类型：application/json

				# POST请求既可以在请求体里传参，又可以在url里传参
				requests.post(url='http://www.baidu.com',params={'page':1},json={'name':'alex','age':18})



				补充：
					django里的 request.POST 里的值是django根据请求体里的数据转换过来的
						因此，如果body里的数据格式不对（例如 content-type 不是表单格式），那么就转换不了，导致request.POST里面没有值
					django里的 request.body 里永远有值
					django里的 request.POST 可能没有值



		BeautifulSoup
			soup = BeautifulSoup('html格式字符串','html.parser')
			tag = soup.find(name='div',attrs={...})
			tag = soup.find_all(name='div',attrs={...})

			tag.find('h3').text
			tag.find('h3').contents
			tag.find('h3').get('属性名称')
			tag.find('h3').attrs['属性名称']







HTTP协议下，服务器端不能主动给客户端发消息
但是WebSocket可以

- 【轮询】     	http协议，客户端轮询（每秒1次）请求服务端；一次请求，服务端收到后不管有没有新消息都立即返回
- 【长轮询】 	http协议，客户端发来请求，服务器把客户端给hang住，直到服务端收到新消息并发送给所有客户端，才断开连接；
				客户端收到消息后，再立即发请求到服务端进行下一次hang住。
				hang住，有一个超时时间，web微信超时时间是25s
				应用：web微信
- 【WebSocket】	不是http协议，建立在TCP之上
				一次连接不断开，双工通道，可以互相发送消息
				但是浏览器兼容性不太好，以后将会应用得更广泛




浏览器有同源策略
ajax发送跨域请求是接收不到结果的





http://www.cnblogs.com/wupeiqi/articles/6283017.html




#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests

# requests.request(method, url, **kwargs) is the generic entry point.  Both
# `method` and `url` are required, so calling it with no arguments raises
# TypeError; the per-verb helpers below are thin wrappers around it.

requests.get(url='xxx')
# ...is equivalent to:
requests.request(method='get', url='xxx')

import json

# A dict passed as `data` is form-encoded and sent with
# Content-Type: application/x-www-form-urlencoded.
requests.post(url='xxx', data={'name': 'alex', 'age': 18})
# A str passed as `data` is sent verbatim; requests does not add a
# Content-Type header for string bodies, so set one yourself if the server
# needs it.
requests.post(url='xxx', data="name=alex&age=18")
# Mismatched: the body is JSON text but no JSON Content-Type is declared.
requests.post(url='xxx', data=json.dumps({'name': 'alex', 'age': 18}))
# Declare the type explicitly through `headers`.  The header name is
# "Content-Type" (hyphen); "Content_type" would be sent as an unrelated
# header and ignored by servers.
requests.post(
    url='xxx',
    data=json.dumps({'name': 'alex', 'age': 18}),
    headers={'Content-Type': 'application/json'},
)
# Preferred: `json=` serialises the object and sets
# Content-Type: application/json automatically.
requests.post(url='xxx', json={'name': 'alex', 'age': 18})


"""
1.method
2.url
3.params
4.data
5.json
6.headers
7.cookies

8.files
9.auth
10.timeout
11.allow_redirects
12.proxies
13.stream
14.cert

=================== session,保存请求相关信息  ==================
session = requests.Session()
session.get(url='xxx')
session.post(...)
"""

"""
8.files 用做文件上传
"""
# Upload a file as multipart/form-data.  The keyword argument is `files`
# (plural); `file=` is not accepted by requests.post and raises TypeError.
# The `with` block makes sure the file handle is closed after the upload.
with open('readme', 'rb') as fp:
    file_dict = {
        'f1': fp
    }
    requests.post(url='xxx', files=file_dict)
# 发送文件，定制文件名
# file_dict = {
#   'f1': ('test.txt', open('readme', 'rb'))
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)

# 发送文件，定制文件名
# file_dict = {
#   'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
# }
# requests.request(method='POST',
# url='http://127.0.0.1:8000/test/',
# files=file_dict)



"""
9.auth  基本认证    路由器登陆
"""
from requests.auth import HTTPBasicAuth,HTTPDigestAuth

requests.get('https://api.github.com/user',auth=HTTPBasicAuth('gypsying','password'))


"""
timeout     (链接超时，响应超时)
"""
# Single number: timeout in seconds for the request.
requests.get('http://google.com',timeout=3)
# Tuple: (connect timeout, read timeout), as described in the note above the calls.
requests.get('http://google.com',timeout=(5,1))


"""
allow_redirects
"""

"""
proxies 应对IP被封的状况
"""
# Example proxy mappings; neither dict is passed to a request in this snippet.
# Keys are URL schemes, values are the proxy to use for that scheme.
proxyDict = {
    "http": "61.172.249.96:80",
    "https": "http://61.185.219.126:3128",
}
# A key of the form scheme://host maps one specific host to a proxy.
proxies = {'http://10.20.1.128': 'http://10.10.1.10:5323'}

"""
stream
"""
from contextlib import closing
# stream=True asks requests not to download the whole body up front;
# closing() calls f.close() when the block exits, even on error.
with closing(requests.get('xxx',stream=True)) as f:
    # NOTE(review): iter_content() is called without chunk_size, whose
    # documented default is 1 byte per iteration -- pass a larger
    # chunk_size for real downloads.
    for i in f.iter_content():
        print(i)




requests.put()
requests.delete()





BeautifulSoup
	- find()
	- find_all()
	- get()
	- attrs
	- text

soup = BeautifulSoup('html格式字符串','html.parser')
soup = BeautifulSoup('html格式字符串',features='lxml')	第三方，需额外安装，可是速度比'html.parser'更快


soup = BeautifulSoup('html格式字符串','html.parser')
tag = soup.find(attrs={'class':'c1'})
tag.name  ->  标签名字

tag = soup.find(attrs={'class':'c1'})
等价于：
tag = soup.find(class_='c1')

print(tag.attrs)

tag.attrs['id'] = 1
del tag.attrs['class']
# attrs 进行增删改查均可以


tag.children  	全部孩子
tag.descendants	全部后代
tag.find_all()	包含的全部标签，而且递归了
tag.find_all(recursive=False)	包含的全部标签，不递归

tag.clear()		清空内部元素，保留本身
tag.decompose()	递归删除全部标签，包含本身
res = tag.extract()	相当于字典的pop：删除并返回该标签，其他同decompose()


tag = soup.find(class_='c1')	# 对象
tag.decode()	# 对象变成字符串
tag.encode()	# 对象变成字节

tag.find('a')
# tag = soup.find('a')
# print(tag)
# tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tag)

find_all()
# tags = soup.find_all('a')
# print(tags)
 
# tags = soup.find_all('a',limit=1)
# print(tags)
 
# tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
# # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
# print(tags)
 
 
# ####### 列表 #######
# v = soup.find_all(name=['a','div'])
# print(v)
 
# v = soup.find_all(class_=['sister0', 'sister'])
# print(v)
 
# v = soup.find_all(text=['Tillie'])
# print(v, type(v[0]))
 
 
# v = soup.find_all(id=['link1','link2'])
# print(v)
 
# v = soup.find_all(href=['link1','link2'])
# print(v)
 
# ####### 正则 #######
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
 
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
 
# rep = re.compile('http://www.oldboy.com/static/.*')
# v = soup.find_all(href=rep)
# print(v)
 
# ####### 方法筛选 #######
# def func(tag):
# return tag.has_attr('class') and tag.has_attr('id')
# v = soup.find_all(name=func)
# print(v)
 
 
# ## get,获取标签属性
# tag = soup.find('a')
# v = tag.get('id')
# print(v)


from bs4.element import Tag

tag.has_attr()
tag.text  等价于 tag.get_text()


v = tag.index(tag.find('div'))


tag.text
tag.string 也能够获取内容，并扩展了修改内容
tag.string = "xxxx"
tag.stripped_strings 逐段返回标签内去除首尾空白后的文本（生成器），相当于把 text 按标签拆分成多段，可用 list() 转成列表
tag.children
for item in tag.children:
	print(item,type(item))




from bs4.element import Tag
tag= Tag(name='i',attrs={'id':'it'})
tag.string = "asasasasasasazxzxzx"


soup.find(id='xxx').append(tag)




""" 扩展copy模块 """
import copy
copy.deepcopy()
...



tag.wrap(tag1)
tag.unwrap()

++++++++++++++++++++++++++++++++++++


内容梳理：
	- 汽车之家新闻爬取示例
	- github和抽屉自动登录  以及 登录后的操作
	- requests 和 BeautifulSoup 基本使用
	- 轮询和长轮询
	- Django 里 content-type问题
		request.POST 
		request.body






练习：web微信
	1. 二维码显示
	2. 长轮询 check_login() ：ajax递归  （js递归没有层数限制）
	3. 检测是否已经扫码
		- 扫码以后201：替换头像 base64:...
		src="img_path"
		或者
		src="base64:xxxxxxxx...."
		- 扫码以后继续轮询，等待用户点击确认
		- 点击确认以后，返回200 
			response.text redirect_uri=....
		- 获取最近联系人信息






下节课前安装
	twisted
	scrapy框架

HTTP协议下，服务器端不能主动给客户端发消息
但是WebSocket可以

- 【轮询】     	http协议，客户端轮询（每秒1次）请求服务端；一次请求，服务端收到后不管有没有新消息都立即返回
- 【长轮询】 	http协议，客户端发来请求，服务器把客户端给hang住，直到服务端收到新消息并发送给所有客户端，才断开连接；
				客户端收到消息后，再立即发请求到服务端进行下一次hang住。
				hang住，有一个超时时间，web微信超时时间是25s
				应用：web微信
- 【WebSocket】	不是http协议，建立在TCP之上
				一次连接不断开，双工通道，可以互相发送消息
				但是浏览器兼容性不太好，以后将会应用得更广泛

1、爬虫几点基础知识

- 基本操做
	- 概要
		- 发送http请求	requests模块
		- 提取指定信息 	正则	Beautifulsoup模块
		- 数据持久化

	- Python的2个模块
		- requests
		- Beautifulsoup

	- Http请求相关知识
		- 请求
			- 请求头 
				- cookie
			- 请求体 
				- 发送的内容
		- 响应 
			- 响应头 
				- 浏览器读取
			- 响应体
				- 看到的内容

		- 特殊
			- cookie
			- csrf_token
			- content-type 用来指定客户端按照哪一种格式进行解析


- 性能相关
	- 进程
	- 线程
	- 协程

	- 【协程】异步非阻塞：充分利用系统资源


- scrapy框架
	- 学习scrapy的规则


- redis&scrapy组件：完成一个简单的分布式爬虫

2、爬取汽车之家新闻示例

#!/usr/bin/python
# -*- coding:utf-8 -*-

"""
爬取汽车之家的新闻
"""
import os
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.autohome.com.cn/news/')
# Decode with the encoding detected from the body; without this the text
# comes out garbled.  (response.text is str, response.content is bytes.)
response.encoding = response.apparent_encoding

# BeautifulSoup turns each HTML tag into an object, so attributes and
# children can be reached with obj.attr style access.
soup = BeautifulSoup(response.text, 'html.parser')
tag = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})

# Downloaded images are written into ./autohome; create it first, otherwise
# the open() below fails when the directory does not exist yet.
os.makedirs('autohome', exist_ok=True)

# find() returns None when the container is missing; treat that as
# "no articles" instead of crashing with AttributeError.
li_list = tag.find_all('li') if tag else []  # [Tag, Tag, ...]
for li in li_list:
    h3 = li.find(name='h3')
    if not h3:
        # Entries without a headline are skipped.
        continue
    print(h3.text)

    # Read an attribute with .get(); link.attrs['href'] also works when the
    # attribute is present.
    link = li.find(name='a')
    if link:
        print(link.get('href'))

    summary = li.find('p')
    if summary:
        print(summary.text)

    # Download the thumbnail, if the entry has one.
    img = li.find('img')
    img_url = img.get('src') if img else None
    if not img_url:
        continue
    print(img_url)
    # The src values seen on this page are protocol-relative ("//host/..."),
    # so a scheme is prepended; URLs that already carry one are used as-is.
    full_url = img_url if img_url.startswith('http') else 'http:' + img_url
    res = requests.get(full_url)
    img_path = os.path.join('autohome', img_url.split('/')[-1])
    with open(img_path, 'wb') as fw:
        fw.write(res.content)

一抹红的专属感 Macan Turbo特别版官图
//www.autohome.com.cn/news/201710/908351.html#pvareaid=102624
[汽车之家 新车官图]  日前，保时捷发布了Macan Turbo Exclusive Performance Edition的官图，做为一款特别版车...
//www3.autoimg.cn/newsdfs/g10/M0F/B2/EA/120x90_0_autohomecar__wKgH0VnqsC6AYGDFAAGFLm8dSfc007.jpg
还要怎么轻？ 路特斯Elise Cup 260官图
//www.autohome.com.cn/news/201710/908350.html#pvareaid=102624
[汽车之家 新车官图]  日前，路特斯官方宣布推出Elise Cup 260，这款车相比于已经进行进一步轻量化改造的新款Cup 250要更轻更快，全球...
//www3.autoimg.cn/newsdfs/g18/M0C/B9/7A/120x90_0_autohomecar__wKgH6FnqrhyAH3UDAAFOwoge9w4751.jpg
...

3、自动登陆网站示例

参考：http://www.cnblogs.com/wupeiqi/articles/6283017.html

　　- .2种网站受权登陆的方式

requests.get()  +  requests.post()

    - 方式1

　　　　1.第一次GET请求获取token

　　　　2.第二次POST请求进行验证并获取cookie

　　　　3.第三次GET/POST请求并携带cookie实现用户登陆后的某些操做

 
    - 方式2

　　　　1.第一次GET请求获取token和未被授权的cookie

　　　　2.第二次POST请求并携带cookie进行验证并授权

　　　　3.第三次GET/POST请求并携带授权过的cookie实现用户登录后的某些操作

另外可以使用 requests.Session() 更简单地实现：

session = requests.Session()

session.get()  + session.post()

　　- .自动登陆Github并浏览我的主页

#!/usr/bin/python
# -*- coding:utf-8 -*-

import requests
from bs4 import BeautifulSoup


"""
第二种Python登陆的cookie携带方式
以登陆 github帐户为例：
    - 第一次去请求 https://github.com/login 这个页面的时候，服务端就给返回了cookie
    - 第二次去请求 https://github.com/session 进行提交用户名密码的时候，要带上上一次返回的cookie进行受权
    - 第三次去请求用户登陆后才能看到的页面（例如我的主页），须要带上上面受权好的cookie，才能够
"""

# Step 1: fetch the login page to obtain the CSRF token and the initial cookies.
rsp1 = requests.get(url='https://github.com/login')
soup1 = BeautifulSoup(rsp1.text, 'html.parser')
# Locate the element by its name attribute, then read its value.
token_input = soup1.find(attrs={'name': 'authenticity_token'})
token = token_input.get('value')
# Cookies handed out by the first request.
rsp1_cookie_dict = rsp1.cookies.get_dict()
print(token)
print(rsp1_cookie_dict)

# Step 2: submit the login form, sending back the cookies from step 1.
login_form = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'gypsying',
    'password': 'xxxxxxxxx',
}
rsp2 = requests.post(
    url='https://github.com/session',
    data=login_form,
    cookies=rsp1_cookie_dict,
)
# Cookies handed out by the login request.
rsp2_cookie_dict = rsp2.cookies.get_dict()
print(rsp2_cookie_dict)

# Merge both cookie sets; values from the login response win on conflict.
all_cookie_dict = {**rsp1_cookie_dict, **rsp2_cookie_dict}

print(all_cookie_dict)

# Step 3: request the profile page with the merged cookies.
rsp3 = requests.get(
    url='https://github.com/Gypsying',
    cookies=all_cookie_dict,
)

soup3 = BeautifulSoup(rsp3.text, 'html.parser')
email_link = soup3.find(name='a', attrs={'class': 'u-email'})
email = email_link.text
print(email)  # the e-mail address shown on the profile page

　　- .自动登陆抽屉并实施点赞操做

import requests
from bs4 import BeautifulSoup

index_url = "http://dig.chouti.com/"
rsp1 = requests.get(index_url)

soup = BeautifulSoup(rsp1.text, 'html.parser')
a_list = soup.find_all(attrs={'class': 'digg-a'})
id_list = []
# Collect the id of every news item on the front page.
for item in a_list:
    id_tag = item.find(name='i')
    if id_tag is None:
        # No <i> child carrying the id; nothing to vote on.
        continue
    id_list.append(id_tag.text)

# Cookie returned by the front-page GET; at this point it is not authorised.
index_cookie = rsp1.cookies.get_dict()
login_url = "http://dig.chouti.com/login"
data = {
    'phone':8600000000000,
    'password':'xxxxxx',
    'oneMonth':1
}
# Submit the credentials together with the unauthorised cookie so that the
# server authorises that cookie.
login_ret = requests.post(url=login_url, data=data, cookies=index_cookie)
login_cookie = login_ret.cookies.get_dict()  # kept for inspection; not used for voting
# Parse the body as JSON.  The response is server-controlled text, so it must
# not be passed to eval(), which would execute whatever the server returned.
login_result = login_ret.json()
code = login_result.get('result', {}).get('code')
if "9999"  == code:
    print("登陆成功")
else:
    print("登陆失败")
"""
{"result":{"code":"8887", "message":"手机号格式不对", "data":""}}
{"result":{"code":"21100", "message":"该手机号未注册", "data":""}}
{"result":{"code":"29998", "message":"手机号或密码错误", "data":{}}}

{"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_50613120077"}}}
"""

# Voting must carry the cookie that was authorised by the login request
# above, i.e. the front-page cookie.
for news_id in id_list:
    like_url = "http://dig.chouti.com/link/vote?linksId={}".format(news_id)
    like_ret = requests.post(url=like_url, cookies=index_cookie)
    print(like_ret.text)

"""
{"result":{"code":"30010", "message":"您已经推荐过了", "data":""}}
{"result":{"code":"9999", "message":"推荐成功", "data":{"jid":"cdu_50613120077","likedTime":"1509378903908000","lvCount":"8","nick":"gypsy","uvCount":"1","voteTime":"小于1分钟前"}}}
"""

4、模拟Web版微信相关操作

"""
微信网页版登陆示例

GET        https://login.wx.qq.com/jslogin?appid=wx782c26e4c19acffb&redirect_uri=https%3A%2F%2Fwx.qq.com%2Fcgi-bin%2Fmmwebwx-bin%2Fwebwxnewloginpage&fun=new&lang=zh_CN&_=1508052025433
获得响应：   window.QRLogin.code = 200; window.QRLogin.uuid = "IapQqsoqcA==";

二维码src   https://login.weixin.qq.com/qrcode/IapQqsoqcA==

长轮询：     https://login.wx.qq.com/cgi-bin/mmwebwx-bin/login?loginicon=true&uuid=IapQqsoqcA==&tip=0&r=-518626217&_=1508052025438
"""