python爬虫——利用 session 处理登陆状态 github 登陆实例


Cookie

Cookies是服务器在本地机器上存储的小段文本,并随每个请求发送至同一个服务器。

Session

session机制是一种服务器端的机制,服务器使用一种类似于散列表的结构(也可能就是使用散列表)来保存信息。

使用Cookies直接访问需要登陆的页面获取数据
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Fetch a login-protected GitHub page by replaying a logged-in Cookie header.

The response body is written unchanged to ``github_login_01.html``.
"""

import requests

# Seconds to wait for the server before giving up; without a timeout
# requests may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# 1. Use the Cookie field of the HTTP headers to fetch data as a logged-in user.
#
# A request that carries the cookies of an authenticated browser session is
# treated by the server as already logged in.
# NOTE(review): the Cookie value was captured from a browser session and is
# presumably short-lived -- replace it with a fresh one before running.
headers = {
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; _gat=1; user_session=kYlCFIbmw-cQzLHcexbjA365OWA7ecKmWA2sGN4oXTCNx9ae; __Host-user_session_same_site=kYlCFIbmw-cQzLHcexbjA365OWA7ecKmWA2sGN4oXTCNx9ae; logged_in=yes; dotcom_user=czwspider; _gh_sess=cVhpRy95OXJNdE85NWkwMlJST0NkK0oza3A2WEJ0aGxqTWQ0dzFFNHdRMTZMNUxLaHIyMmE2anc4TDh1VFdzT1UycitCbzJ6RHg1U2diYVJkdjU1d1phejk4S1ZKelcrLzFxOUhvb2hOTHZROUZ3RUM5NVN3RDdySjUzeXJQNjNTbUZBc0ZNYW9QdzFmZWFDSnRmd2VnNzMyNzBCOTUyazJudmxWeDRveHRBPS0tTjQ1Y1JFWXZlRFFKbnc3Vko1V3RrZz09--f2e6c23defd0c0e6d470eb2dcb91c7cc2ed54dc4",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}

# This page is only served to requests that carry post-login cookies.
setting_url = "https://github.com/settings/profile"

response = requests.get(setting_url, headers=headers, timeout=REQUEST_TIMEOUT)

# Save the raw bytes so the page can be inspected in a browser afterwards.
with open('github_login_01.html', 'wb') as f:
    f.write(response.content)
使用requests模拟登陆获取数据
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Simulate a GitHub login and hand the resulting cookies on manually.

Login analysis:
    Request URL:    https://github.com/session
    Request method: POST
    Form fields:    commit, utf8, authenticity_token, login, password

When the login request succeeds, the cookies returned by the server are
kept and sent along with every later request.  The settings page fetched
with those cookies is written to ``github_login_02.html``.
"""

import requests
import requests.utils

# Seconds to wait for the server before giving up; without a timeout
# requests may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# 1. Define the request parameters.
login_url = "https://github.com/session"

# NOTE(review): the Cookie header and the authenticity_token below were
# captured from a browser session and are presumably tied to each other and
# short-lived -- refresh both before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; logged_in=no; _gat=1; _gh_sess=TTZZNnVPOTRwTlI2YW1UOWFqTUZwa1JHVENFOVpjYWxoekJLbS9GTGFQQkR4ODZVSTIrV1NTWUFQS0dETUhURmg4YVR3bGxjV1hJT2dMZ2NIOEptZlREUUxrOXV0eG1EcG5kUi9adUZQamFpWmxmMHVhY04vckJVWkxXbkNVa3ZHNCtjekF2WEVnMGVzaElqMnBpVUIvVGVSNzJmdjNQMFFxYWpONE1HMks2eDhpVzZ0Wk9ZQUZLMVJOOTRsYXJWUjV6VUNmRFhyaHlYczUzdUozSWR5M210akh3dkcvaXRhY2ZmanRNRC9IbElUMm5OSmkzVDhtbEwvSEdGWFMvd0xySWIxcFRrbGZ0RDQwQit5eGUvaGZKdWp5U1dYSnZ0VzRRb1FqUThZZXlreTBNU1RicUhheGJGQjFRcDlnN1N2b2RXRXRsT21lRFB5Q3RFSHQ3V1FpR05QSlo2TVBic2o3R0hDaUZOSmhST0l2ZXdabHVzVEdBcGdMRWpiS2lXME5jOWV2WTFJckFGUXI2WHpjeTZ5ZXpXMUJvSXlpdmRpZWNONUhFejFUMD0tLVhyaTZJdlZBMUdNWnBNU0QrWDFFQ1E9PQ%3D%3D--ab7943f35d872df42bb86b8086b5cec50d3ef0ce",
}
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "UFKuL5RE8DTUXAc0cddcawtX3gWADuVQInNPqIacBfESUsUZwZ8jNQ24sYpQVHS6vqFXlci9FqeTV9aZ+wqa+Q==",
    "login": "czwspider",
    "password": "qwer1234",
}

# 2. Send the login request and keep its response.
#
# Redirects are deliberately NOT followed: Response.cookies only contains the
# cookies set by that single response, so after a followed redirect the
# cookies issued by the login response itself would sit in response.history
# and be missing from response.cookies.
response = requests.post(
    login_url,
    data=post_data,
    headers=headers,
    timeout=REQUEST_TIMEOUT,
    allow_redirects=False,
)

# 3. Take the cookies from the login response and reuse them from now on.
# For inspection, requests.utils.dict_from_cookiejar(response.cookies)
# converts the cookie jar into a plain dict.

# 4. Fetch the protected page, passing the login cookies explicitly.
setting_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}
setting_url = "https://github.com/settings/profile"
setting_resp = requests.get(
    setting_url,
    headers=setting_headers,
    cookies=response.cookies,
    timeout=REQUEST_TIMEOUT,
)

print(setting_resp.status_code)

# Save the raw bytes so the page can be inspected in a browser afterwards.
with open('github_login_02.html', 'wb') as f:
    f.write(setting_resp.content)
使用session对象进行登陆获取数据
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Simulate a GitHub login with a requests Session that tracks cookies.

Login analysis:
    Request URL:    https://github.com/session
    Request method: POST
    Form fields:    commit, utf8, authenticity_token, login, password

The Session object records the cookies returned by the server and sends
them with every later request made through it, so the script never handles
cookies itself.  The settings page is written to ``github_login_03.html``.
"""

import requests
import requests.utils

# Seconds to wait for the server before giving up; without a timeout
# requests may block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# 1. Define the request parameters.
login_url = "https://github.com/session"

# NOTE(review): the Cookie header and the authenticity_token below were
# captured from a browser session and are presumably tied to each other and
# short-lived -- refresh both before running.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; logged_in=no; _gat=1; _gh_sess=TTZZNnVPOTRwTlI2YW1UOWFqTUZwa1JHVENFOVpjYWxoekJLbS9GTGFQQkR4ODZVSTIrV1NTWUFQS0dETUhURmg4YVR3bGxjV1hJT2dMZ2NIOEptZlREUUxrOXV0eG1EcG5kUi9adUZQamFpWmxmMHVhY04vckJVWkxXbkNVa3ZHNCtjekF2WEVnMGVzaElqMnBpVUIvVGVSNzJmdjNQMFFxYWpONE1HMks2eDhpVzZ0Wk9ZQUZLMVJOOTRsYXJWUjV6VUNmRFhyaHlYczUzdUozSWR5M210akh3dkcvaXRhY2ZmanRNRC9IbElUMm5OSmkzVDhtbEwvSEdGWFMvd0xySWIxcFRrbGZ0RDQwQit5eGUvaGZKdWp5U1dYSnZ0VzRRb1FqUThZZXlreTBNU1RicUhheGJGQjFRcDlnN1N2b2RXRXRsT21lRFB5Q3RFSHQ3V1FpR05QSlo2TVBic2o3R0hDaUZOSmhST0l2ZXdabHVzVEdBcGdMRWpiS2lXME5jOWV2WTFJckFGUXI2WHpjeTZ5ZXpXMUJvSXlpdmRpZWNONUhFejFUMD0tLVhyaTZJdlZBMUdNWnBNU0QrWDFFQ1E9PQ%3D%3D--ab7943f35d872df42bb86b8086b5cec50d3ef0ce",
}
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "UFKuL5RE8DTUXAc0cddcawtX3gWADuVQInNPqIacBfESUsUZwZ8jNQ24sYpQVHS6vqFXlci9FqeTV9aZ+wqa+Q==",
    "login": "czwspider",
    "password": "qwer1234",
}

setting_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}
setting_url = "https://github.com/settings/profile"

# All requests go through one Session so that cookies set by the login
# response are replayed automatically.  The context manager closes the
# session's pooled connections when the block exits.
with requests.Session() as session:
    # 2. Send the login request; the session stores any cookies it returns.
    response = session.post(
        login_url,
        data=post_data,
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )

    # 3. No manual cookie handling is needed here.  For inspection,
    # requests.utils.dict_from_cookiejar(session.cookies) converts the
    # session's cookie jar into a plain dict.

    # 4. Fetch the protected page through the same session.
    setting_resp = session.get(
        setting_url,
        headers=setting_headers,
        timeout=REQUEST_TIMEOUT,
    )

print(setting_resp.status_code)

# Save the raw bytes so the page can be inspected in a browser afterwards.
with open('github_login_03.html', 'wb') as f:
    f.write(setting_resp.content)