#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Fetch a login-protected GitHub page by replaying a logged-in Cookie header."""
import requests

# 1. Use the Cookie field of the HTTP headers to scrape data that requires login.
#    A request that carries the cookies of an already logged-in browser session
#    is treated by the server as coming from that logged-in user.
# NOTE(review): the Cookie value below is a captured session credential that is
# hard-coded in the source; it will expire and should not live in version control.
headers = {
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; _gat=1; user_session=kYlCFIbmw-cQzLHcexbjA365OWA7ecKmWA2sGN4oXTCNx9ae; __Host-user_session_same_site=kYlCFIbmw-cQzLHcexbjA365OWA7ecKmWA2sGN4oXTCNx9ae; logged_in=yes; dotcom_user=czwspider; _gh_sess=cVhpRy95OXJNdE85NWkwMlJST0NkK0oza3A2WEJ0aGxqTWQ0dzFFNHdRMTZMNUxLaHIyMmE2anc4TDh1VFdzT1UycitCbzJ6RHg1U2diYVJkdjU1d1phejk4S1ZKelcrLzFxOUhvb2hOTHZROUZ3RUM5NVN3RDdySjUzeXJQNjNTbUZBc0ZNYW9QdzFmZWFDSnRmd2VnNzMyNzBCOTUyazJudmxWeDRveHRBPS0tTjQ1Y1JFWXZlRFFKbnc3Vko1V3RrZz09--f2e6c23defd0c0e6d470eb2dcb91c7cc2ed54dc4",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}

# This page can only be viewed when the request carries the post-login cookies.
setting_url = "https://github.com/settings/profile"

# requests never times out on its own; without an explicit timeout a stalled
# connection would block this script indefinitely.
response = requests.get(setting_url, headers=headers, timeout=10)

# Save the raw response body so the fetched page can be inspected afterwards.
with open('github_login_01.html', 'wb') as f:
    f.write(response.content)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Simulate a GitHub login.
# 0. Analysis of the login request
'''
Login endpoint
    Request URL:     https://github.com/session
    Request method:  POST
    Request body:
        commit: Sign in
        utf8: ✓
        authenticity_token: FKPt8/jlSD6VqqKbJqQUylCZaArCLMEhyIYWtA12LSzK47nyaPOs8IoIZ04o5AGJiQIc04jX9b0lsWuETzc8+g==
        login: czwspider
        password: qwer1234

When the login request succeeds, take the cookies returned by the server,
keep them, and send them along with every later request.
'''
import requests
import requests.cookies
import requests.utils

# 1. Define the request parameters.
# NOTE(review): the cookie, authenticity_token, login and password below are
# captured/hard-coded credentials; the token and cookie are presumably only
# valid together and for a limited time - confirm before relying on them.
login_url = "https://github.com/session"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; logged_in=no; _gat=1; _gh_sess=TTZZNnVPOTRwTlI2YW1UOWFqTUZwa1JHVENFOVpjYWxoekJLbS9GTGFQQkR4ODZVSTIrV1NTWUFQS0dETUhURmg4YVR3bGxjV1hJT2dMZ2NIOEptZlREUUxrOXV0eG1EcG5kUi9adUZQamFpWmxmMHVhY04vckJVWkxXbkNVa3ZHNCtjekF2WEVnMGVzaElqMnBpVUIvVGVSNzJmdjNQMFFxYWpONE1HMks2eDhpVzZ0Wk9ZQUZLMVJOOTRsYXJWUjV6VUNmRFhyaHlYczUzdUozSWR5M210akh3dkcvaXRhY2ZmanRNRC9IbElUMm5OSmkzVDhtbEwvSEdGWFMvd0xySWIxcFRrbGZ0RDQwQit5eGUvaGZKdWp5U1dYSnZ0VzRRb1FqUThZZXlreTBNU1RicUhheGJGQjFRcDlnN1N2b2RXRXRsT21lRFB5Q3RFSHQ3V1FpR05QSlo2TVBic2o3R0hDaUZOSmhST0l2ZXdabHVzVEdBcGdMRWpiS2lXME5jOWV2WTFJckFGUXI2WHpjeTZ5ZXpXMUJvSXlpdmRpZWNONUhFejFUMD0tLVhyaTZJdlZBMUdNWnBNU0QrWDFFQ1E9PQ%3D%3D--ab7943f35d872df42bb86b8086b5cec50d3ef0ce",
}
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "UFKuL5RE8DTUXAc0cddcawtX3gWADuVQInNPqIacBfESUsUZwZ8jNQ24sYpQVHS6vqFXlci9FqeTV9aZ+wqa+Q==",
    "login": "czwspider",
    "password": "qwer1234",
}

# 2. Send the login request and get the response.
#    An explicit timeout keeps a stalled connection from hanging the script.
response = requests.post(login_url, post_data, headers=headers, timeout=10)

# 3. Collect the cookies issued during login so later requests can reuse them.
#    requests follows redirects by default, and response.cookies only contains
#    the cookies set by the *final* response. Cookies set by an intermediate
#    redirect response live in response.history, so merge every hop; otherwise
#    the session cookie can be lost before the next request is made.
#    (requests.utils.dict_from_cookiejar(login_cookies) converts the jar to a
#    plain dict if you want to print or inspect it.)
login_cookies = requests.cookies.RequestsCookieJar()
for hop in response.history + [response]:
    login_cookies.update(hop.cookies)

# 4. Scrape the login-protected page with the collected cookies.
setting_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}
setting_url = "https://github.com/settings/profile"
settting_resp = requests.get(
    setting_url,
    headers=setting_headers,
    cookies=login_cookies,
    timeout=10,
)

print(settting_resp.status_code)

# Save the raw response body so the fetched page can be inspected afterwards.
with open('github_login_02.html', 'wb') as f:
    f.write(settting_resp.content)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Simulate a GitHub login.
# 0. Analysis of the login request
'''
Login endpoint
    Request URL:     https://github.com/session
    Request method:  POST
    Request body:
        commit: Sign in
        utf8: ✓
        authenticity_token: FKPt8/jlSD6VqqKbJqQUylCZaArCLMEhyIYWtA12LSzK47nyaPOs8IoIZ04o5AGJiQIc04jX9b0lsWuETzc8+g==
        login: czwspider
        password: qwer1234

When the login request succeeds, take the cookies returned by the server,
keep them, and send them along with every later request.
'''
import requests
import requests.utils

# 1. Define the request parameters.
# NOTE(review): the cookie, authenticity_token, login and password below are
# captured/hard-coded credentials; the token and cookie are presumably only
# valid together and for a limited time - confirm before relying on them.
login_url = "https://github.com/session"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
    "Cookie": "_ga=GA1.2.1855430798.1461857641; _octo=GH1.1.783519559.1525492869; tz=Asia%2FShanghai; logged_in=no; _gat=1; _gh_sess=TTZZNnVPOTRwTlI2YW1UOWFqTUZwa1JHVENFOVpjYWxoekJLbS9GTGFQQkR4ODZVSTIrV1NTWUFQS0dETUhURmg4YVR3bGxjV1hJT2dMZ2NIOEptZlREUUxrOXV0eG1EcG5kUi9adUZQamFpWmxmMHVhY04vckJVWkxXbkNVa3ZHNCtjekF2WEVnMGVzaElqMnBpVUIvVGVSNzJmdjNQMFFxYWpONE1HMks2eDhpVzZ0Wk9ZQUZLMVJOOTRsYXJWUjV6VUNmRFhyaHlYczUzdUozSWR5M210akh3dkcvaXRhY2ZmanRNRC9IbElUMm5OSmkzVDhtbEwvSEdGWFMvd0xySWIxcFRrbGZ0RDQwQit5eGUvaGZKdWp5U1dYSnZ0VzRRb1FqUThZZXlreTBNU1RicUhheGJGQjFRcDlnN1N2b2RXRXRsT21lRFB5Q3RFSHQ3V1FpR05QSlo2TVBic2o3R0hDaUZOSmhST0l2ZXdabHVzVEdBcGdMRWpiS2lXME5jOWV2WTFJckFGUXI2WHpjeTZ5ZXpXMUJvSXlpdmRpZWNONUhFejFUMD0tLVhyaTZJdlZBMUdNWnBNU0QrWDFFQ1E9PQ%3D%3D--ab7943f35d872df42bb86b8086b5cec50d3ef0ce",
}
post_data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "UFKuL5RE8DTUXAc0cddcawtX3gWADuVQInNPqIacBfESUsUZwZ8jNQ24sYpQVHS6vqFXlci9FqeTV9aZ+wqa+Q==",
    "login": "czwspider",
    "password": "qwer1234",
}

setting_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36",
}
setting_url = "https://github.com/settings/profile"

# Issue both requests through one Session object: the session records the
# cookies returned by the server and sends them on later requests, so the code
# does not have to pass cookies around itself. The `with` block closes the
# session (and its pooled connections) once both requests are done.
with requests.Session() as session:
    # 2. Send the login request and get the response.
    #    An explicit timeout keeps a stalled connection from hanging the script.
    response = session.post(login_url, post_data, headers=headers, timeout=10)

    # 3. The cookies from the login exchange are now stored on the session.
    #    (requests.utils.dict_from_cookiejar(session.cookies) converts the jar
    #    to a plain dict if you want to print or inspect it.)

    # 4. Scrape the login-protected page; the session attaches its cookies.
    settting_resp = session.get(setting_url, headers=setting_headers, timeout=10)

print(settting_resp.status_code)

# Save the raw response body so the fetched page can be inspected afterwards.
with open('github_login_03.html', 'wb') as f:
    f.write(settting_resp.content)