This program takes a keyword (for example, CSDN), finds every official account whose name contains that keyword, and then crawls all articles under each account one by one. That is the ideal case, of course: Tencent's anti-crawling is no joke, and it will simply ban your account. Each platform account is tied to a personal WeChat ID (logging in to the official-account platform requires scanning a QR code with WeChat), so I do not yet know how to work around the ban.
With continuous crawling, an account gets banned for roughly an hour each time. You get a few such chances per day; after that the ban lasts 24 hours. So on a good day you can crawl a few thousand WeChat articles. Naturally, this is purely a technical exchange: the crawled articles must not be used commercially, because of copyright. Respecting copyright is a mark of progress.
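If you want to lower the ban risk rather than just wait out the penalty, one common mitigation is to randomize the delay between requests instead of sleeping a fixed two seconds. A minimal sketch, assuming nothing beyond the standard library (the helper name wait_politely and the bounds are my own, not part of the original code):

import random
import time

def wait_politely(min_s=3.0, max_s=10.0):
    # Sleep a random interval between requests so the traffic looks
    # less mechanical. The bounds are illustrative; tune them against
    # your own observed ban rate.
    time.sleep(random.uniform(min_s, max_s))

You would call wait_politely() wherever the code below currently calls time.sleep(2).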
The entry point is your own WeChat official account: when composing an article on the platform there is a hyperlink-insertion feature, and that is the way in. Plenty of detailed walkthroughs exist online, so I will not repeat them here.
My code limits the crawl to at most 20 pages of articles per official account (you can change this in the code), so that more accounts can be covered.
If you need to keep crawling a particular account after yours gets banned, record which page of the account list and which page of that account's articles you had reached; next time, set those page numbers in the code and the crawl resumes from there. A sketch of how to automate this follows.
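Rather than editing the source by hand each time, you could also persist the progress automatically and read it back on the next run. A minimal sketch, assuming a progress.json file of my own invention (the original code does not do this):

import json

PROGRESS_FILE = "progress.json"   # hypothetical file name, not in the original code

def save_progress(account_page, article_page):
    # Record how far the crawl got before the ban hit.
    with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
        json.dump({"account_page": account_page, "article_page": article_page}, f)

def load_progress():
    # Return the recorded pages, or page 0 for a fresh start.
    try:
        with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
            saved = json.load(f)
        return saved["account_page"], saved["article_page"]
    except FileNotFoundError:
        return 0, 0

With this in place, the two paging loops in the code below would start their range() from the loaded page numbers instead of 0.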
Enough talk; the remaining details are all in the code.
I promise that if you take the time to work through the code below, it will not disappoint you when it comes to crawling WeChat official accounts.
# Crawl WeChat official accounts site-wide by keyword
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import json
import requests
import re
import random
from pyquery import PyQuery
from requests import RequestException
import csv

wechat_url = "https://mp.weixin.qq.com/"                             # official-account platform home page
account = "***************"                                          # your own platform login account
password = "***************"                                         # your own platform login password
wechat_official_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz?"  # account-search endpoint (account list)
header = {
    "Host": "mp.weixin.qq.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
}
article_url = "https://mp.weixin.qq.com/cgi-bin/appmsg?"             # article endpoint (article list)
data_count = 1
ip_pool = list()
proxy_ip = None

# Read the proxy-IP file and load all addresses
def get_ip_pool():
    with open("proxy_ip.txt", "r", encoding="utf-8") as f:
        content = f.readlines()
    for i in range(len(content)):
        content[i] = content[i].rstrip("\n").replace('\"', "")
        ip_pool.append(content[i])

# Pick a random proxy IP from the pool
def get_proxy_ip():
    global proxy_ip
    proxy_ip = ip_pool[random.randint(0, len(ip_pool) - 1)]

# Open the platform home page, log in, and save the cookies for reuse
def open_wechat_official():
    browser = webdriver.Chrome()
    wait = WebDriverWait(browser, 2)
    try:
        post = {}
        browser.get(wechat_url)
        account_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
            "#header > div.banner > div > div > form > div.login_input_panel > div:nth-child(1) > div > span > input")))
        account_input.clear()
        account_input.send_keys(account)
        password_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,
            "#header > div.banner > div > div > form > div.login_input_panel > div:nth-child(2) > div > span > input")))
        password_input.clear()
        password_input.send_keys(password)
        time.sleep(2)
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
            "#header > div.banner > div > div > form > div.login_btn_panel > a")))
        submit.click()
        time.sleep(15)   # wait for the user to scan the QR code with a phone
        # Persist the cookies to a file for long-term use
        cookies_items = browser.get_cookies()
        for cookie_item in cookies_items:
            post[cookie_item["name"]] = cookie_item["value"]
        cookie_str = json.dumps(post)
        with open("cookie.txt", "w+", encoding="utf-8") as f:
            f.write(cookie_str)
    except TimeoutException:
        open_wechat_official()

# Start the crawl: load the saved cookies and extract the session token
def get_token_and_cookie():
    with open("cookie.txt", "r", encoding="utf-8") as f:
        cookie = f.read()
    cookies = json.loads(cookie)
    response = requests.get(url=wechat_url, headers=header, cookies=cookies)
    pattern = re.compile(r"token=(\d+)", re.S)
    token = re.search(pattern, str(response.url))[1]
    return cookies, token

# Query the account-search endpoint with the keyword and yield each account's fakeid
def get_wechat_fakeid(cookies, token, keyword):
    query_id = {
        "action": "search_biz",
        "token": token,
        "lang": "zh_CN",
        "f": "json",
        "ajax": 1,
        "random": random.random(),
        "query": keyword,
        "begin": "0",    # pagination offset
        "count": "5"
    }
    search_response = requests.get(wechat_official_url, cookies=cookies, headers=header, params=query_id)
    total_official = search_response.json().get("total")   # total number of accounts, 5 per page
    print("There are %d official accounts in total" % total_official)
    # Round up so the last partial page is included
    for num in range((int(total_official) + 4) // 5):
        print("Page %d of accounts" % num)
        begin = num * 5
        query_id = {
            "action": "search_biz",
            "token": token,
            "lang": "zh_CN",
            "f": "json",
            "ajax": 1,
            "random": random.random(),
            "query": keyword,
            "begin": str(begin),   # pagination offset
            "count": "5"
        }
        search_response = requests.get(wechat_official_url, cookies=cookies, headers=header, params=query_id)
        for wechat_list in search_response.json().get("list"):
            fakeid = wechat_list.get("fakeid")
            nickname = wechat_list.get("nickname")   # display name
            alias = wechat_list.get("alias")         # WeChat ID
            print("Account name: %s" % nickname)
            yield {
                "fakeid": fakeid,
                "nickname": nickname,
                "alias": alias
            }
        print("Turning to the next page of accounts...")
        time.sleep(2)

# Enter a specific account and yield its article list
def input_wechat_get_article(cookies, token, fakeid):
    query_id = {
        "token": token,
        "lang": "zh_CN",
        "f": "json",
        "ajax": 1,
        "random": random.random(),
        "action": "list_ex",
        "begin": 0,      # pagination offset
        "count": 5,
        "query": "",
        "fakeid": fakeid,
        "type": 9
    }
    appmsg_response = requests.get(article_url, cookies=cookies, headers=header, params=query_id)
    total_article = appmsg_response.json().get("app_msg_cnt")   # total articles, 5 per page
    if total_article:
        print("This account has %d articles" % total_article)
        # Round up so the last partial page is included
        total_page = (int(total_article) + 4) // 5
        for num in range(total_page):
            if num >= 20:   # cap at 20 pages per account; change here to crawl more
                break
            print("Page %d of this account's articles" % num)
            begin = num * 5
            query_id = {
                "token": token,
                "lang": "zh_CN",
                "f": "json",
                "ajax": 1,
                "random": random.random(),
                "action": "list_ex",
                "begin": str(begin),   # pagination offset
                "count": 5,
                "query": "",
                "fakeid": fakeid,
                "type": 9
            }
            appmsg_response = requests.get(article_url, cookies=cookies, headers=header, params=query_id)
            if appmsg_response.json().get("base_resp").get("err_msg") == "ok":
                for article in appmsg_response.json().get("app_msg_list"):
                    article_link_url = article.get("link")
                    article_title = article.get("title")
                    article_time = article.get("update_time")
                    yield {
                        "article_link_url": article_link_url,
                        "article_title": article_title,
                        "article_time": article_time
                    }
                print("Turning to the next page of articles...")
                time.sleep(2)
            else:
                time.sleep(4000)   # request refused, likely rate-limited: back off for a long time

# Fetch the raw HTML of one article
def get_article_html(url):
    try:
        response = requests.get(url, headers=header)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None

# Extract the article body text from the HTML
def parse_article_content(html):
    doc = PyQuery(html)
    content = doc(".rich_media_content p").text()
    content = content.replace("\r\n", "")
    content = content.replace("\n", "")
    return content

# Save one record in txt format
def save_to_file(content):
    with open("wechat_official.txt", "a+", encoding="utf-8") as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")

# Write the CSV header row
def write_title_file():
    with open("wechat_official.csv", "a+", encoding="utf-8-sig", newline="") as f:
        wea_for = csv.writer(f, delimiter=",")
        wea_for.writerow(["nickname", "alias", "title", "date", "content"])

# Append one record to the CSV file
def write_content_file(content):
    with open("wechat_official.csv", "a+", encoding="utf-8-sig", newline="") as f:
        wea_for = csv.writer(f, delimiter=",")
        wea_for.writerow([content["nickname"], content["alias"], content["title"],
                          content["date"], content["content"]])

def main():
    global data_count
    open_wechat_official()   # refresh cookies; running this once a day is usually enough
    write_title_file()
    cookies, token = get_token_and_cookie()
    keyword = input("Enter the keyword to crawl: ")
    for wechat_official in get_wechat_fakeid(cookies, token, keyword):
        fakeid = wechat_official["fakeid"]
        nickname = wechat_official["nickname"]
        alias = wechat_official["alias"]
        for wechat_article in input_wechat_get_article(cookies, token, fakeid):
            article_link_url = wechat_article["article_link_url"]
            article_title = wechat_article["article_title"]
            article_time = wechat_article["article_time"]
            date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(article_time))
            html = get_article_html(article_link_url)
            if html:
                content = parse_article_content(html)
                crawl_content = {
                    "nickname": nickname,   # display name
                    "alias": alias,         # WeChat ID
                    "title": article_title,
                    "date": date,
                    "content": content
                }
                write_content_file(crawl_content)
                # save_to_file(crawl_content)
                print("Fetched record %d" % data_count)
                data_count += 1

if __name__ == "__main__":
    main()
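Note that get_ip_pool() and get_proxy_ip() are defined but never called, so all requests go out from your own IP. If you want to route requests through the loaded proxies, something like the following would work, assuming each line of proxy_ip.txt holds an address in host:port form (that format is my guess; the original code never actually reads the file):

# Sketch: route one request through a randomly chosen proxy.
# Assumes proxy_ip holds "host:port"; adjust the scheme if your list differs.
get_ip_pool()
get_proxy_ip()
proxies = {
    "http": "http://" + proxy_ip,
    "https": "http://" + proxy_ip,
}
search_response = requests.get(wechat_official_url, cookies=cookies,
                               headers=header, params=query_id, proxies=proxies)

The same proxies argument could be passed to every requests.get call above; rotating the proxy on each request would spread the load across the pool.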
The output looks like this:
.........
When you notice that the fetched data stops updating, stop the program immediately. I did build sleep times into the code, but if you simply wait out the sleep and keep crawling, the next ban is very likely to last 24 hours. So just note which page of the account list you are on, set that page number in the code next time, and the crawl continues. If you are only crawling a single account, the place to change is different, though the principle is the same; read the code carefully before adjusting it. There may well be better ways around the account-ban problem, but I do not know them; after all, I am just a beginner.
Finally, here is a sample of the crawled data:
Comments and suggestions are welcome; let's learn from each other.