At my girlfriend's request, and so she could keep up with technology trends in a timely way, I wrote this crawler: every day at a scheduled time it fetches the cnblogs (博客园) homepage post list and sends it to WeChat.
Environment: Python 3.4
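The script depends on four third-party packages, all visible in the imports below (requests, beautifulsoup4, wxpy, schedule). Assuming a standard pip setup, something like the following should pull them in:

pip install requests beautifulsoup4 wxpy schedule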
# -*- coding: utf-8 -*-
import time

import requests
import schedule
from bs4 import BeautifulSoup as bs
from wxpy import *

# Log in to WeChat; cache_path=True caches the login session so the
# QR code only needs to be scanned on the first run.
bot = Bot(cache_path=True)


# Fetch one page of the cnblogs homepage post list.
def getHtml(pageIndex):
    # Request headers: pretend to be a browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/56.0.2924.87 Safari/537.36'}
    # PageIndex is the page number to request.
    payload = {'CategoryType': 'SiteHome',
               'ParentCategoryId': '0',
               'CategoryId': '808',
               'PageIndex': pageIndex,
               'TotalPostCount': '4000'}
    try:
        r = requests.post('https://www.cnblogs.com/mvc/AggSite/PostList.aspx',
                          data=payload, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        return str(e)


# Send a message to the WeChat file transfer helper.
def sendblogmsg(content):
    # To send to a friend instead, search by name:
    # my_friend = bot.friends().search('')[0]
    my_friend = bot.file_helper
    my_friend.send(content)


def job():
    contents = ''
    # i is the current page number; scrape the first two pages.
    for i in range(1, 3):
        html = getHtml(i)
        soup = bs(html, 'html.parser')
        blogs = soup.find_all('div', {'class': 'post_item_body'})
        for blog in blogs:
            title = blog.find('h3').get_text()
            # The summary is extracted but not included in the message.
            summary = blog.find('p', {'class': 'post_item_summary'}).get_text()
            link = blog.find('a', {'class': 'titlelnk'})['href']
            # Message lines: 标题 = title, 链接 = link.
            contents += '标题:' + title + '\n链接:' + link + '\n-----------\n'
    sendblogmsg(contents)


# Run the job every day at 06:00; the polling loop below blocks
# forever, which also keeps the WeChat session alive.
schedule.every().day.at("06:00").do(job)
while True:
    schedule.run_pending()
    time.sleep(1)
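Since wxpy needs a QR-code login, it can be handy to verify the scraping half in isolation first. The sketch below is my own test harness, not part of the script above: it reuses the same endpoint and selectors but just prints title/link pairs to the console, with no WeChat involved.

# -*- coding: utf-8 -*-
# Standalone check of the scraping logic: fetch one page of the
# cnblogs post list and print each title with its link.
import requests
from bs4 import BeautifulSoup


def fetch_page(page_index):
    headers = {'User-Agent': 'Mozilla/5.0'}
    payload = {'CategoryType': 'SiteHome', 'ParentCategoryId': '0',
               'CategoryId': '808', 'PageIndex': page_index,
               'TotalPostCount': '4000'}
    r = requests.post('https://www.cnblogs.com/mvc/AggSite/PostList.aspx',
                      data=payload, headers=headers)
    r.raise_for_status()
    return r.text


if __name__ == '__main__':
    soup = BeautifulSoup(fetch_page(1), 'html.parser')
    for item in soup.find_all('div', {'class': 'post_item_body'}):
        a = item.find('a', {'class': 'titlelnk'})
        print(a.get_text(), '->', a['href'])

Once this prints a sensible list, the only remaining moving parts are the wxpy login and the schedule loop.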