@python
以前写的博客都在csdn和博客园中
现在要将这些博客同步到自己的博客网站中
由于都是使用 Markdown 格式书写的,因此直接爬取下来再上传就可以了
分析了下博客园,发现可行。先登录进入自己的博客主页,可以看到有下一页的标志,每一页包含若干篇博客,每篇博客有 edit 页(编辑页面)和正常页面(其他用户访问的页面)。要获取的就是 edit 页面中的博客标题和博客内容。博客分类在 edit 页面中不好获取,转而去正常页面获取,发现它是一个 ajax 请求,传入用户 id 和博客 id 就能获取到分类。信息获取到之后按分类保存在本地,保存后再使用 post 请求发送到自己的博客网站中,或者直接写入数据库。
从文件读取请求头并生成字典,构造函数传入的是文件名
class getHeaders(object):
    """Load HTTP request headers from a text file into a dict.

    The file holds one ``Name: value`` pair per line. Names that begin
    with a colon (pseudo-headers such as ``:authority``) are supported.
    The parsed mapping is exposed as the ``dict_`` attribute.
    """

    def __init__(self, path):
        """Parse the header file at *path* (read as UTF-8).

        Lines that contain no colon, or that have an empty header name,
        are skipped. When a name occurs more than once, the last
        occurrence wins.
        """
        self.dict_ = {}
        with open(path, "r", encoding="utf8") as f:
            for line in f:
                parts = line.split(":")
                self.clean_(parts)
                if len(parts) >= 2 and parts[0]:
                    self.dict_[parts[0]] = parts[1]

    def clean_(self, list_):
        """Normalise the colon-split fragments of one header line in place.

        With one or two fragments each fragment is simply stripped. With
        more fragments the list is collapsed to ``[name, value]``:

        * an empty first fragment means the line started with a colon, so
          the colon is put back in front of the name;
        * every remaining fragment belongs to the value and is re-joined
          with ``":"``, so URLs, ports and timestamps survive intact.
        """
        if len(list_) <= 2:
            list_[:] = [part.strip() for part in list_]
            return

        if list_[0].strip() == "":
            # The original line began with ":" (e.g. ":authority: host").
            name = ":" + list_[1].strip()
            value_parts = list_[2:]
        else:
            name = list_[0].strip()
            value_parts = list_[1:]

        # Join the raw fragments before stripping so that whitespace
        # inside the value is preserved exactly.
        list_[:] = [name, ":".join(value_parts).strip()]
爬取博客园的代码
import os
import re

import requests
from lxml import etree

from GetHeaders import getHeaders

# cnblogs AJAX endpoint that returns a post's categories; %s is the post id.
catagory_url = "https://www.cnblogs.com/simon-idea/ajax/CategoriesTags.aspx?blogId=xxxxxx&postId=%s"
# Listing-page URL template; %s is the 1-based page number.
page_url = "https://www.cnblogs.com/simon-idea/default.html?page=%s"
# Number of listing pages to walk (pages 1..PAGE_COUNT).
PAGE_COUNT = 8
# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 30
# Folder name used when a post has no category.
DEFAULT_CATEGORY = "未分类"

# Every link found on the listing pages.
link_list = []
# The subset of link_list that points at edit pages.
detail_list = []

heads = getHeaders("博客园").dict_


def _decode(resp):
    """Return the response body as text.

    ``resp.encoding`` is None when the server declares no charset, which
    would make ``bytes.decode`` raise TypeError, so fall back to UTF-8.
    """
    return resp.content.decode(resp.encoding or "utf-8")


def _post_id(edit_url):
    """Extract the post id from an edit-page URL.

    Prefers an explicit ``postid=<digits>`` query parameter and falls back
    to the last eight characters of the URL when none is present.
    """
    match = re.search(r"postid=(\d+)", edit_url, re.IGNORECASE)
    return match.group(1) if match else edit_url[-8:]


def _safe_name(name, default):
    """Make *name* usable as a single path component.

    Characters that are path separators or are rejected by common file
    systems are replaced with ``_``; an empty result becomes *default*.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\r\n]+', "_", name).strip()
    return cleaned or default


# Step 1: collect every link from the listing pages.
for page in range(1, PAGE_COUNT + 1):
    resp = requests.get(page_url % page, headers=heads, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(resp.content)
    link_list.extend(html.xpath('//*[@id="mainContent"]/div/div/div/a/@href'))

# Step 2: keep only the edit-page links.
detail_list.extend(link for link in link_list if "Edit" in link)

# Step 3: download each post and save it under its category.
for edit_url in detail_list:
    resp = requests.get(edit_url, headers=heads, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(_decode(resp))

    titles = html.xpath('//*[@id="Editor_Edit_txbTitle"]/@value')
    if not titles:
        # No title field on the page (for example the login session in the
        # header file has expired); skip instead of crashing the whole run.
        print("skip (no title found): %s" % edit_url)
        continue
    title = titles[0]

    # An empty post yields no text node at all; join so that case gives "".
    body = "".join(html.xpath('//*[@id="Editor_Edit_EditorBody"]/text()'))

    resp = requests.get(
        catagory_url % _post_id(edit_url), headers=heads, timeout=REQUEST_TIMEOUT
    )
    html = etree.HTML(_decode(resp))
    categories = html.xpath('//*[@id="BlogPostCategory"]/a/text()')
    # Posts without a category go to a default folder instead of raising.
    catagory = _safe_name(categories[0] if categories else "", DEFAULT_CATEGORY)

    dirs = '博客/%s' % catagory
    os.makedirs(dirs, exist_ok=True)

    file_name = _safe_name(title, "untitled")
    with open("博客/%s/%s.md" % (catagory, file_name), "w", encoding="utf-8") as f:
        f.write(body)
上传的代码有不少坑,不完善
这是由于原博客作者的博客路径定义有问题
我的博客网站
我的GitHub地址
我的公众号:
app