pdf 转 txt

模块准备：pip install pdfminer.six（加密）

 

 

"""Convert a PDF file to plain text with pdfminer.six.

Reads ``ghi.pdf`` from ``root``, runs layout analysis on every page and
appends the text of each text-bearing layout element to ``exam3.txt``,
after removing any ``(cid:N)`` markers from it.
"""
import re

from pdfminer.converter import PDFPageAggregator, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

root = r'C:\Users\jiaotianhang\Downloads\pdf'

# Compiled once so the pattern is not looked up again for every element.
CID_MARKER = re.compile(r'\(cid:\d+\)')

# Open the PDF document; the context manager guarantees the handle is
# closed even if parsing fails part-way through.
with open('%s/%s' % (root, 'ghi.pdf'), 'rb') as fp:
    # Create a parser bound to the file.
    parser = PDFParser(fp)
    # Create the document object and link it with the parser.
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    # For an encrypted PDF, supply the password at this point:
    # doc._initialize_password()

    # Resource manager shared by the device and the interpreter.
    resource = PDFResourceManager()
    # Layout-analysis parameters (library defaults).
    laparam = LAParams()
    # Aggregator device that collects the layout of each processed page.
    device = PDFPageAggregator(resource, laparams=laparam)
    # Page interpreter that feeds page content into the device.
    interpreter = PDFPageInterpreter(resource, device)

    # Open the output once in append mode instead of re-opening it for
    # every element. Note: this creates exam3.txt even when the PDF
    # yields no text at all.
    with open('exam3.txt', 'a', encoding='utf-8') as fw:
        # Iterate over the pages read from the file handle.
        for page in PDFPage.get_pages(fp):
            # Let the interpreter process the page ...
            interpreter.process_page(page)
            # ... then fetch the resulting layout from the aggregator.
            layout = device.get_result()
            for out in layout:
                # Only elements exposing get_text() carry text.
                if not hasattr(out, 'get_text'):
                    continue
                ooo = CID_MARKER.sub('', out.get_text())
                # Skip elements that are empty once the markers are removed.
                if ooo.strip():
                    fw.write(ooo)
相关文章
相关标签/搜索