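# Module setup: pip install pdfminer.six  (the original note ends with the word "encryption" -- presumably a reference to encrypted PDFs)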
"""Extract the text of one PDF with pdfminer.six and append it to a text file.

The script walks the PDF page by page, takes every layout element that
exposes ``get_text``, strips the ``(cid:N)`` markers from its text and
appends whatever non-blank text remains to ``exam3.txt`` (UTF-8, append mode).
"""
import os
import re

from pdfminer.converter import TextConverter, PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# Matches "(cid:123)"-style markers (pdfminer's stand-in for characters it
# could not decode). Compiled once rather than on every layout element.
CID_MARKER = re.compile(r'\(cid:\d+\)')

# Directory that holds the input PDF.
root = r'C:\Users\jiaotianhang\Downloads\pdf'

# Open the PDF. The with-block guarantees the handle is closed even when
# parsing fails part-way through (the original never closed it).
with open(os.path.join(root, 'ghi.pdf'), 'rb') as fp:
    # Parser bound to the open file.
    parser = PDFParser(fp)
    # Document object, linked to the parser.
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    # NOTE: for an encrypted PDF a password has to be supplied at this point;
    # the original script left that step disabled (doc._initialize_password()).

    # Resource manager shared by the device and the interpreter.
    resource = PDFResourceManager()
    # Layout-analysis parameters (library defaults).
    laparam = LAParams()
    # Aggregator device: collects the layout of each processed page.
    device = PDFPageAggregator(resource, laparams=laparam)
    # Page interpreter that feeds the device.
    interpreter = PDFPageInterpreter(resource, device)

    # Open the output once for the whole run instead of once per element.
    with open('exam3.txt', 'a', encoding='utf-8') as fw:
        for page in PDFPage.get_pages(fp):
            # Interpret the page, then fetch the layout the device collected.
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only elements that carry text (e.g. not figures or lines).
                if not hasattr(out, 'get_text'):
                    continue
                ooo = CID_MARKER.sub('', out.get_text())
                # Skip elements that are blank once the markers are removed.
                if ooo.strip():
                    fw.write(ooo)