# python读取pdf为文本
# 作者:互联网
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: An open file-like object holding the PDF data. It is handed
            to ``process_pdf`` as-is and is NOT closed here; the caller owns
            it.

    Returns:
        Everything the ``TextConverter`` wrote into the in-memory buffer
        while the document was processed.
    """
    rsrcmgr = PDFResourceManager()  # resource manager shared by converter and interpreter
    laparams = LAParams()  # layout-analysis parameters (library defaults)
    # The StringIO buffer receives the converted text; the with-block
    # guarantees it is released even when processing fails.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the device before reading the buffer, on success or error.
            device.close()
        # Read the text out before the buffer is closed by the with-block.
        return retstr.getvalue()
# Open the PDF through a file:// URL, extract its text, and print it.
# The with-statement closes the response even if extraction raises.
with urlopen("file:///C:/Users/Administrator/Desktop/爬虫简历.pdf") as pdfFile:
    outputString = readPDF(pdfFile)
print(outputString)
# 标签:读取,python,rsrcmgr,pdfminer,retstr,import,close,pdf,pdfFile
# 来源: https://blog.51cto.com/u_15177056/2725502