python批量把pdf转为txt
我的版本:python 3
-
首先安装 pdfminer(Python 3 对应的包名是 pdfminer3k):
pip install pdfminer3k
代码部分
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
import os
# Input directory: put the PDF files to convert here.
pdfDir = 'E:/APythonDownloadTxt/PDF/'
# Output directory: the converted .txt files are written here.
txtDir = 'E:/APythonDownloadTxt/TXT/'
# Read a PDF and return its text
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdfFile: A PDF file object opened in binary mode. It is handed
            straight to ``process_pdf`` and is NOT closed here; the caller
            owns it.

    Returns:
        Everything the ``TextConverter`` wrote into the in-memory buffer
        while the document was processed.
    """
    rsrcmgr = PDFResourceManager()
    # StringIO is a context manager, so the buffer is released even when
    # parsing fails part-way through.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Always release the converter, including on a malformed PDF.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
# Write the extracted text to a .txt file
def saveTxt(txt, fileName, outDir=None):
    """Write ``txt`` to ``<outDir>/<fileName>.txt`` encoded as UTF-8.

    Args:
        txt: The text to store.
        fileName: Output file name without the ``.txt`` extension.
        outDir: Directory to write into. Defaults to the module-level
            ``txtDir`` so existing two-argument calls behave as before.
    """
    if outDir is None:
        outDir = txtDir
    # An explicit encoding is required: the platform default (e.g. a legacy
    # code page on Windows) cannot represent every character a PDF may
    # contain and would raise UnicodeEncodeError.
    with open(os.path.join(outDir, fileName + '.txt'), "w", encoding="utf-8") as f:
        f.write(txt)
# Run the batch conversion
# Convert every PDF found in pdfDir into a .txt file in txtDir.
fileList = os.listdir(pdfDir)
print('文件总计:', len(fileList))
for fileName in fileList:
    lowerName = fileName.lower()
    print(lowerName)
    # Only names that END in ".pdf" are PDFs; a substring search would also
    # pick up things like "notes.pdf.bak".
    if not lowerName.endswith('.pdf'):
        continue
    # The context manager closes each input handle instead of leaking one
    # open file per converted document.
    with open(os.path.join(pdfDir, fileName), 'rb') as pdfFile:
        txt = readPDF(pdfFile)
    # Strip only the final ".pdf" so a name such as "a.pdf.v2.pdf" keeps its
    # full stem; the output name stays lower-cased as before.
    saveTxt(txt, lowerName[:-len('.pdf')])