python批量把pdf转为txt

Author Avatar
没饲养员的猩猩 01月 08,2021
  • 在其它设备中阅读本文章

我的版本:python 3

  • 首先安装 pdfminer(Python 3 下对应的包名是 pdfminer3k):
    pip install pdfminer3k

  • 代码部分

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
import os

# Input directory: put the PDF files to be converted here.
pdfDir = 'E:/APythonDownloadTxt/PDF/'
# Output directory: the converted .txt files are written here.
txtDir = 'E:/APythonDownloadTxt/TXT/'

#读取pdf
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: File object for the PDF, opened in binary mode. It is handed
            to ``process_pdf`` unchanged and is NOT closed here; the caller
            owns it.

    Returns:
        Everything the text converter wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The buffer is a context manager, so it is released even when parsing
    # raises part-way through a damaged PDF.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter on both the success and the failure path.
            device.close()
        # Read the text before the ``with`` block closes the buffer.
        return retstr.getvalue()

#写入txt
def saveTxt(txt,fileName):
    """Write ``txt`` to ``<txtDir>/<fileName>.txt``, overwriting any existing file.

    Args:
        txt: Text to store.
        fileName: Output file name without the ``.txt`` extension.
    """
    target = os.path.join(txtDir, fileName + '.txt')
    # Pin the encoding: the platform default (a legacy code page on many
    # Windows setups) cannot represent every character a PDF may contain
    # and would raise UnicodeEncodeError on write.
    with open(target, "w", encoding="utf-8") as f:
        f.write(txt)

#执行
# Convert every PDF found in pdfDir into a text file of the same
# (lower-cased) base name via readPDF / saveTxt.
fileList = os.listdir(pdfDir)
print('文件总计:', len(fileList))
for fileName in fileList:
    lowerName = fileName.lower()
    print(lowerName)
    # Test the extension itself; a substring search would also accept
    # names such as "report.pdf.bak".
    if not lowerName.endswith('.pdf'):
        continue
    # The context manager closes each PDF as soon as its text is extracted,
    # so handles do not pile up over a large directory.
    with open(os.path.join(pdfDir, fileName), 'rb') as pdfFile:
        txt = readPDF(pdfFile)
    # Strip only the trailing ".pdf" so a name containing ".pdf" earlier on
    # is not truncated at that point.
    saveTxt(txt, lowerName[:-len('.pdf')])