pdf转图片
作者:互联网
最近,一直在研究PDF转图片,了解到流行的大约有4种方案,三叔实验了其中3种,现和大家分享。
背景:PDF一般有两种格式,一种是扫描版,鼠标点不进去;一种是可编辑版,鼠标能够点进去复制里面的文字。我手头上有2000+个PDF文件,其中大约700个是扫描版,1300+个是可编辑版。为了后续工作,我需要将它们统一转为图片,然后再用OCR(光学字符识别)提取其中的文字。
设计思路:
- 访问PDF文件所在路径,找出所有PDF文件名形成一个PDF文件名列表;
- 对PDF文件名列表循环,调用pdf2pic函数,将PDF分割成一张一张图片,存入临时文件夹下,并记录图片数目;
- 利用PIL模块将同一个PDF产生的图片拼接起来,形成一张长图。
# -*- coding: utf-8 -*-
"""
Project_name:pdf2pic
Description:
Created on Tue Dec 8 08:59:21 2020
@author: 帅帅de三叔
"""
import os
import os.path
import fitz
from PIL import Image
pdfpath = r"D:\项目\pdf提取信息\pdf转图片" # directory that holds the source PDF files
temp_imagepath = r"D:\项目\pdf提取信息\pdf转图片\临时图片" # directory for the temporary per-page images
imagepath = r"D:\项目\pdf提取信息\pdf转图片\转化后的图片" # directory for the final stitched images
def mergePic(m, temp_imagepath):
    """Stitch the first ``m`` page images in ``temp_imagepath`` into one tall PNG.

    The result is saved in the module-level ``imagepath`` directory as
    ``<stem of pdfname>_fitz.png``, where ``pdfname`` is the module-level name
    of the PDF currently being processed.

    Args:
        m: Number of page images to stitch (the page count of the PDF).
        temp_imagepath: Directory holding the per-page ``.png`` files.

    Returns:
        The file name of the first page image, or ``None`` when the directory
        holds no page images (nothing is written in that case).
    """

    def page_order(name):
        # Numeric stems ("001", "12") sort by value so page 10 follows page 9;
        # any other name sorts after them, alphabetically.
        stem = os.path.splitext(name)[0]
        return (0, int(stem), name) if stem.isdecimal() else (1, 0, name)

    # Only the top level is scanned: names are joined to ``temp_imagepath``
    # itself below, so files in sub-folders could not be opened anyway.
    _, _, filenames = next(os.walk(temp_imagepath), (temp_imagepath, [], []))
    # Directory listings come back in arbitrary order, so sort explicitly.
    img_list = sorted(
        (name for name in filenames if name.lower().endswith(".png")),
        key=page_order,
    )
    pages = img_list[:m]
    print(pages)
    if not pages:
        return None

    img_name = pages[0]
    # JPEG has no alpha channel, so only PNG pages get an RGBA canvas.
    color_mod = 'RGBA' if img_name.endswith('.png') else 'RGB'

    paths = [os.path.join(temp_imagepath, name) for name in pages]
    # First pass reads sizes only, so the canvas fits pages of differing size
    # instead of assuming every page matches the first one.
    sizes = []
    for path in paths:
        with Image.open(path) as page:
            sizes.append(page.size)
    total_width = max(width for width, _ in sizes)
    total_height = sum(height for _, height in sizes)

    target = Image.new(color_mod, (total_width, total_height))
    top = 0
    for path, (_, height) in zip(paths, sizes):
        # ``with`` closes each page file as soon as it has been pasted.
        with Image.open(path) as page:
            target.paste(page, (0, top))
        top += height

    out_name = os.path.splitext(pdfname)[0] + '_fitz.png'
    target.save(os.path.join(imagepath, out_name), quality=100)
    return img_name
def pdf2pic():
    """Render every page of the current PDF to a PNG in the temporary folder.

    Reads the module-level ``pdfpath`` and ``pdfname`` to locate the PDF and
    writes ``001.png``, ``002.png``, ... into the module-level
    ``temp_imagepath`` directory.

    Returns:
        The number of pages in the PDF.
    """
    # Scale factor 3 on both axes for a higher-resolution render.
    trans = fitz.Matrix(3.0, 3.0)
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(os.path.join(pdfpath, pdfname)) as pdf:
        for page_no, page in enumerate(pdf, start=1):
            # snake_case names replace the deprecated getPixmap / writePNG.
            pm = page.get_pixmap(matrix=trans, alpha=False)
            pm.save(os.path.join(temp_imagepath, '{:0>3d}.png'.format(page_no)))
        return pdf.page_count
if __name__ == "__main__":
    # Both folders must exist before page images and results are written.
    os.makedirs(temp_imagepath, exist_ok=True)
    os.makedirs(imagepath, exist_ok=True)

    # Only PDFs directly inside ``pdfpath`` are collected: each one is opened
    # as ``pdfpath + os.sep + pdfname``, which cannot reach sub-folders.
    _, _, top_level_files = next(os.walk(pdfpath), (pdfpath, [], []))
    pdfnames = sorted(
        name for name in top_level_files if name.lower().endswith(".pdf")
    )

    # ``pdfname`` is deliberately a module-level loop variable: pdf2pic() and
    # mergePic() read it as a global.
    for idx, pdfname in enumerate(pdfnames, start=1):
        print("正在处理第 %d/%d 张名为 %s 文件" % (idx, len(pdfnames), pdfname))
        pagecount = pdf2pic()
        mergePic(pagecount, temp_imagepath)
pdf2image
# -*- coding: utf-8 -*-
"""
Project_name:pdf2image
Description: 利用pdf2image库转pdf为图片
Created on Tue Dec 8 13:19:09 2020
@author: 帅帅de三叔
"""
import os
import os.path
from PIL import Image
from pdf2image import convert_from_path
pdfpath = r"D:\项目\pdf提取信息\pdf转图片" # directory that holds the source PDF files
temp_imagepath = r"D:\项目\pdf提取信息\pdf转图片\临时图片" # directory for the temporary per-page images
imagepath = r"D:\项目\pdf提取信息\pdf转图片\转化后的图片" # directory for the final stitched images
def pdf2image():
    """Convert the current PDF into one PNG per page in the temporary folder.

    The PDF is located through the module-level ``pdfpath`` and ``pdfname``;
    pages are saved as ``001.png``, ``002.png``, ... under the module-level
    ``temp_imagepath``.

    Returns:
        The number of page images produced.
    """
    rendered = convert_from_path(pdfpath + os.sep + pdfname, dpi=300)
    page_no = 1
    for page in rendered:
        destination = "%s%s%03d.png" % (temp_imagepath, os.sep, page_no)
        page.save(destination, "PNG")
        page_no += 1
    return len(rendered)
def mergePic(m, temp_imagepath):
    """Stitch the first ``m`` page images in ``temp_imagepath`` into one tall PNG.

    The result is saved in the module-level ``imagepath`` directory as
    ``<stem of pdfname>_images.png``, where ``pdfname`` is the module-level
    name of the PDF currently being processed.

    Args:
        m: Number of page images to stitch (the page count of the PDF).
        temp_imagepath: Directory holding the per-page ``.png`` files.

    Returns:
        The file name of the first page image, or ``None`` when the directory
        holds no page images (nothing is written in that case).
    """

    def page_order(name):
        # Numeric stems ("001", "12") sort by value so page 10 follows page 9;
        # any other name sorts after them, alphabetically.
        stem = os.path.splitext(name)[0]
        return (0, int(stem), name) if stem.isdecimal() else (1, 0, name)

    # Only the top level is scanned: names are joined to ``temp_imagepath``
    # itself below, so files in sub-folders could not be opened anyway.
    _, _, filenames = next(os.walk(temp_imagepath), (temp_imagepath, [], []))
    # Directory listings come back in arbitrary order, so sort explicitly.
    img_list = sorted(
        (name for name in filenames if name.lower().endswith(".png")),
        key=page_order,
    )
    pages = img_list[:m]
    print(pages)
    if not pages:
        return None

    img_name = pages[0]
    # JPEG has no alpha channel, so only PNG pages get an RGBA canvas.
    color_mod = 'RGBA' if img_name.endswith('.png') else 'RGB'

    paths = [os.path.join(temp_imagepath, name) for name in pages]
    # First pass reads sizes only, so the canvas fits pages of differing size
    # instead of assuming every page matches the first one.
    sizes = []
    for path in paths:
        with Image.open(path) as page:
            sizes.append(page.size)
    total_width = max(width for width, _ in sizes)
    total_height = sum(height for _, height in sizes)

    target = Image.new(color_mod, (total_width, total_height))
    top = 0
    for path, (_, height) in zip(paths, sizes):
        # ``with`` closes each page file as soon as it has been pasted.
        with Image.open(path) as page:
            target.paste(page, (0, top))
        top += height

    out_name = os.path.splitext(pdfname)[0] + '_images.png'
    target.save(os.path.join(imagepath, out_name), quality=100)
    return img_name
# Both folders must exist before page images and results are written.
os.makedirs(temp_imagepath, exist_ok=True)
os.makedirs(imagepath, exist_ok=True)

# Only PDFs directly inside ``pdfpath`` are collected: each one is opened as
# ``pdfpath + os.sep + pdfname``, which cannot reach sub-folders.
_, _, top_level_files = next(os.walk(pdfpath), (pdfpath, [], []))
pdfnames = sorted(
    name for name in top_level_files if name.lower().endswith(".pdf")
)

# ``pdfname`` is deliberately a module-level loop variable: pdf2image() and
# mergePic() read it as a global.
for idx, pdfname in enumerate(pdfnames, start=1):
    print("正在处理第 %d/%d 张 %s 文件" % (idx, len(pdfnames), pdfname))
    pagecount = pdf2image()
    mergePic(pagecount, temp_imagepath)
wand方案
# -*- coding: utf-8 -*-
"""
Project_name:pdf2imageghosts
Description: wind方法将pdf转图片
Created on Tue Dec 8 17:16:00 2020
@author: 帅帅de三叔
"""
import os
import os.path
from PIL import Image as PILImage
from wand.image import Image
pdfpath = r"D:\项目\pdf提取信息\pdf转图片" # directory that holds the source PDF files
temp_imagepath = r"D:\项目\pdf提取信息\pdf转图片\临时图片" # directory for the temporary per-page images
# makedirs(exist_ok=True) also creates missing parent folders and does nothing
# when the folder is already there.
os.makedirs(temp_imagepath, exist_ok=True)
imagepath = r"D:\项目\pdf提取信息\pdf转图片\转化后的图片" # directory for the final stitched images
# The output folder itself must be created here (the earlier check tested
# temp_imagepath a second time, so imagepath was never created).
os.makedirs(imagepath, exist_ok=True)
def mergePic(m, temp_imagepath):
    """Stitch the first ``m`` page images in ``temp_imagepath`` into one tall PNG.

    The result is saved in the module-level ``imagepath`` directory as
    ``<stem of pdfname>_winds.png``, where ``pdfname`` is the module-level
    name of the PDF currently being processed.

    Args:
        m: Number of page images to stitch (the page count of the PDF).
        temp_imagepath: Directory holding the per-page ``.jpeg`` files.

    Returns:
        The file name of the first page image, or ``None`` when the directory
        holds no page images (nothing is written in that case).
    """

    def page_order(name):
        # Page files are named "0.jpeg", "1.jpeg", ... without zero padding,
        # so they must sort by numeric value ("10" after "9", not after "1").
        stem = os.path.splitext(name)[0]
        return (0, int(stem), name) if stem.isdecimal() else (1, 0, name)

    # Only the top level is scanned: names are joined to ``temp_imagepath``
    # itself below, so files in sub-folders could not be opened anyway.
    _, _, filenames = next(os.walk(temp_imagepath), (temp_imagepath, [], []))
    # Directory listings come back in arbitrary order, so sort explicitly.
    img_list = sorted(
        (name for name in filenames if name.lower().endswith(".jpeg")),
        key=page_order,
    )
    pages = img_list[:m]
    print(pages)
    if not pages:
        return None

    img_name = pages[0]
    # JPEG has no alpha channel, so only PNG pages get an RGBA canvas.
    color_mod = 'RGBA' if img_name.endswith('.png') else 'RGB'

    paths = [os.path.join(temp_imagepath, name) for name in pages]
    # First pass reads sizes only, so the canvas fits pages of differing size
    # instead of assuming every page matches the first one.
    sizes = []
    for path in paths:
        with PILImage.open(path) as page:
            sizes.append(page.size)
    total_width = max(width for width, _ in sizes)
    total_height = sum(height for _, height in sizes)

    target = PILImage.new(color_mod, (total_width, total_height))
    top = 0
    for path, (_, height) in zip(paths, sizes):
        # ``with`` closes each page file as soon as it has been pasted.
        with PILImage.open(path) as page:
            target.paste(page, (0, top))
        top += height

    out_name = os.path.splitext(pdfname)[0] + '_winds.png'
    target.save(os.path.join(imagepath, out_name), quality=100)
    return img_name
def wind_imagemagick_ghostscript(pdf_path, imgs_dir):
    """Rasterise each page of a PDF with wand and save the pages as JPEG files.

    Pages are written into ``imgs_dir`` as ``0.jpeg``, ``1.jpeg``, ... in page
    order.

    Args:
        pdf_path: Path of the PDF file to convert.
        imgs_dir: Existing directory that receives the page images.

    Returns:
        The number of page images written.
    """
    page_total = 0
    # ``with`` closes each wand image as soon as it is no longer needed.
    with Image(filename=pdf_path, resolution=300) as image_pdf:
        with image_pdf.convert('png') as converted:
            # wand exposes every PDF page as one entry of ``sequence``.
            for index, frame in enumerate(converted.sequence):
                with Image(image=frame) as img_page:
                    blob = img_page.make_blob('jpeg')
                # Write each page right away instead of keeping every blob in
                # memory until the whole document has been converted.
                out_path = os.path.join(imgs_dir, str(index) + '.jpeg')
                with open(out_path, 'wb') as ff:
                    ff.write(blob)
                page_total += 1
    return page_total
if __name__ == '__main__':
    # Both folders must exist before page images and results are written.
    os.makedirs(temp_imagepath, exist_ok=True)
    os.makedirs(imagepath, exist_ok=True)

    # Only PDFs directly inside ``pdfpath`` are collected: each one is opened
    # as ``pdfpath + os.sep + pdfname``, which cannot reach sub-folders.
    _, _, top_level_files = next(os.walk(pdfpath), (pdfpath, [], []))
    pdfnames = sorted(
        name for name in top_level_files if name.lower().endswith(".pdf")
    )

    # ``pdfname`` is deliberately a module-level loop variable: mergePic()
    # reads it as a global when naming the output file.
    for idx, pdfname in enumerate(pdfnames, start=1):
        print("正在处理第 %d/%d 张 %s 文件" % (idx, len(pdfnames), pdfname))
        pdf_path = pdfpath + os.sep + pdfname
        print(pdf_path)
        imgs_dir = temp_imagepath
        req_image = wind_imagemagick_ghostscript(pdf_path, imgs_dir)
        mergePic(req_image, temp_imagepath)
结论
fitz方案简单高效;pdf2image方案最后出来的图片灰度有点大;wand方案出来的图片对比度比较高,适合文书类PDF,但前期准备工作略微复杂。综合来看,建议使用fitz方案。
参考文献
1,https://pypi.org/project/pdf2image/;https://www.cnblogs.com/justaman/p/12213353.html
2,https://blog.csdn.net/weixin_42081389/article/details/103712181?utm_medium=distribute.pc_relevant_download.none-task-blog-BlogCommendFromBaidu-6.nonecase&depth_1-utm_source=distribute.pc_relevant_download.none-task-blog-BlogCommendFromBaidu-6.nonecas
标签:imagepath,img,temp,pdf,os,图片 来源: https://blog.51cto.com/u_15255081/2870643