pdf转图片

2021-06-05 20:58:21 作者：互联网

最近，一直在研究PDF转图片，了解到流行的大约有4种方案，三叔实验了其中3种，现和大家分享。

背景

PDF一般有两种格式：一种是扫描版，鼠标点不进去；一种是可编辑版，鼠标能够点进去并复制里面的文字。我手头上有2000+个PDF文件，其中大约700个是扫描版，1300+个是可编辑版。为了后续工作，我需要先将它们统一转为图片，然后再通过OCR（光学文字识别）提取其中的文字。

设计思路

访问PDF文件所在路径，找出所有PDF文件名形成一个PDF文件名列表；
对PDF文件名列表循环，调用pdf2pic函数，将PDF分割成一张一张图片，存入临时文件夹下，并记录图片数目；
利用PIL模块将同一个PDF产生的图片拼接起来形成一张长图；

fitz方案

# -*- coding: utf-8 -*-
"""
Project_name:pdf2pic
Description:
Created on Tue Dec  8 08:59:21 2020
@author: 帅帅de三叔
"""
import os
import os.path
import fitz
from PIL import Image

pdfpath = r"D:\项目\pdf提取信息\pdf转图片" # directory holding the source PDF files
temp_imagepath = r"D:\项目\pdf提取信息\pdf转图片\临时图片" # scratch folder for the per-page images
imagepath = r"D:\项目\pdf提取信息\pdf转图片\转化后的图片" # destination folder for the merged long images

def mergePic(m, temp_imagepath):
    """Stitch the first ``m`` page images in ``temp_imagepath`` into one tall PNG.

    The merged picture is written to ``imagepath`` as ``<pdf stem>_fitz.png``;
    ``imagepath`` and ``pdfname`` are read from module scope.

    Args:
        m: Number of page images belonging to the current PDF.
        temp_imagepath: Folder holding the per-page ``.png`` files.

    Returns:
        The file name of the first page image, or ``None`` when there is
        nothing to merge.
    """
    # A missing folder simply means "no pages", as it did with os.walk.
    if os.path.isdir(temp_imagepath):
        candidates = os.listdir(temp_imagepath)
    else:
        candidates = []

    def page_order(name):
        # Numeric stems sort by value so page 10 never lands before page 2;
        # anything else falls back to plain name order after the numbered pages.
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    # Directory listings come back in arbitrary order, so sort explicitly.
    # Only files directly inside the folder are considered, because every
    # image is later opened as temp_imagepath/<name>.
    img_list = sorted(
        (name for name in candidates
         if name.endswith(".png")
         and os.path.isfile(os.path.join(temp_imagepath, name))),
        key=page_order,
    )
    print(img_list[0:m])

    pages = img_list[0:m]
    if not pages:
        return None

    img_name = pages[0]
    color_mod = 'RGBA' if img_name.endswith('.png') else 'RGB'  # JPEG cannot store RGBA

    # First pass: read only the sizes so the canvas fits pages of differing
    # dimensions instead of assuming every page matches the first one.
    sizes = []
    for name in pages:
        with Image.open(os.path.join(temp_imagepath, name)) as page:
            sizes.append(page.size)
    total_width = max(width for width, _ in sizes)
    total_height = sum(height for _, height in sizes)

    target = Image.new(color_mod, (total_width, total_height))  # final stitched canvas
    top = 0
    for name, (_, height) in zip(pages, sizes):
        # Closing each page right after pasting keeps file handles from
        # piling up when many PDFs are processed in one run.
        with Image.open(os.path.join(temp_imagepath, name)) as page:
            target.paste(page, (0, top))
        top += height

    os.makedirs(imagepath, exist_ok=True)
    # No ``quality`` argument: it has no effect on PNG output.
    target.save(imagepath + os.sep + os.path.splitext(pdfname)[0] + '_fitz.png')
    return img_name

def pdf2pic():
    """Render every page of the current PDF to a numbered PNG in the temp folder.

    Reads ``pdfpath``, ``pdfname`` and ``temp_imagepath`` from module scope and
    writes ``001.png``, ``002.png``, ... into ``temp_imagepath``.

    Returns:
        The number of pages in the PDF.
    """
    os.makedirs(temp_imagepath, exist_ok=True)  # the page writes below need the folder
    # Scale factor 3.0 on both axes; the former ``preRotate(0)`` rotated by
    # zero degrees and is therefore omitted.
    zoom = fitz.Matrix(3.0, 3.0)

    pdf = fitz.open(pdfpath + os.sep + pdfname)
    try:
        pagecount = len(pdf)
        for pg in range(pagecount):
            page = pdf[pg]
            # Newer PyMuPDF releases use snake_case names; fall back to the
            # camelCase ones so older installs keep working.
            render = getattr(page, "get_pixmap", None) or page.getPixmap
            pm = render(matrix=zoom, alpha=False)
            destination = temp_imagepath + os.sep + '{:0>3d}.png'.format(pg + 1)
            write = getattr(pm, "save", None) or pm.writePNG
            write(destination)
    finally:
        # Close the document even when a page fails to render.
        pdf.close()
    return pagecount

if __name__=="__main__":
    # Only PDFs directly inside pdfpath are processed: pdf2pic() opens
    # pdfpath/<pdfname>, so names found in sub-folders could not be opened.
    # Sorting makes the processing order reproducible.
    pdfnames = sorted(
        name for name in os.listdir(pdfpath)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(pdfpath, name))
    )
    total = len(pdfnames)
    # ``pdfname`` must stay a module-level name: pdf2pic() and mergePic() read it.
    for idx, pdfname in enumerate(pdfnames, start=1):
        # Report a 1-based position and the real total instead of a fixed "(5)".
        print("正在处理第 %d/%d 张名为 %s 文件" % (idx, total, pdfname))
        pagecount = pdf2pic()
        mergePic(pagecount, temp_imagepath)

pdf2image

# -*- coding: utf-8 -*-
"""
Project_name:pdf2image
Description: 利用pdf2image库转pdf为图片
Created on Tue Dec  8 13:19:09 2020
@author: 帅帅de三叔
"""
import os
import os.path
from PIL import Image
from pdf2image import convert_from_path
pdfpath = r"D:\项目\pdf提取信息\pdf转图片" # directory holding the source PDF files
temp_imagepath = r"D:\项目\pdf提取信息\pdf转图片\临时图片" # scratch folder for the per-page images
imagepath = r"D:\项目\pdf提取信息\pdf转图片\转化后的图片" # destination folder for the merged long images


def pdf2image():
    """Split the current PDF into one PNG per page and return the page count.

    Reads ``pdfpath``, ``pdfname`` and ``temp_imagepath`` from module scope.
    Pages are rendered at 300 dpi and saved as ``001.png``, ``002.png``, ...
    inside ``temp_imagepath``.
    """
    source = pdfpath + os.sep + pdfname
    rendered_pages = convert_from_path(source, dpi=300)
    for number, rendered in enumerate(rendered_pages, start=1):
        destination = temp_imagepath + os.sep + '{:0>3d}.png'.format(number)
        rendered.save(destination, "PNG")
    return len(rendered_pages)

def mergePic(m, temp_imagepath):
    """Stitch the first ``m`` page images in ``temp_imagepath`` into one tall PNG.

    The merged picture is written to ``imagepath`` as ``<pdf stem>_images.png``;
    ``imagepath`` and ``pdfname`` are read from module scope.

    Args:
        m: Number of page images belonging to the current PDF.
        temp_imagepath: Folder holding the per-page ``.png`` files.

    Returns:
        The file name of the first page image, or ``None`` when there is
        nothing to merge.
    """
    # A missing folder simply means "no pages", as it did with os.walk.
    if os.path.isdir(temp_imagepath):
        candidates = os.listdir(temp_imagepath)
    else:
        candidates = []

    def page_order(name):
        # Numeric stems sort by value so page 10 never lands before page 2;
        # anything else falls back to plain name order after the numbered pages.
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    # Directory listings come back in arbitrary order, so sort explicitly.
    # Only files directly inside the folder are considered, because every
    # image is later opened as temp_imagepath/<name>.
    img_list = sorted(
        (name for name in candidates
         if name.endswith(".png")
         and os.path.isfile(os.path.join(temp_imagepath, name))),
        key=page_order,
    )
    print(img_list[0:m])

    pages = img_list[0:m]
    if not pages:
        return None

    img_name = pages[0]
    color_mod = 'RGBA' if img_name.endswith('.png') else 'RGB'  # JPEG cannot store RGBA

    # First pass: read only the sizes so the canvas fits pages of differing
    # dimensions instead of assuming every page matches the first one.
    sizes = []
    for name in pages:
        with Image.open(os.path.join(temp_imagepath, name)) as page:
            sizes.append(page.size)
    total_width = max(width for width, _ in sizes)
    total_height = sum(height for _, height in sizes)

    target = Image.new(color_mod, (total_width, total_height))  # final stitched canvas
    top = 0
    for name, (_, height) in zip(pages, sizes):
        # Closing each page right after pasting keeps file handles from
        # piling up when many PDFs are processed in one run.
        with Image.open(os.path.join(temp_imagepath, name)) as page:
            target.paste(page, (0, top))
        top += height

    os.makedirs(imagepath, exist_ok=True)
    # No ``quality`` argument: it has no effect on PNG output.
    target.save(imagepath + os.sep + os.path.splitext(pdfname)[0] + '_images.png')
    return img_name


# Only PDFs directly inside pdfpath are processed: pdf2image() opens
# pdfpath/<pdfname>, so names found in sub-folders could not be opened.
# Sorting makes the processing order reproducible.
pdfnames = sorted(
    name for name in os.listdir(pdfpath)
    if name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(pdfpath, name))
)
total = len(pdfnames)

# ``pdfname`` must stay a module-level name: pdf2image() and mergePic() read it.
for idx, pdfname in enumerate(pdfnames, start=1):
    # Report a 1-based position and the real total instead of a fixed "(5)".
    print("正在处理第 %d/%d 张 %s 文件" % (idx, total, pdfname))
    pagecount = pdf2image()
    mergePic(pagecount, temp_imagepath)

wand方案

# -*- coding: utf-8 -*-
"""
Project_name:pdf2imageghosts
Description: wind方法将pdf转图片
Created on Tue Dec  8 17:16:00 2020
@author: 帅帅de三叔
"""
import os
import os.path
from PIL import Image as PILImage
from wand.image import Image
pdfpath = r"D:\项目\pdf提取信息\pdf转图片" # directory holding the source PDF files
temp_imagepath = r"D:\项目\pdf提取信息\pdf转图片\临时图片" # scratch folder for the per-page images
if not os.path.exists(temp_imagepath):
    os.mkdir(temp_imagepath)
imagepath = r"D:\项目\pdf提取信息\pdf转图片\转化后的图片" # destination folder for the merged long images
# Check the output folder itself here; testing temp_imagepath a second time
# would leave imagepath uncreated and the final save would fail.
if not os.path.exists(imagepath):
    os.mkdir(imagepath)



def mergePic(m, temp_imagepath):
    """Stitch the first ``m`` page images in ``temp_imagepath`` into one tall PNG.

    The merged picture is written to ``imagepath`` as ``<pdf stem>_winds.png``;
    ``imagepath`` and ``pdfname`` are read from module scope.

    Args:
        m: Number of page images belonging to the current PDF.
        temp_imagepath: Folder holding the per-page ``.jpeg`` files.

    Returns:
        The file name of the first page image, or ``None`` when there is
        nothing to merge.
    """
    # A missing folder simply means "no pages", as it did with os.walk.
    if os.path.isdir(temp_imagepath):
        candidates = os.listdir(temp_imagepath)
    else:
        candidates = []

    def page_order(name):
        # Page files are named 0.jpeg, 1.jpeg, ... without zero padding, so
        # they must be ordered by numeric value ("10" after "9", not after "1").
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    # Directory listings come back in arbitrary order, so sort explicitly.
    # Only files directly inside the folder are considered, because every
    # image is later opened as temp_imagepath/<name>.
    img_list = sorted(
        (name for name in candidates
         if name.endswith(".jpeg")
         and os.path.isfile(os.path.join(temp_imagepath, name))),
        key=page_order,
    )
    print(img_list[0:m])

    pages = img_list[0:m]
    if not pages:
        return None

    img_name = pages[0]
    color_mod = 'RGBA' if img_name.endswith('.png') else 'RGB'  # JPEG cannot store RGBA

    # First pass: read only the sizes so the canvas fits pages of differing
    # dimensions instead of assuming every page matches the first one.
    sizes = []
    for name in pages:
        with PILImage.open(os.path.join(temp_imagepath, name)) as page:
            sizes.append(page.size)
    total_width = max(width for width, _ in sizes)
    total_height = sum(height for _, height in sizes)

    target = PILImage.new(color_mod, (total_width, total_height))  # final stitched canvas
    top = 0
    for name, (_, height) in zip(pages, sizes):
        # Closing each page right after pasting keeps file handles from
        # piling up when many PDFs are processed in one run.
        with PILImage.open(os.path.join(temp_imagepath, name)) as page:
            target.paste(page, (0, top))
        top += height

    os.makedirs(imagepath, exist_ok=True)
    # No ``quality`` argument: it has no effect on PNG output.
    target.save(imagepath + os.sep + os.path.splitext(pdfname)[0] + '_winds.png')
    return img_name

def wind_imagemagick_ghostscript(pdf_path, imgs_dir):
    """Render each page of a PDF to ``<imgs_dir>/<index>.jpeg`` using wand.

    Args:
        pdf_path: Path of the PDF file to convert.
        imgs_dir: Existing folder that receives ``0.jpeg``, ``1.jpeg``, ...

    Returns:
        The number of page images written.
    """
    written = 0
    # The context managers release each wand image as soon as it is no longer
    # needed instead of keeping every page of every PDF alive.
    with Image(filename=pdf_path, resolution=300) as image_pdf:
        with image_pdf.convert('png') as converted:
            # ``sequence`` yields one entry per PDF page.
            for index, frame in enumerate(converted.sequence):
                with Image(image=frame) as img_page:
                    blob = img_page.make_blob('jpeg')
                # os.path.join matches the os.sep-based paths mergePic uses to
                # read these files back; a literal backslash only works on Windows.
                destination = os.path.join(imgs_dir, str(index) + '.jpeg')
                with open(destination, 'wb') as ff:
                    ff.write(blob)
                written = index + 1
    return written


if __name__ == '__main__':
    # Only PDFs directly inside pdfpath are processed: each one is opened as
    # pdfpath/<pdfname>, so names found in sub-folders could not be opened.
    # Sorting makes the processing order reproducible.
    pdfnames = sorted(
        name for name in os.listdir(pdfpath)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(pdfpath, name))
    )
    total = len(pdfnames)

    # ``pdfname`` must stay a module-level name: mergePic() reads it.
    for idx, pdfname in enumerate(pdfnames, start=1):
        # Report a 1-based position and the real total instead of a fixed "(5)".
        print("正在处理第 %d/%d 张 %s 文件" % (idx, total, pdfname))
        pdf_path = pdfpath + os.sep + pdfname
        print(pdf_path)
        imgs_dir = temp_imagepath
        req_image = wind_imagemagick_ghostscript(pdf_path, imgs_dir)
        mergePic(req_image, temp_imagepath)

结论

fitz方案简单高效；pdf2image方案最后出来的图片灰度有点大；wand方案出来的图片对比度比较高，适合这种文书类的PDF，但是前期准备工作略微复杂。综合考虑，建议使用fitz方案。

参考文献
1，https://pypi.org/project/pdf2image/;https://www.cnblogs.com/justaman/p/12213353.html

2，https://blog.csdn.net/weixin_42081389/article/details/103712181?utm_medium=distribute.pc_relevant_download.none-task-blog-BlogCommendFromBaidu-6.nonecase&depth_1-utm_source=distribute.pc_relevant_download.none-task-blog-BlogCommendFromBaidu-6.nonecas
在这里插入图片描述

标签：imagepath,img,temp,pdf,os,图片
来源： https://blog.51cto.com/u_15255081/2870643