处理 PDF 和 Word 文档
书到用时方恨少、事非经过不知难。——陆游
PDF 文档需要第三方类库
pip install PyPDF2
使用
使用
PyPDF2 无法从文档中提取图像、图表或其他媒体,但可以提取文本,并以 Python 字符串的形式返回。加密的 PDF 不能直接读取,需要先用密码解密(见下文"处理解密 PDF"一节)。
PyPDF2 主要包含 PdfFileReader、PdfFileWriter、PdfFileMerger 和 PageObject 这几个类。
从 PDF 文件中读取内容
- PdfFileReader,getPage(0),extractText(),close()
from PyPDF2 import PdfFileReader, PdfFileWriter
# Sample paths for this example; writeFile is not referenced by the snippet that follows.
readFile = './测试.pdf'
writeFile = './写入.pdf'
def extract_information(pdf_path):
    """Print the extracted text of the first page of a PDF.

    The file is opened in binary mode and kept open while PyPDF2 reads
    from it, so every reader call happens inside the ``with`` block.

    Args:
        pdf_path: Path of the PDF file to read.
    """
    with open(pdf_path, 'rb') as stream:
        reader = PdfFileReader(stream)
        # Document metadata; print(doc_info) to inspect it.
        doc_info = reader.getDocumentInfo()
        # reader.numPages is also available for the page count.
        # Page indexes start at 0, so this is the first page.
        first_page_text = reader.getPage(0).extractText()
        print(first_page_text)
# Run the example against the sample PDF.
extract_information(readFile)
处理加密的 PDF(解密)
isEncrypted:判断 PDF 是否加密(是否设置了密码)
decrypt:传入密码对 PDF 进行解密
from PyPDF2 import PdfFileReader, PdfFileWriter
# Sample paths for this example; writeFile is not referenced by the snippet that follows.
readFile = './测试.pdf'
writeFile = './写入.pdf'
def extract_information(pdf_path):
    """Report whether a PDF is encrypted and, if so, try to decrypt it.

    Prints the reader's ``isEncrypted`` flag. For an encrypted document the
    hard-coded password is passed to ``decrypt`` and a message saying
    whether it was accepted is printed. An unencrypted document is left
    untouched.

    Args:
        pdf_path: Path of the PDF file to inspect.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        # True means the document is encrypted, False means it is not.
        print(pdf.isEncrypted)
        if not pdf.isEncrypted:
            # PyPDF2 raises when decrypt() is called on a document that has
            # no encryption dictionary, so only decrypt encrypted files.
            return
        # decrypt() returns 0 for a wrong password and a non-zero value
        # when the password is accepted.
        password_flag = pdf.decrypt('密码')
        if password_flag == 0:
            print("密码输入失败")
        else:
            print("密码输入对了")
# Run the example against the sample PDF.
extract_information(readFile)
合并 PDF
from PyPDF2 import PdfFileReader, PdfFileWriter
def merge_pdfs(paths, output):
    """Concatenate several PDF files into a single output file.

    Every page of every input is appended, in the order given, to one
    writer, which is saved to ``output`` once all inputs have been read.

    Args:
        paths: Iterable of input PDF paths, merged in iteration order.
        output: Path of the merged PDF to create (opened in ``'wb'`` mode).
    """
    merged = PdfFileWriter()

    for source_path in paths:
        reader = PdfFileReader(source_path)
        page_total = reader.getNumPages()
        for page_index in range(page_total):
            # Append this page to the shared writer.
            merged.addPage(reader.getPage(page_index))

    # Save the combined document after all inputs have been collected.
    with open(output, 'wb') as sink:
        merged.write(sink)
if __name__ == '__main__':
    # Script entry point: merge the two sample PDFs into merged.pdf.
    paths = [
        '测试.pdf',
        '测试2.pdf',
    ]
    merge_pdfs(paths=paths, output='merged.pdf')
拆分 PDF
- 拆分
from PyPDF2 import PdfFileReader, PdfFileWriter
def split(path, name_of_split):
    """Write each page of a PDF to its own single-page file.

    The page at zero-based index ``i`` is saved as
    ``f'{name_of_split}{i}.pdf'``.

    Args:
        path: Path of the PDF to split.
        name_of_split: Filename prefix for the generated single-page PDFs.
    """
    source = PdfFileReader(path)
    page_total = source.getNumPages()
    for page in range(page_total):
        # A fresh writer per page, so each output holds exactly one page.
        single_page = PdfFileWriter()
        single_page.addPage(source.getPage(page))

        target = f'{name_of_split}{page}.pdf'
        with open(target, 'wb') as sink:
            single_page.write(sink)
if __name__ != '__main__':
    # Reached when the file is imported instead of run as a script.
    # NOTE(review): the message reads "failed" although nothing has been
    # attempted at this point - confirm this is intended.
    print("失败了")
else:
    # Script entry point: split merged.pdf into one file per page.
    path = './merged.pdf'
    split(path, name_of_split='jupyter_page')