pdf转word| exe程序
右键以管理员身份运行,输入文件夹地址或文件地址
链接: https://pan.baidu.com/s/1SgZf9jchC7NnlGzsHiRIBA 密码: 084r
安装模块
- pdfminer3k
- python-docx
读取转换
#!/usr/bin/python
# -*- coding: utf-8 -*-
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
from docx.opc.exceptions import PackageNotFoundError
import os
import re
# Module-level document retained so that existing references to `document`
# keep resolving; createWord builds its own document for every call.
document = Document()


def createWord(wordpath, pdfpath):
    """Extract the text of one PDF and save it as a Word (.docx) file.

    Every layout object that exposes ``get_text()`` becomes one paragraph
    (style ``ListBullet``) in the output document.

    Args:
        wordpath: Destination path of the .docx file to write.
        pdfpath: Path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: If the PDF reports that its text is
            not extractable.
    """
    # A fresh document per call: with a single shared document, converting
    # several PDFs in a row would copy the text of every earlier PDF into
    # each later .docx file.
    word_doc = Document()

    # The context manager guarantees the PDF handle is closed even when
    # parsing fails part-way through.
    with open(pdfpath, 'rb') as fn:
        # Wire the parser and the PDF document object to each other.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise with an empty password (pass the real password here
        # for protected files).
        doc.initialize("")

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager + layout parameters + aggregator + interpreter:
        # the objects needed to turn each page into a layout tree.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # The aggregator result is iterated to reach the objects parsed
            # from this page.
            layout = device.get_result()
            for out in layout:
                # Only objects that carry text are of interest.
                if hasattr(out, "get_text"):
                    # Replace non-breaking spaces (&nbsp;, U+00A0) with
                    # ordinary spaces.
                    content = out.get_text().replace(u'\xa0', u' ')
                    word_doc.add_paragraph(content, style='ListBullet')

    word_doc.save(wordpath)
    print("转化成功")
def pdfToWord(path=None):
    """Convert one PDF file, or every PDF in a folder, to .docx.

    In folder mode a PDF is skipped when a .docx with the same base name is
    already present in that folder. In single-file mode the output is
    written next to the input, with the extension replaced by ``.docx``.

    Args:
        path: Folder or PDF file to convert. When omitted, the user is
            prompted for it on standard input.
    """
    if path is None:
        path = input("请输入pdf文件夹路径或者文件路径:")

    if os.path.isdir(path):
        print("读取文件夹内容")
        filename_list = os.listdir(path)
        # Set for O(1) "already converted?" checks.
        existing = set(filename_list)
        for pdfname in filename_list:
            stem, ext = os.path.splitext(pdfname)
            # Case-insensitive so that "REPORT.PDF" is picked up as well.
            if ext.lower() != ".pdf":
                continue
            wordname = stem + '.docx'
            if wordname in existing:
                print("文件夹中pdf已经存在相对应word文档")
                continue
            print("转换中..."+wordname)
            createWord(os.path.join(path, wordname),
                       os.path.join(path, pdfname))
            # Prevents a second PDF that maps to the same name (e.g. a.pdf
            # and a.PDF) from overwriting the file that was just written.
            existing.add(wordname)
    else:
        print("文件转换...")
        print(path)
        # splitext only touches the final extension. A plain
        # str.replace(".pdf", ".docx") would also rewrite ".pdf" inside
        # directory names and, when the path has no lowercase ".pdf",
        # would leave the output path equal to the input path.
        createWord(os.path.splitext(path)[0] + ".docx", path)
# Tokens that are dropped from the extracted word list; presumably the
# part-of-speech abbreviations of a vocabulary sheet -- confirm with the
# source documents.
_SKIPPED_TOKENS = frozenset(
    {"n", "ad", "a", "vt", "vi", "pron", "adj", "adv", "prep", "conj"}
)

# Default output file, appended to on every run.
_DEFAULT_WORD_LIST = "/Users/helei/Documents/英语文档/all.txt"


def _extract_words(paragraph_texts):
    """Return every run of ASCII letters found in the given paragraph texts."""
    # Joining with a newline keeps the last word of one paragraph from
    # fusing with the first word of the next one.
    return re.findall(r"[a-zA-Z]+", "\n".join(paragraph_texts))


def _append_words(words, out_path):
    """Append the words, one per line, to out_path, omitting skipped tokens."""
    with open(out_path, mode="a+", encoding="utf-8") as out_file:
        out_file.writelines(
            token + "\n" for token in words if token not in _SKIPPED_TOKENS
        )


def readWord(path=None, out_path=_DEFAULT_WORD_LIST):
    """Extract the English words of .docx file(s) and append them to a text file.

    Words are maximal runs of ASCII letters (upper or lower case); every
    other character acts as a separator. Tokens listed in
    ``_SKIPPED_TOKENS`` are not written.

    In single-file mode only the paragraph and word totals are reported;
    no line is printed per paragraph or per word.

    Args:
        path: A .docx file, or a folder whose .docx files are all
            processed. When omitted, the user is prompted for it.
        out_path: Text file the words are appended to, one word per line.
    """
    if path is None:
        path = input("请输入文件路径:")

    if os.path.isdir(path):
        print("读取文件夹内容")
        for wordname in os.listdir(path):
            if not wordname.endswith(".docx"):
                continue
            print("正在处理》〉》〉》〉"+wordname)
            # os.path.join inserts the separator that plain string
            # concatenation of folder and file name leaves out.
            docx_path = os.path.join(path, wordname)
            try:
                doc = Document(docx_path)
            except PackageNotFoundError:
                print("未找到文件"+docx_path)
                continue
            _append_words(
                _extract_words(p.text for p in doc.paragraphs), out_path
            )
        return

    doc = Document(path)
    paragraphs = doc.paragraphs
    print("总共"+str(len(paragraphs))+"行")
    words = _extract_words(p.text for p in paragraphs)
    print("总共"+str(len(words))+"单词")
    _append_words(words, out_path)
def readTxt(path="/Users/helei/Documents/英语文档/allRes.txt"):
    """Read a one-word-per-line text file and report its unique words.

    Prints the number of lines read, the sorted list of distinct lines and
    the size of that list. The file itself is not modified.

    Args:
        path: Text file to read.

    Returns:
        The distinct lines of the file (newline removed), sorted.
    """
    # The context manager closes the file even if reading fails.
    with open(path, mode="r", encoding="utf-8") as txt_file:
        lines = [line.replace("\n", "") for line in txt_file]
    print(len(lines))

    resWord = sorted(set(lines))
    print(resWord)
    print(len(resWord))
    return resWord
# Script entry point. The stages are run one at a time: enable the call for
# the stage that is needed.
if __name__ == "__main__":
    # pdfToWord()
    # readWord()
    readTxt()
非特殊说明,本博所有文章均为博主原创。