程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
您現在的位置: 程式師世界 >> 編程語言 >> 更多編程語言 >> Python

python提取PDF文件

編輯:Python

以下代碼仍在研究中:使用 pdfplumber 讀取 PDF 文件並提取其中的文字內容。

import os
import pdfplumber
from common.log import Log
from common.data_process import FileOperate
from common.config_reader import ConfigReader
import datetime
class BaseMethod:
    """Helpers for reading a single PDF with pdfplumber.

    The PDF location is read from the project configuration (section
    ``"file"``, key ``"Case_file_path"``).  ``open_path()`` must be called
    before ``as_name()`` / ``as_txt_file()`` because it populates
    ``dirname``, ``filename`` and ``file``.
    """

    # Longest save path (extension excluded) that as_name() will hand out.
    MAX_SAVE_PATH_LEN = 100

    # Default crop box as (x0, top, x1, bottom), the order pdfplumber's
    # Page.crop() expects.
    DEFAULT_CROP_BOX = (16.0, 248.52601999999996, 818.0, 393.97403999999995)

    def __init__(self):
        self.log = Log()
        self.file_path = ConfigReader().get_value("file", "Case_file_path")
        self.fileoperate = FileOperate()

    def open_path(self):
        """Split ``file_path`` into directory, file name and stem.

        Sets ``self.dirname``, ``self.filename`` and ``self.file`` (the file
        name without its extension).

        Returns:
            0 when both a directory part and a file name are present,
            -1 otherwise.
        """
        self.dirname, self.filename = os.path.split(self.file_path)
        self.file, _extension = os.path.splitext(self.filename)
        if not self.dirname or not self.filename:
            return -1
        return 0

    def as_name(self):
        """Build the extension-less path used to save extracted content.

        The path is ``<dirname>/<stem>-<YYYY_mm_dd_HH_MM_SS>`` and is also
        stored on ``self.savefile``.  The directory is created when missing.

        Returns:
            The save path, or ``None`` (after logging) when the path is
            longer than ``MAX_SAVE_PATH_LEN`` characters.
        """
        if not os.path.exists(self.dirname):
            # makedirs also creates missing parents, which mkdir would not.
            os.makedirs(self.dirname, exist_ok=True)
        timestr = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        self.savefile = os.path.join(self.dirname, self.file + '-' + timestr)
        if len(self.savefile) > self.MAX_SAVE_PATH_LEN:
            # Previously this case returned None without ever reaching the log.
            self.log.logMsg(2, 'Failed to get file')
            return None
        return self.savefile

    def pages(self):
        """Open the PDF and return its last page.

        Despite the plural name, a single page object is returned (this
        matches the behaviour callers already rely on).

        Raises:
            ValueError: if the PDF contains no pages.
        """
        # NOTE(review): the PDF handle is not closed here; the returned page
        # presumably still needs it for extraction - confirm before adding
        # a close().
        pages = pdfplumber.open(self.file_path).pages
        if not pages:
            raise ValueError("PDF has no pages: " + str(self.file_path))
        return pages[-1]

    def as_txt_file(self):
        """Extract the page text and append it to ``<save path>.txt``.

        Failures are logged rather than raised, so this is best-effort.
        """
        txt_file = self.as_name()
        if txt_file is None:
            # No usable save path; report it the same way as a write failure.
            self.log.logMsg(4, 'failed to write txt file')
            return
        try:
            text = self.pages().extract_text()
            self.fileoperate.writefile(txt_file + ".txt", "txt", text, "a+")
            self.log.logMsg(4, 'writing txt file succeeded')
        except Exception:
            # NOTE(review): success and failure are logged with the same
            # level (4) - confirm that is intended.
            self.log.logMsg(4, 'failed to write txt file')

    def page_words(self):
        """Return the words extracted from the page, or ``None`` if empty."""
        words = self.pages().extract_words()
        return words or None

    def page_crop(self, bbox=None):
        """Crop the page to a bounding box and print the words inside it.

        Args:
            bbox: optional ``(x0, top, x1, bottom)`` sequence; defaults to
                ``DEFAULT_CROP_BOX``.

        Returns:
            The words extracted from the cropped region (also printed).
        """
        box = self.DEFAULT_CROP_BOX if bbox is None else tuple(bbox)
        words = self.pages().crop(box).extract_words()
        print(words)
        return words

運行機制:

import os
from common.base import BaseMethod
from common.log import Log
from common.dopdf import DoPdf
class RunCase:
    """Drive a ``BaseMethod`` instance for one PDF case."""

    def __init__(self):
        self.bm = BaseMethod()
        self.log = Log()

    def exec_case(self):
        """Resolve the PDF path and, if that succeeds, run the crop step.

        Nothing is done when ``open_path()`` returns anything other than 0.
        """
        status = self.bm.open_path()
        if status != 0:
            return
        # Per the author's notes, the crop box used by page_crop() was read
        # off the extract_words() output: the word containing 'Pool' gave
        # x0 = 16.0 and 248.526..., 'Total' gave x1 = 818.0 and 'Signed'
        # gave 393.974..., arranged as (x0, top, x1, bottom).
        # Alternative steps that were tried here: as_txt_file(), page_words().
        self.bm.page_crop()
if __name__ == "__main__":
    # Script entry point: build the runner, then execute the single case.
    runner = RunCase()
    runner.exec_case()

讀取路徑下多個pdf:

from common.log import Log
import os
class OperationPdf:
    """List the PDF files that sit directly inside a directory."""

    # Directory scanned when get_pdflist() is called without an argument.
    DEFAULT_ROOT_DIR = r'E:\joinkwang\Documents\web_frameword\data'

    def __init__(self):
        self.log = Log()

    def get_pdflist(self, root_dir=None):
        """Return the names of the PDF files directly under ``root_dir``.

        Sub-directories are skipped (the scan is not recursive).  A file
        counts as a PDF when its name ends in ``.pdf``, ignoring case, so
        ``a.PDF`` is included and ``a.pdf.bak`` is not.

        Args:
            root_dir: directory to scan; defaults to ``DEFAULT_ROOT_DIR``.

        Returns:
            List of file names (not full paths), in ``os.listdir`` order.

        Raises:
            Exception: with message ``"case not exist"`` when no PDF is
                found; the same message is logged first.
        """
        if root_dir is None:
            root_dir = self.DEFAULT_ROOT_DIR
        case_list = [
            name
            for name in os.listdir(root_dir)
            if name.lower().endswith(".pdf")
            and not os.path.isdir(os.path.join(root_dir, name))
        ]
        if not case_list:
            # Log first, then raise with an explicit message instead of
            # using whatever logMsg() happens to return as the message.
            self.log.logMsg(3, "case not exist")
            raise Exception("case not exist")
        return case_list
if __name__ == "__main__":
    # Script entry point: show the PDF names found in the data directory.
    print(OperationPdf().get_pdflist())

  1. 上一篇文章:
  2. 下一篇文章:
Copyright © 程式師世界 All Rights Reserved