将有大量表格图片的Word/PDF识别成文本Word(Python调用阿里云读光接口)
将有大量表格图片的word转换成文本格式(依然是word格式),暂且记录下,一时间写得比较乱,有空了再好好梳理和调整。
主要是通过“阿里云读光PDF识别”(官方网址)的API实现的,这边的流程是:先把包含大量表格图片的word转换成PDF,然后将PDF拆分(因为API每次调用最多识别20页),然后请求识别,返回拆分后对应的word,再将word合并起来。
#!/usr/bin/env python
# coding=utf-8
import base64
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request
from os.path import abspath

from PyPDF2 import PdfFileReader, PdfFileWriter
from win32com import client

# Endpoint of the Aliyun "Duguang" PDF recognition API.
url_request = "https://generalpdf.market.alicloudapi.com/ocrservice/pdf"
# AppCode for the API (must be purchased with a real-name-verified personal
# account; 500 calls are free). Fill it in before running the script.
AppCode = ""
headers = {
    'Authorization': 'APPCODE ' + AppCode,
    'Content-Type': 'application/json; charset=UTF-8',
}


def split_pdf(read_file, pagelenth, out_dir=None):
    """Split a PDF into consecutive chunks of at most ``pagelenth`` pages.

    Chunks are written as ``1.pdf``, ``2.pdf``, ... in page order. Splitting
    is needed because the recognition API accepts a limited number of pages
    per call (the script uses 20).

    Args:
        read_file: Path of the PDF to split.
        pagelenth: Maximum number of pages per chunk; must be >= 1.
        out_dir: Directory that receives the chunk files. ``None`` (the
            default) writes them to the current working directory.

    Returns:
        The paths of the chunk files that were written, in page order. If an
        error interrupts the split, the error is printed and the chunks
        written so far are returned.

    Raises:
        ValueError: If ``pagelenth`` is smaller than 1 (the split could
            never advance).
    """
    if pagelenth < 1:
        raise ValueError(f'pagelenth must be >= 1, got {pagelenth!r}')

    written = []
    try:
        if out_dir is not None:
            os.makedirs(out_dir, exist_ok=True)
        # The reader pulls pages from the file lazily, so the source must
        # stay open until every chunk has been written.
        with open(read_file, 'rb') as fp_read_file:
            pdf_input = PdfFileReader(fp_read_file)
            page_count = pdf_input.getNumPages()
            print(page_count)
            chunk_starts = range(1, page_count + 1, pagelenth)
            for write_file, start_page in enumerate(chunk_starts, start=1):
                # Clamp the last chunk to the real page count.
                end_page = min(start_page + pagelenth - 1, page_count)
                pdf_file = f'{write_file}.pdf'
                if out_dir is None:
                    pdf_path = pdf_file
                else:
                    pdf_path = os.path.join(out_dir, pdf_file)
                try:
                    print(f'开始分割{start_page}页-{end_page}页,保存为{pdf_file}......')
                    pdf_output = PdfFileWriter()
                    # Page numbers are 1-based here, PyPDF2 indices 0-based.
                    for i in range(start_page - 1, end_page):
                        pdf_output.addPage(pdf_input.getPage(i))
                    with open(pdf_path, 'wb') as sub_fp:
                        pdf_output.write(sub_fp)
                    print(f'完成分割{start_page}页-{end_page}页,保存为{pdf_file}!')
                except IndexError:
                    print('分割页数超过了PDF的页数')
                else:
                    written.append(pdf_path)
    except Exception as e:
        # Best-effort, as before: report the problem instead of crashing.
        print(e)
    return written


def posturl(url, data=None):
    """POST ``data`` as JSON to ``url`` and return the response body as text.

    The module-level ``headers`` (authorization and content type) are sent
    with the request.

    Args:
        url: Address of the recognition endpoint.
        data: JSON-serialisable request payload; see the official API
            documentation for the expected fields. ``None`` sends ``{}``.

    Returns:
        The response body decoded as UTF-8 (a JSON string), or ``None`` when
        the server answers with an HTTP error status; in that case the status
        code and the error body are printed.
    """
    payload = {} if data is None else data
    params = json.dumps(payload).encode(encoding='UTF8')
    req = urllib.request.Request(url, params, headers)
    try:
        with urllib.request.urlopen(req) as r:
            return r.read().decode("utf8")
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.read().decode("utf8"))
        # Short pause after a failed call (presumably to avoid hammering the
        # service - kept from the original behaviour).
        time.sleep(1)
        return None


def docx_merge(files, final_docx):
    """Merge several Word documents into one by copy/paste through Word.

    Requires Microsoft Word; it is driven through COM automation.

    Args:
        files: Paths of the documents to merge, in the desired order.
        final_docx: Path under which the merged document is saved.

    Note:
        Merging may fail with ``com_error: (-2147417848, ...)`` ("the object
        invoked has disconnected from its clients"). Opening the Word client
        manually once and running the merge again resolves it.
    """
    word = client.gencache.EnsureDispatch('Word.Application')
    word.Visible = True
    new_document = word.Documents.Add()  # the merge target
    for fn in files:
        fn = abspath(fn)  # Word resolves relative paths on its own terms
        temp_document = word.Documents.Open(fn)
        # Select the whole partial document and copy it to the clipboard.
        word.Selection.WholeStory()
        word.Selection.Copy()
        temp_document.Close()
        # NOTE(review): the returned range is discarded; the call is kept
        # unchanged from the original - confirm whether it is needed.
        new_document.Range()
        # Paste into the new document at the current selection.
        word.Selection.Delete()
        word.Selection.Paste()
    new_document.SaveAs(final_docx)
    new_document.Close()
    word.Quit()


def main():
    """Split the source PDF, recognise every chunk and merge the results."""
    # Parameters
    path = "../Untitled Folder/"  # working folder for the intermediate chunk files
    final_docx = r'C:/Users/XXX/Desktop/result.docx'  # final merged document
    pdf_source = 'C:/Users/XXX/Desktop/XXX.pdf'  # source PDF

    # Use the list returned by split_pdf rather than os.listdir(): it is in
    # page order ('10.pdf' would sort before '2.pdf') and cannot pick up
    # stale PDFs left in the folder by an earlier run.
    filenames = split_pdf(pdf_source, pagelenth=20, out_dir=path)  # 20 pages per chunk
    print(filenames)  # names of the PDF chunks
    if not filenames:
        raise SystemExit('No PDF chunks were produced; nothing to recognise.')

    docx_list = []
    for index, pdf_path in enumerate(filenames):
        with open(pdf_path, 'rb') as f:
            # The API expects the file content as a base64 *string*.
            encodestr = base64.b64encode(f.read()).decode('utf-8')
        html = posturl(
            url_request,
            data={'fileBase64': encodestr, "table": True, 'fileType': 'word'},
        )
        if html is None:
            # Continuing would silently drop pages from the merged document.
            raise SystemExit(f'Recognition failed for {pdf_path}; aborting.')
        # The response is JSON; parse it as data instead of eval()-ing text
        # received over the network.
        result = json.loads(html)['fileBase64']  # returned Word file, base64-encoded
        tmp = os.path.join(path, f"{index}.docx")  # partial document
        with open(tmp, 'wb') as f:
            f.write(base64.b64decode(result))
        docx_list.append(tmp)
    print(docx_list)  # names of the generated docx files
    docx_merge(docx_list, final_docx)


if __name__ == "__main__":
    main()
赞 (0)
