-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapi_parser.py
405 lines (370 loc) · 18.6 KB
/
api_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
# sudo nohup python3 -u api_parser.py >> log.txt 2>&1 &
import argparse,pdb
import json
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' #设置工作的显卡位置
import shutil
from typing import List, Optional
import pydantic
import uvicorn
from fastapi import Body, FastAPI, File, Form, Query, UploadFile, WebSocket
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing_extensions import Annotated
from starlette.responses import RedirectResponse
from langchain.document_loaders import UnstructuredFileLoader, TextLoader
from utils.api_utils import BaseResponse, SummaryResponse
import datetime
from time import ctime
from loguru import logger
from loader import UnstructuredPaddleImageLoader, UnstructuredPaddlePDFLoader,UnstructuredWordLoader,UnstructuredAudioLoader
from langchain.text_splitter import CharacterTextSplitter
from io import BytesIO
import requests
from OpenSSL import crypto, SSL
OPEN_CROSS_DOMAIN=False
ROBOT_FILE_SUMMARY = "./sourcefiles"
# 文本分句长度
SENTENCE_SIZE = 100
# 匹配后单段上下文长度
CHUNK_SIZE = 250
def generate_certificate(
organization="Haier",
common_name="113.105.90.161:8020",
country="NL",
duration=(365 * 24 * 60 * 60),
keyfilename="key.pem",
certfilename="cert.pem"):
k = crypto.PKey()
k.generate_key(crypto.TYPE_RSA, 4096)
cert = crypto.X509()
cert.get_subject().C = country
cert.get_subject().O = organization
cert.get_subject().CN = common_name
cert.gmtime_adj_notBefore(0)
cert.gmtime_adj_notAfter(duration)
cert.set_issuer(cert.get_subject())
cert.set_pubkey(k)
cert.sign(k, 'sha512')
with open(keyfilename, "wt") as keyfile:
keyfile.write(crypto.dump_privatekey(crypto.FILETYPE_PEM, k).decode("utf-8"))
with open(certfilename, "wt") as certfile:
certfile.write(crypto.dump_certificate(crypto.FILETYPE_PEM, cert).decode("utf-8"))
class ChineseTextSplitter(CharacterTextSplitter):
def __init__(self, pdf: bool = False, sentence_size: int = SENTENCE_SIZE, **kwargs):
super().__init__(**kwargs)
self.pdf = pdf
self.sentence_size = sentence_size
def split_text(self, text: str) -> List[str]:
return [text]
def load_file(filepath, sentence_size=SENTENCE_SIZE,file_id="",user_id="",page_num=1,pages=False,enhanced=True):
textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
if filepath.lower().endswith(".md"):
loader = UnstructuredFileLoader(filepath)
docs = loader.load()
elif filepath.lower().endswith(".txt"):
loader = TextLoader(filepath, autodetect_encoding=True)
#loader = TextLoader(filepath, encoding="utf-8", autodetect_encoding=True)
docs = loader.load_and_split(textsplitter)
elif filepath.lower().endswith(".pdf"):
loader = UnstructuredPaddlePDFLoader(filepath,mode="elements",pages=pages,enhanced=enhanced)
# textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
docs = loader.load_and_split(textsplitter)
elif filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png") or filepath.lower().endswith(".jpeg"):
loader = UnstructuredPaddleImageLoader(filepath, mode="elements")
# textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(text_splitter=textsplitter)
elif filepath.lower().endswith(".mp3") or filepath.lower().endswith(".wav") :
loader = UnstructuredAudioLoader(filepath, mode="elements")
# textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(text_splitter=textsplitter)
elif filepath.lower().endswith(".docx") or filepath.lower().endswith(".doc"):
loader = UnstructuredWordLoader(filepath,mode="elements",pages=pages)
docs = loader.load_and_split(text_splitter=textsplitter)
else:
loader = UnstructuredFileLoader(filepath, mode="elements")
# textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
docs = loader.load_and_split(text_splitter=textsplitter)
return docs
async def unified_parser_columns( #async
file: UploadFile = File(description="A single binary file"),
file_id: str = Body(..., description="File ID"),
user_id: str = Body(..., description="User ID"),
file_name:str = Body(..., description="file name"),
):
"""
统一解析器,解析成字符串,文档类支持pdf,word,图片类支持jpg、jpeg或png格式,音频类支持mp3、wav等格式,本接口对pdf 分栏效果要好些
"""
logger.info('file_id:{0},user_id:{1},file_name:{2},file.filename:{3},time is {4}'.format(file_id,user_id,file_name,file.filename,ctime()))
if not file_name: #为空
file_name=file.filename
path_temp=(file_id+'_'+user_id+"_"+file_name).replace("/","").replace("\\","")
file_path = os.path.join(ROBOT_FILE_SUMMARY, path_temp)
if not os.path.exists(file_path): #当前目录不存在
try:
file_content = await file.read() # 读取上传文件的内容,await
if file_content == "":
logger.error("上传文件为空:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
with open(file_path, "wb") as f:
f.write(file_content)
except Exception as e:
logger.error("上传文件失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
if not os.path.exists(file_path):
logger.error("文件保存失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
#判断中间文件是否存在
if file_name.endswith('.txt'): #txt 没有中间文件
file_path_text = file_path
else:
file_path_text = os.path.join(os.path.join(ROBOT_FILE_SUMMARY,'tmp_files'), file_id+'_'+user_id+"_"+file_name+'.txt')
if os.path.exists(file_path_text) :
pass
else:#不存在中间文件
print('in unified_parser,load_file again')
# 解析文件,生成中间文件
time1 = datetime.datetime.now()
try:
docs = load_file(file_path, sentence_size=SENTENCE_SIZE,file_id=file_id, user_id=user_id,enhanced=True)
logger.info("{} 已成功加载", file)
except Exception as e:
logger.error("解析文件失败:{}", e)
return SummaryResponse(code=500, msg="解析文件失败", data="")
time2 = datetime.datetime.now()
logger.info("解析+切分耗时:{}秒", (time2 - time1).seconds)
#判断中间文件是否存在
if os.path.exists(file_path_text) :
with open(file_path_text,'r',encoding='utf-8') as f:
data=f.read()
return SummaryResponse(code=200, msg="请求成功", data=data)
else:#不存在中间文件
logger.error("中间文件不存在:{0}".format(file_path_text))
return SummaryResponse(code=501, msg="解析文件失败",data="")
async def unified_parser( #async
file: UploadFile = File(description="A single binary file"),
file_id: str = Body(..., description="File ID"),
user_id: str = Body(..., description="User ID"),
file_name:str = Body(..., description="file name"),
):
"""
统一解析器,解析成字符串,文档类支持pdf,word,图片类支持jpg、jpeg或png格式,音频类支持mp3、wav等格式
"""
#pdb.set_trace()
logger.info('file_id:{0},user_id:{1},file_name:{2},file.filename:{3},time is {4}'.format(file_id,user_id,file_name,file.filename,ctime()))
if not file_name: #为空
file_name=file.filename
path_temp=(file_id+'_'+user_id+"_"+file_name).replace("/","").replace("\\","")
file_path = os.path.join(ROBOT_FILE_SUMMARY, path_temp)
if not os.path.exists(file_path): #当前目录不存在
try:
file_content = await file.read() # 读取上传文件的内容,await
if file_content == "":
logger.error("上传文件为空:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
with open(file_path, "wb") as f:
f.write(file_content)
except Exception as e:
logger.error("上传文件失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
if not os.path.exists(file_path):
logger.error("文件保存失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
#判断中间文件是否存在
if file_name.endswith('.txt'): #txt 没有中间文件
file_path_text = file_path
else:
file_path_text = os.path.join(os.path.join(ROBOT_FILE_SUMMARY,'tmp_files'), file_id+'_'+user_id+"_"+file_name+'.txt')
if os.path.exists(file_path_text) :
pass
else:#不存在中间文件
print('in unified_parser,load_file again')
# 解析文件,生成中间文件
time1 = datetime.datetime.now()
try:
docs = load_file(file_path, sentence_size=SENTENCE_SIZE,file_id=file_id, user_id=user_id)
logger.info("{} 已成功加载", file)
except Exception as e:
logger.error("解析文件失败:{}", e)
return SummaryResponse(code=500, msg="解析文件失败", data="")
time2 = datetime.datetime.now()
logger.info("解析+切分耗时:{}秒", (time2 - time1).seconds)
#判断中间文件是否存在
if os.path.exists(file_path_text) :
with open(file_path_text,'r',encoding='utf-8') as f:
data=f.read()
return SummaryResponse(code=200, msg="请求成功", data=data)
else:#不存在中间文件
logger.error("中间文件不存在:{0}".format(file_path_text))
return SummaryResponse(code=501, msg="解析文件失败",data="")
async def unified_parser_pages( #async
file: UploadFile = File(description="A single binary file"),
file_id: str = Body(..., description="File ID"),
user_id: str = Body(..., description="User ID"),
file_name:str = Body(..., description="file name"),
):
"""
统一解析器,解析成字符串,文档类支持pdf,word,图片类支持jpg、jpeg或png格式,音频类支持mp3、wav等格式,本接口针对pdf解析增加了带页码的功能,返回的字符串是一个json数据,里面包含页码信息,
"""
#pdb.set_trace()
logger.info('file_id:{0},user_id:{1},file_name:{2},file.filename:{3},time is {4}'.format(file_id,user_id,file_name,file.filename,ctime()))
if not file_name: #为空
file_name=file.filename
path_temp=(file_id+'_'+user_id+"_"+file_name).replace("/","").replace("\\","")
file_path = os.path.join(ROBOT_FILE_SUMMARY, path_temp)
if not os.path.exists(file_path): #当前目录不存在
try:
file_content = await file.read() # 读取上传文件的内容,await
if file_content == "":
logger.error("上传文件为空:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
with open(file_path, "wb") as f:
f.write(file_content)
except Exception as e:
logger.error("上传文件失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
if not os.path.exists(file_path):
logger.error("文件保存失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
#判断中间文件是否存在
if file_name.endswith('.txt'): #txt 没有中间文件
file_path_text = file_path
else:
file_path_text = os.path.join(os.path.join(ROBOT_FILE_SUMMARY,'tmp_files'), file_id+'_'+user_id+"_"+file_name+'.txt')
if os.path.exists(file_path_text):
pass
else:#不存在中间文件
print('in unified_parser,load_file again')
# 解析文件,生成中间文件
time1 = datetime.datetime.now()
try:
docs = load_file(file_path, sentence_size=SENTENCE_SIZE,file_id=file_id, user_id=user_id,pages=True)
logger.info("{} 已成功加载", file)
except Exception as e:
logger.error("解析文件失败:{}", e)
return SummaryResponse(code=500, msg="解析文件失败", data="")
time2 = datetime.datetime.now()
logger.info("解析+切分耗时:{}秒", (time2 - time1).seconds)
#判断中间文件是否存在
if os.path.exists(file_path_text):
with open(file_path_text,'r',encoding='utf-8') as f:
data=f.read()
return SummaryResponse(code=200, msg="请求成功", data=data)
else:#不存在中间文件
logger.error("中间文件不存在:{0}".format(file_path_text))
return SummaryResponse(code=501, msg="解析文件失败",data="")
def unified_parser_url( #async
url: str = Body(..., description="File ID"),
file_id: str = Body(..., description="File ID"),
user_id: str = Body(..., description="User ID"),
file_name:str = Body(..., description="file name"),
):
"""
统一解析器,解析成字符串,文档类支持pdf,word,图片类支持jpg、jpeg或png格式,音频类支持mp3、wav等格式,本接口针对pdf解析增加了带页码的功能,返回的字符串是一个json数据,里面包含页码信息,源文件是url 形式
"""
#pdb.set_trace()
logger.info('url:{0},file_id:{1},user_id:{2},file_name:{3},time is {4}'.format(url,file_id,user_id,file_name,ctime()))
try:
response = requests.get(url,timeout=120)
except Exception as e:
logger.error("url error:{0}".format(e))
return SummaryResponse( code=505,msg="解析URL出错",data="")
if response.status_code!=200:
logger.error("url error,result status_code!=200,result:{0}".format(response))
return SummaryResponse(code=506,msg="解析URL失败",data="")
path_temp=(file_id+'_'+user_id+"_"+file_name).replace("/","").replace("\\","")
file_path = os.path.join(ROBOT_FILE_SUMMARY, path_temp)
if not os.path.exists(file_path): #当前目录不存在
try:
file_content = response.content # 读取上传文件的内容,await
if file_content == "":
logger.error("上传文件为空:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
with open(file_path, "wb") as f:
f.write(file_content)
except Exception as e:
logger.error("上传文件失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
if not os.path.exists(file_path):
logger.error("文件保存失败:{}", e)
return SummaryResponse(code=500, msg="上传文件失败", data="")
#判断中间文件是否存在
if file_name.endswith('.txt'): #txt 没有中间文件
file_path_text = file_path
else:
file_path_text = os.path.join(os.path.join(ROBOT_FILE_SUMMARY,'tmp_files'), file_id+'_'+user_id+"_"+file_name+'.txt')
if os.path.exists(file_path_text):
pass
else:#不存在中间文件
print('in unified_parser,load_file again')
# 解析文件,生成中间文件
time1 = datetime.datetime.now()
try:
docs = load_file(file_path, sentence_size=SENTENCE_SIZE,file_id=file_id, user_id=user_id,pages=True)
logger.info("{} 已成功加载", url)
except Exception as e:
logger.error("解析文件失败:{}", e)
return SummaryResponse(code=500, msg="解析文件失败", data="")
time2 = datetime.datetime.now()
logger.info("解析+切分耗时:{}秒", (time2 - time1).seconds)
#判断中间文件是否存在
if os.path.exists(file_path_text):
with open(file_path_text,'r',encoding='utf-8') as f:
data=f.read()
return SummaryResponse(code=200, msg="请求成功", data=data)
else:#不存在中间文件
logger.error("中间文件不存在:{0}".format(file_path_text))
return SummaryResponse(code=501, msg="解析文件失败",data="")
async def document():
return RedirectResponse(url="/docs")
def api_start(host, port,ssl_keyfile, ssl_certfile):
global app
app = FastAPI()
# Add CORS middleware to allow all origins
# 在config.py中设置OPEN_DOMAIN=True,允许跨域
# set OPEN_DOMAIN=True in config.py to allow cross-domain
if OPEN_CROSS_DOMAIN:
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app.get("/",
response_model=BaseResponse,
summary="swagger 文档")(document)
#行研机器人解析pdf接口
app.post("/local_doc_qa/unified_parser_columns",
response_model=SummaryResponse,
tags=["统一解析器"],
summary="文档解析")(unified_parser_columns)
app.post("/local_doc_qa/unified_parser",
response_model=SummaryResponse,
tags=["统一解析器"],
summary="文档解析")(unified_parser)
# 带页码
app.post("/local_doc_qa/unified_parser_pages",
response_model=SummaryResponse,
tags=["统一解析器"],
summary="文档解析")(unified_parser_pages)
app.post("/local_doc_qa/unified_parser_url",
response_model=SummaryResponse,
tags=["统一解析器"],
summary="文档解析")(unified_parser_url)
uvicorn.run(app, host=host, port=port,ssl_keyfile=ssl_keyfile, ssl_certfile=ssl_certfile)
if __name__ == "__main__":
generate_certificate()
if not os.path.exists(ROBOT_FILE_SUMMARY):
os.mkdir(ROBOT_FILE_SUMMARY)
file_path='23年地产行业动态-1月.docx'
__spec__ = "ModuleSpec(name='builtins', loader=<class '_frozen_importlib.BuiltinImporter'>)"
parser = argparse.ArgumentParser(prog='a unified parser',
description='a unified parser')
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int, default=15282)
# 初始化消息
args = parser.parse_args()
args_dict = vars(args)
# api_start(args.host, args.port,ssl_keyfile="./key.pem", ssl_certfile="./cert.pem") #使用ssl 进行安全验证
api_start(args.host, args.port)