From b3d60b961f227603961ef30df53e4dc0f2cacd02 Mon Sep 17 00:00:00 2001
From: myhloli
Date: Mon, 20 Jan 2025 16:23:18 +0800
Subject: [PATCH] fix(ocr): improve ONNX model initialization and error handling

- Add key length validation for ONNX model initialization
- Move import statements to the top of the file
- Wrap model initialization in a try-except block for better error handling
- Refactor code to improve readability and maintainability
---
 .../sub_modules/ocr/paddleocr/ocr_utils.py | 59 +++++++++++--------
 1 file changed, 33 insertions(+), 26 deletions(-)

diff --git a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
index 90f74d0c..90eb84a3 100644
--- a/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
+++ b/magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py
@@ -7,6 +7,8 @@
 from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
 from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
 
+import importlib.resources
+from paddleocr import PaddleOCR
 from ppocr.utils.utility import check_and_read
 
 
@@ -327,30 +329,35 @@ def get_onnx_model(self, **kwargs):
         return self._models[key]
 
 def onnx_model_init(key):
-
-    import importlib.resources
-
-    with importlib.resources.path('rapidocr_onnxruntime.models','') as resource_path:
-        onnx_model = None
-        additional_ocr_params = {
-            "use_onnx": True,
-            "det_model_dir": f'{resource_path}/ch_PP-OCRv4_det_infer.onnx',
-            "rec_model_dir": f'{resource_path}/ch_PP-OCRv4_rec_infer.onnx',
-            "cls_model_dir": f'{resource_path}/ch_ppocr_mobile_v2.0_cls_infer.onnx',
-            "det_db_box_thresh": key[1],
-            "use_dilation": key[2],
-            "det_db_unclip_ratio": key[3],
-        }
-        # logger.info(f"additional_ocr_params: {additional_ocr_params}")
-
-        if key[0] is not None:
-            additional_ocr_params["lang"] = key[0]
-
-        from paddleocr import PaddleOCR
-        onnx_model = PaddleOCR(**additional_ocr_params)
-
-    if onnx_model is None:
-        logger.error('model init failed')
+    if len(key) < 4:
+        logger.error('Invalid key length, expected at least 4 elements')
         exit(1)
-    else:
-        return onnx_model
\ No newline at end of file
+
+    try:
+        with importlib.resources.path('rapidocr_onnxruntime.models', '') as resource_path:
+            additional_ocr_params = {
+                "use_onnx": True,
+                "det_model_dir": f'{resource_path}/ch_PP-OCRv4_det_infer.onnx',
+                "rec_model_dir": f'{resource_path}/ch_PP-OCRv4_rec_infer.onnx',
+                "cls_model_dir": f'{resource_path}/ch_ppocr_mobile_v2.0_cls_infer.onnx',
+                "det_db_box_thresh": key[1],
+                "use_dilation": key[2],
+                "det_db_unclip_ratio": key[3],
+            }
+
+            if key[0] is not None:
+                additional_ocr_params["lang"] = key[0]
+
+            # logger.info(f"additional_ocr_params: {additional_ocr_params}")
+
+            onnx_model = PaddleOCR(**additional_ocr_params)
+
+        if onnx_model is None:
+            logger.error('model init failed')
+            exit(1)
+        else:
+            return onnx_model
+
+    except Exception as e:
+        logger.exception(f'Error initializing model: {e}')
+        exit(1)
\ No newline at end of file