From 2de1d0ef0529b2940222f7ea91e98ade57b5b85d Mon Sep 17 00:00:00 2001 From: myhloli Date: Wed, 13 Nov 2024 16:29:16 +0800 Subject: [PATCH] fix(ocr_mkcontent): improve handling of single-character content - Add a digit check for single-character content so that a single digit is still followed by the separating space instead of being merged into the next content --- magic_pdf/dict2md/ocr_mkcontent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 2e2ce76e..63e11d72 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -168,7 +168,7 @@ def merge_para_with_text(para_block): # 如果是前一行带有-连字符,那么末尾不应该加空格 if __is_hyphen_at_line_end(content): para_text += content[:-1] - elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']: + elif len(content) == 1 and content not in ['A', 'I', 'a', 'i'] and not content.isdigit(): para_text += content else: # 西方文本语境下 content间需要空格分隔 para_text += f"{content} "