From 29681c4f79c61440d0447dfb43e6a625b53f8e74 Mon Sep 17 00:00:00 2001
From: myhloli
Date: Thu, 9 Jan 2025 23:39:04 +0800
Subject: [PATCH] fix(language): enhance language detection and text processing

- Improve language detection by removing newline characters from the input text
- Add error handling and fallback mechanism to deal with text containing control characters
---
 magic_pdf/libs/language.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/magic_pdf/libs/language.py b/magic_pdf/libs/language.py
index 396c9008..76e2eac9 100644
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
@@ -16,11 +16,14 @@ def detect_lang(text: str) -> str:
 
     if len(text) == 0:
         return ""
+
+    text = text.replace("\n", "")
     try:
         lang_upper = detect_language(text)
     except:
         html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
         lang_upper = detect_language(html_no_ctrl_chars)
+
     try:
         lang = lang_upper.lower()
     except: