diff --git a/magic_pdf/libs/language.py b/magic_pdf/libs/language.py index 396c9008..76e2eac9 100644 --- a/magic_pdf/libs/language.py +++ b/magic_pdf/libs/language.py @@ -16,11 +16,14 @@ def detect_lang(text: str) -> str: if len(text) == 0: return "" + + text = text.replace("\n", "") try: lang_upper = detect_language(text) except: html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]]) lang_upper = detect_language(html_no_ctrl_chars) + try: lang = lang_upper.lower() except: