From 29681c4f79c61440d0447dfb43e6a625b53f8e74 Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Thu, 9 Jan 2025 23:39:04 +0800
Subject: [PATCH] fix(language): enhance language detection and text processing

- Improve language detection by removing newline characters from the input text
- Rely on the existing fallback (unchanged by this patch) that retries detection with control characters stripped when the first attempt raises
---
 magic_pdf/libs/language.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/magic_pdf/libs/language.py b/magic_pdf/libs/language.py
index 396c9008..76e2eac9 100644
--- a/magic_pdf/libs/language.py
+++ b/magic_pdf/libs/language.py
@@ -16,11 +16,14 @@ def detect_lang(text: str) -> str:
 
     if len(text) == 0:
         return ""
+
+    text = text.replace("\n", "")
     try:
         lang_upper = detect_language(text)
     except:
         html_no_ctrl_chars = ''.join([l for l in text if unicodedata.category(l)[0] not in ['C', ]])
         lang_upper = detect_language(html_no_ctrl_chars)
+
     try:
         lang = lang_upper.lower()
     except: