diff --git a/.gitignore b/.gitignore index be92906a..e00d71a1 100644 --- a/.gitignore +++ b/.gitignore @@ -50,4 +50,5 @@ debug_utils/ _build/ -output/ \ No newline at end of file +output/ +/magic_pdf.egg-info/ diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index a9059c96..ae804825 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -135,9 +135,10 @@ def merge_para_with_text(para_block): block_lang = detect_lang(block_text) para_text = '' + last_line_content = '' for i, line in enumerate(para_block['lines']): - if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False): + if i >= 1 and (line.get(ListLineTag.IS_LIST_START_LINE, False) or len(last_line_content) < 20): para_text += ' \n' for j, span in enumerate(line['spans']): @@ -152,6 +153,7 @@ def merge_para_with_text(para_block): content = f"\n$$\n{span['content']}\n$$\n" content = content.strip() + last_line_content = re.sub(r'\s+', '', content) if content: langs = ['zh', 'ja', 'ko'] diff --git a/requirements.txt b/requirements.txt index 86e03dc0..9e7343a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ PyMuPDF>=1.24.9 scikit-learn>=1.0.2 torch>=2.2.2 transformers -pdfminer.six==20231228 +pdfminer.six # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator. diff --git a/scripts/download_models.py b/scripts/download_models.py index 2a8153ed..4c4d7ed7 100644 --- a/scripts/download_models.py +++ b/scripts/download_models.py @@ -35,9 +35,6 @@ def download_and_modify_json(url, local_filename, modifications): "models/Layout/LayoutLMv3/*", "models/Layout/YOLO/*", "models/MFD/YOLO/*", - "models/MFR/unimernet_small/*", - "models/TabRec/TableMaster/*", - "models/TabRec/StructEqTable/*", ] model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')