From 7340a7b509cbeeb8b9f524a3d356a4a7d364f169 Mon Sep 17 00:00:00 2001 From: qiangqiang199 <82048227+qiangqiang199@users.noreply.github.com> Date: Wed, 6 Nov 2024 18:46:52 +0800 Subject: [PATCH 01/11] =?UTF-8?q?=E6=96=B0=E5=A2=9EDocLayout-YOLO=E8=B6=85?= =?UTF-8?q?=E9=93=BE=E6=8E=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README_zh-CN.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README_zh-CN.md b/README_zh-CN.md index e232784c..76cc383b 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -429,6 +429,7 @@ TODO # Acknowledgments - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) +- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy) - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) - [PyMuPDF](https://github.com/pymupdf/PyMuPDF) From b6ce503e85de1b5b8c940c0f929b6453003c63e5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 10 Nov 2024 01:35:12 +0000 Subject: [PATCH 02/11] @hyastar has signed the CLA in opendatalab/MinerU#916 --- signatures/version1/cla.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/signatures/version1/cla.json b/signatures/version1/cla.json index e3412545..50c4754a 100644 --- a/signatures/version1/cla.json +++ b/signatures/version1/cla.json @@ -71,6 +71,14 @@ "created_at": "2024-10-26T17:39:26Z", "repoId": 765083837, "pullRequestNo": 793 + }, + { + "name": "hyastar", + "id": 117415976, + "comment_id": 2466539016, + "created_at": "2024-11-10T01:32:42Z", + "repoId": 765083837, + "pullRequestNo": 916 } ] } \ No newline at end of file From 041e3efb3a3f761386759ba14141827c250f0cf9 Mon Sep 17 00:00:00 2001 From: Xiaomeng Zhao Date: Mon, 11 Nov 2024 14:40:00 +0800 Subject: [PATCH 03/11] Update download_models.py --- scripts/download_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/download_models.py b/scripts/download_models.py index ed1ee5c3..23e07608 100644 --- a/scripts/download_models.py +++ b/scripts/download_models.py @@ -45,7 +45,7 @@ def download_and_modify_json(url, local_filename, modifications): print(f'model_dir is: {model_dir}') print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json' + json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json' config_file_name = 'magic-pdf.json' home_dir = os.path.expanduser('~') config_file = os.path.join(home_dir, config_file_name) From 8fda652ac2b548730fbcbc219ba719431e56e201 Mon Sep 17 00:00:00 2001 From: Xiaomeng Zhao Date: Mon, 11 Nov 2024 14:41:48 +0800 Subject: [PATCH 04/11] Update download_models_hf.py --- scripts/download_models_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/download_models_hf.py b/scripts/download_models_hf.py index 5e6b8dce..e2af5a09 100644 --- a/scripts/download_models_hf.py +++ b/scripts/download_models_hf.py @@ -52,7 +52,7 @@ def download_and_modify_json(url, local_filename, modifications): print(f'model_dir is: {model_dir}') print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json' + json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json' config_file_name = 'magic-pdf.json' home_dir = os.path.expanduser('~') config_file = os.path.join(home_dir, config_file_name) From 7644bb2a939857814ad1c60824b20ff82b7b6948 Mon Sep 17 00:00:00 2001 From: Xiaomeng Zhao Date: Mon, 11 Nov 2024 15:07:23 +0800 Subject: [PATCH 05/11] Update how_to_download_models_zh_cn.md --- docs/how_to_download_models_zh_cn.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md index c1f4e111..e8e2e1a0 100644 --- a/docs/how_to_download_models_zh_cn.md +++ b/docs/how_to_download_models_zh_cn.md @@ -8,7 +8,7 @@ 方法一:从 Hugging Face 下载模型

使用python脚本 从Hugging Face下载模型文件

pip install huggingface_hub
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
 python download_models_hf.py

python脚本会自动下载模型文件并配置好配置文件中的模型目录

@@ -19,7 +19,7 @@ python download_models_hf.py ```bash pip install modelscope -wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py +wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py python download_models.py ``` python脚本会自动下载模型文件并配置好配置文件中的模型目录 From 1a87c415fd75a85053acf9e93001b660b5751e2f Mon Sep 17 00:00:00 2001 From: Xiaomeng Zhao Date: Mon, 11 Nov 2024 15:07:48 +0800 Subject: [PATCH 06/11] Update how_to_download_models_en.md --- docs/how_to_download_models_en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md index 93b4c8b4..e0abed39 100644 --- a/docs/how_to_download_models_en.md +++ b/docs/how_to_download_models_en.md @@ -8,7 +8,7 @@ Model downloads are divided into initial downloads and updates to the model dire Use a Python Script to Download Model Files from Hugging Face ```bash pip install huggingface_hub -wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py +wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py python download_models_hf.py ``` The Python script will automatically download the model files and configure the model directory in the configuration file. From 07f9fd00ab0e8a686673a0911e1478e4b816770f Mon Sep 17 00:00:00 2001 From: Xiaomeng Zhao Date: Mon, 11 Nov 2024 15:08:12 +0800 Subject: [PATCH 07/11] Delete docs/download_models.py --- docs/download_models.py | 65 ----------------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 docs/download_models.py diff --git a/docs/download_models.py b/docs/download_models.py deleted file mode 100644 index 6c79db19..00000000 --- a/docs/download_models.py +++ /dev/null @@ -1,65 +0,0 @@ - -import json -import os - -import requests -from modelscope import snapshot_download - - -def download_json(url): - # 下载JSON文件 - response = requests.get(url) - response.raise_for_status() # 检查请求是否成功 - return response.json() - - -def download_and_modify_json(url, local_filename, modifications): - if os.path.exists(local_filename): - data = json.load(open(local_filename)) - config_version = data.get('config_version', '0.0.0') - if config_version < '1.0.0': - data = download_json(url) - else: - data = download_json(url) - - - # 修改内容 - for key, value in modifications.items(): - data[key] = value - - # 保存修改后的内容 - with open(local_filename, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) - - -if __name__ == '__main__': - - mineru_patterns = [ - "models/Layout/LayoutLMv3/*", - "models/Layout/YOLO/*", - "models/MFD/YOLO/*", - "models/MFR/unimernet_small/*", - "models/TabRec/TableMaster/*", - "models/TabRec/StructEqTable/*", - ] - model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) - layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader') - model_dir = model_dir + '/models' - print(f'model_dir is: {model_dir}') - print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - - json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json' - config_file_name = 'magic-pdf.json' - home_dir = os.path.expanduser('~') - - config_file = os.path.join(home_dir, config_file_name) - - json_mods = { - 'models-dir': model_dir, - 'layoutreader-model-dir': layoutreader_model_dir, - } - - download_and_modify_json(json_url, config_file, json_mods) - - print(f'The configuration file has been configured successfully, the path is: {config_file}') - From 8ca9eb32463ba9c0f9a2cb603ec36daa7caa7996 Mon Sep 17 00:00:00 2001 From: Xiaomeng Zhao Date: Mon, 11 Nov 2024 15:08:20 +0800 Subject: [PATCH 08/11] Delete docs/download_models_hf.py --- docs/download_models_hf.py | 70 -------------------------------------- 1 file changed, 70 deletions(-) delete mode 100644 docs/download_models_hf.py diff --git a/docs/download_models_hf.py b/docs/download_models_hf.py deleted file mode 100644 index 9dfda1e5..00000000 --- a/docs/download_models_hf.py +++ /dev/null @@ -1,70 +0,0 @@ -import json -import os - -import requests -from huggingface_hub import snapshot_download - - -def download_json(url): - # 下载JSON文件 - response = requests.get(url) - response.raise_for_status() # 检查请求是否成功 - return response.json() - - -def download_and_modify_json(url, local_filename, modifications): - if os.path.exists(local_filename): - data = json.load(open(local_filename)) - config_version = data.get('config_version', '0.0.0') - if config_version < '1.0.0': - data = download_json(url) - else: - data = download_json(url) - - - # 修改内容 - for key, value in modifications.items(): - data[key] = value - - # 保存修改后的内容 - with open(local_filename, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) - - -if __name__ == '__main__': - - mineru_patterns = [ - "models/Layout/LayoutLMv3/*", - "models/Layout/YOLO/*", - "models/MFD/YOLO/*", - "models/MFR/unimernet_small/*", - "models/TabRec/TableMaster/*", - "models/TabRec/StructEqTable/*", - ] - model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) - - layoutreader_pattern = [ - "*.json", - "*.safetensors", - ] - layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern) - - model_dir = model_dir + '/models' - print(f'model_dir is: {model_dir}') - print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - - json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json' - config_file_name = 'magic-pdf.json' - home_dir = os.path.expanduser('~') - - config_file = os.path.join(home_dir, config_file_name) - - json_mods = { - 'models-dir': model_dir, - 'layoutreader-model-dir': layoutreader_model_dir, - } - - download_and_modify_json(json_url, config_file, json_mods) - - print(f'The configuration file has been configured successfully, the path is: {config_file}') - From 5267347511d975aedbe18090ae05e7a0f0b6cfae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=B3=E5=A6=82=E8=80=81=E7=8B=97?= Date: Wed, 13 Nov 2024 10:40:29 +0800 Subject: [PATCH 09/11] =?UTF-8?q?=E4=BF=AE=E5=A4=8DDockerfile=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E8=B7=AF=E5=BE=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index dbbe7949..870432d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,7 +42,7 @@ RUN /bin/bash -c "wget https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.tem # Download models and update the configuration file RUN /bin/bash -c "pip3 install modelscope && \ - wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py && \ + wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py && \ python3 download_models.py && \ sed -i 's|cpu|cuda|g' /root/magic-pdf.json" From 4c946d5b2136cf183f2b9be17813e84dd7731c3e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 02:57:49 +0000 Subject: [PATCH 10/11] @kimi360 has signed the CLA in opendatalab/MinerU#938 --- signatures/version1/cla.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/signatures/version1/cla.json b/signatures/version1/cla.json index 50c4754a..79b8b633 100644 --- a/signatures/version1/cla.json +++ b/signatures/version1/cla.json @@ -79,6 +79,14 @@ "created_at": "2024-11-10T01:32:42Z", "repoId": 765083837, "pullRequestNo": 916 + }, + { + "name": "kimi360", + "id": 3158007, + "comment_id": 2472266659, + "created_at": "2024-11-13T02:57:34Z", + "repoId": 765083837, + "pullRequestNo": 938 } ] } \ No newline at end of file From d0558abb43844102ba4e7d7b56c7953531b33d67 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 Nov 2024 09:37:58 +0000 Subject: [PATCH 11/11] @ProseGuys has signed the CLA in opendatalab/MinerU#945 --- signatures/version1/cla.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/signatures/version1/cla.json b/signatures/version1/cla.json index 79b8b633..c7009647 100644 --- a/signatures/version1/cla.json +++ b/signatures/version1/cla.json @@ -87,6 +87,14 @@ "created_at": "2024-11-13T02:57:34Z", "repoId": 765083837, "pullRequestNo": 938 + }, + { + "name": "ProseGuys", + "id": 45124798, + "comment_id": 2472990455, + "created_at": "2024-11-13T09:37:42Z", + "repoId": 765083837, + "pullRequestNo": 945 } ] } \ No newline at end of file