diff --git a/README_zh-CN.md b/README_zh-CN.md index e232784c..76cc383b 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -429,6 +429,7 @@ TODO # Acknowledgments - [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) +- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) - [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy) - [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) - [PyMuPDF](https://github.com/pymupdf/PyMuPDF) diff --git a/docs/download_models.py b/docs/download_models.py deleted file mode 100644 index 6c79db19..00000000 --- a/docs/download_models.py +++ /dev/null @@ -1,65 +0,0 @@ - -import json -import os - -import requests -from modelscope import snapshot_download - - -def download_json(url): - # 下载JSON文件 - response = requests.get(url) - response.raise_for_status() # 检查请求是否成功 - return response.json() - - -def download_and_modify_json(url, local_filename, modifications): - if os.path.exists(local_filename): - data = json.load(open(local_filename)) - config_version = data.get('config_version', '0.0.0') - if config_version < '1.0.0': - data = download_json(url) - else: - data = download_json(url) - - - # 修改内容 - for key, value in modifications.items(): - data[key] = value - - # 保存修改后的内容 - with open(local_filename, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) - - -if __name__ == '__main__': - - mineru_patterns = [ - "models/Layout/LayoutLMv3/*", - "models/Layout/YOLO/*", - "models/MFD/YOLO/*", - "models/MFR/unimernet_small/*", - "models/TabRec/TableMaster/*", - "models/TabRec/StructEqTable/*", - ] - model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) - layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader') - model_dir = model_dir + '/models' - print(f'model_dir is: {model_dir}') - print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - - json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json' - config_file_name = 'magic-pdf.json' - home_dir = os.path.expanduser('~') - - config_file = os.path.join(home_dir, config_file_name) - - json_mods = { - 'models-dir': model_dir, - 'layoutreader-model-dir': layoutreader_model_dir, - } - - download_and_modify_json(json_url, config_file, json_mods) - - print(f'The configuration file has been configured successfully, the path is: {config_file}') - diff --git a/docs/download_models_hf.py b/docs/download_models_hf.py deleted file mode 100644 index 9dfda1e5..00000000 --- a/docs/download_models_hf.py +++ /dev/null @@ -1,70 +0,0 @@ -import json -import os - -import requests -from huggingface_hub import snapshot_download - - -def download_json(url): - # 下载JSON文件 - response = requests.get(url) - response.raise_for_status() # 检查请求是否成功 - return response.json() - - -def download_and_modify_json(url, local_filename, modifications): - if os.path.exists(local_filename): - data = json.load(open(local_filename)) - config_version = data.get('config_version', '0.0.0') - if config_version < '1.0.0': - data = download_json(url) - else: - data = download_json(url) - - - # 修改内容 - for key, value in modifications.items(): - data[key] = value - - # 保存修改后的内容 - with open(local_filename, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=4) - - -if __name__ == '__main__': - - mineru_patterns = [ - "models/Layout/LayoutLMv3/*", - "models/Layout/YOLO/*", - "models/MFD/YOLO/*", - "models/MFR/unimernet_small/*", - "models/TabRec/TableMaster/*", - "models/TabRec/StructEqTable/*", - ] - model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns) - - layoutreader_pattern = [ - "*.json", - "*.safetensors", - ] - layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern) - - model_dir = model_dir + '/models' - print(f'model_dir is: {model_dir}') - print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - - json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json' - config_file_name = 'magic-pdf.json' - home_dir = os.path.expanduser('~') - - config_file = os.path.join(home_dir, config_file_name) - - json_mods = { - 'models-dir': model_dir, - 'layoutreader-model-dir': layoutreader_model_dir, - } - - download_and_modify_json(json_url, config_file, json_mods) - - print(f'The configuration file has been configured successfully, the path is: {config_file}') - diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md index 93b4c8b4..e0abed39 100644 --- a/docs/how_to_download_models_en.md +++ b/docs/how_to_download_models_en.md @@ -8,7 +8,7 @@ Model downloads are divided into initial downloads and updates to the model dire Use a Python Script to Download Model Files from Hugging Face ```bash pip install huggingface_hub -wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py +wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py python download_models_hf.py ``` The Python script will automatically download the model files and configure the model directory in the configuration file. diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md index c1f4e111..e8e2e1a0 100644 --- a/docs/how_to_download_models_zh_cn.md +++ b/docs/how_to_download_models_zh_cn.md @@ -8,7 +8,7 @@ 方法一:从 Hugging Face 下载模型

使用python脚本 从Hugging Face下载模型文件

pip install huggingface_hub
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
 python download_models_hf.py

python脚本会自动下载模型文件并配置好配置文件中的模型目录

@@ -19,7 +19,7 @@ python download_models_hf.py ```bash pip install modelscope -wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py +wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py python download_models.py ``` python脚本会自动下载模型文件并配置好配置文件中的模型目录 diff --git a/scripts/download_models.py b/scripts/download_models.py index ed1ee5c3..23e07608 100644 --- a/scripts/download_models.py +++ b/scripts/download_models.py @@ -45,7 +45,7 @@ def download_and_modify_json(url, local_filename, modifications): print(f'model_dir is: {model_dir}') print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json' + json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json' config_file_name = 'magic-pdf.json' home_dir = os.path.expanduser('~') config_file = os.path.join(home_dir, config_file_name) diff --git a/scripts/download_models_hf.py b/scripts/download_models_hf.py index 5e6b8dce..e2af5a09 100644 --- a/scripts/download_models_hf.py +++ b/scripts/download_models_hf.py @@ -52,7 +52,7 @@ def download_and_modify_json(url, local_filename, modifications): print(f'model_dir is: {model_dir}') print(f'layoutreader_model_dir is: {layoutreader_model_dir}') - json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json' + json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json' config_file_name = 'magic-pdf.json' home_dir = os.path.expanduser('~') config_file = os.path.join(home_dir, config_file_name) diff --git a/signatures/version1/cla.json b/signatures/version1/cla.json index e3412545..c7009647 100644 --- a/signatures/version1/cla.json +++ b/signatures/version1/cla.json @@ -71,6 +71,30 @@ "created_at": "2024-10-26T17:39:26Z", "repoId": 765083837, "pullRequestNo": 793 + }, + { + "name": "hyastar", + "id": 117415976, + "comment_id": 2466539016, + "created_at": "2024-11-10T01:32:42Z", + "repoId": 765083837, + "pullRequestNo": 916 + }, + { + "name": "kimi360", + "id": 3158007, + "comment_id": 2472266659, + "created_at": "2024-11-13T02:57:34Z", + "repoId": 765083837, + "pullRequestNo": 938 + }, + { + "name": "ProseGuys", + "id": 45124798, + "comment_id": 2472990455, + "created_at": "2024-11-13T09:37:42Z", + "repoId": 765083837, + "pullRequestNo": 945 } ] } \ No newline at end of file