diff --git a/README_zh-CN.md b/README_zh-CN.md
index e232784c..76cc383b 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -429,6 +429,7 @@ TODO
# Acknowledgments
- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
+- [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO)
- [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)
- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
diff --git a/docs/download_models.py b/docs/download_models.py
deleted file mode 100644
index 6c79db19..00000000
--- a/docs/download_models.py
+++ /dev/null
@@ -1,65 +0,0 @@
-
-import json
-import os
-
-import requests
-from modelscope import snapshot_download
-
-
-def download_json(url):
- # 下载JSON文件
- response = requests.get(url)
- response.raise_for_status() # 检查请求是否成功
- return response.json()
-
-
-def download_and_modify_json(url, local_filename, modifications):
- if os.path.exists(local_filename):
- data = json.load(open(local_filename))
- config_version = data.get('config_version', '0.0.0')
- if config_version < '1.0.0':
- data = download_json(url)
- else:
- data = download_json(url)
-
-
- # 修改内容
- for key, value in modifications.items():
- data[key] = value
-
- # 保存修改后的内容
- with open(local_filename, 'w', encoding='utf-8') as f:
- json.dump(data, f, ensure_ascii=False, indent=4)
-
-
-if __name__ == '__main__':
-
- mineru_patterns = [
- "models/Layout/LayoutLMv3/*",
- "models/Layout/YOLO/*",
- "models/MFD/YOLO/*",
- "models/MFR/unimernet_small/*",
- "models/TabRec/TableMaster/*",
- "models/TabRec/StructEqTable/*",
- ]
- model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
- layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
- model_dir = model_dir + '/models'
- print(f'model_dir is: {model_dir}')
- print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
-
- json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
- config_file_name = 'magic-pdf.json'
- home_dir = os.path.expanduser('~')
-
- config_file = os.path.join(home_dir, config_file_name)
-
- json_mods = {
- 'models-dir': model_dir,
- 'layoutreader-model-dir': layoutreader_model_dir,
- }
-
- download_and_modify_json(json_url, config_file, json_mods)
-
- print(f'The configuration file has been configured successfully, the path is: {config_file}')
-
diff --git a/docs/download_models_hf.py b/docs/download_models_hf.py
deleted file mode 100644
index 9dfda1e5..00000000
--- a/docs/download_models_hf.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import json
-import os
-
-import requests
-from huggingface_hub import snapshot_download
-
-
-def download_json(url):
- # 下载JSON文件
- response = requests.get(url)
- response.raise_for_status() # 检查请求是否成功
- return response.json()
-
-
-def download_and_modify_json(url, local_filename, modifications):
- if os.path.exists(local_filename):
- data = json.load(open(local_filename))
- config_version = data.get('config_version', '0.0.0')
- if config_version < '1.0.0':
- data = download_json(url)
- else:
- data = download_json(url)
-
-
- # 修改内容
- for key, value in modifications.items():
- data[key] = value
-
- # 保存修改后的内容
- with open(local_filename, 'w', encoding='utf-8') as f:
- json.dump(data, f, ensure_ascii=False, indent=4)
-
-
-if __name__ == '__main__':
-
- mineru_patterns = [
- "models/Layout/LayoutLMv3/*",
- "models/Layout/YOLO/*",
- "models/MFD/YOLO/*",
- "models/MFR/unimernet_small/*",
- "models/TabRec/TableMaster/*",
- "models/TabRec/StructEqTable/*",
- ]
- model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
-
- layoutreader_pattern = [
- "*.json",
- "*.safetensors",
- ]
- layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
-
- model_dir = model_dir + '/models'
- print(f'model_dir is: {model_dir}')
- print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
-
- json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
- config_file_name = 'magic-pdf.json'
- home_dir = os.path.expanduser('~')
-
- config_file = os.path.join(home_dir, config_file_name)
-
- json_mods = {
- 'models-dir': model_dir,
- 'layoutreader-model-dir': layoutreader_model_dir,
- }
-
- download_and_modify_json(json_url, config_file, json_mods)
-
- print(f'The configuration file has been configured successfully, the path is: {config_file}')
-
diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md
index 93b4c8b4..e0abed39 100644
--- a/docs/how_to_download_models_en.md
+++ b/docs/how_to_download_models_en.md
@@ -8,7 +8,7 @@ Model downloads are divided into initial downloads and updates to the model dire
Use a Python Script to Download Model Files from Hugging Face
```bash
pip install huggingface_hub
-wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+wget https://github.com/opendatalab/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
```
The Python script will automatically download the model files and configure the model directory in the configuration file.
diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md
index c1f4e111..e8e2e1a0 100644
--- a/docs/how_to_download_models_zh_cn.md
+++ b/docs/how_to_download_models_zh_cn.md
@@ -8,7 +8,7 @@
使用python脚本 从Hugging Face下载模型文件
```bash
pip install huggingface_hub
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models_hf.py -O download_models_hf.py
+wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
```
python脚本会自动下载模型文件并配置好配置文件中的模型目录
@@ -19,7 +19,7 @@ python download_models_hf.py

```bash
pip install modelscope
-wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py -O download_models.py
+wget https://gitee.com/myhloli/MinerU/raw/master/scripts/download_models.py -O download_models.py
python download_models.py
```
python脚本会自动下载模型文件并配置好配置文件中的模型目录
diff --git a/scripts/download_models.py b/scripts/download_models.py
index ed1ee5c3..23e07608 100644
--- a/scripts/download_models.py
+++ b/scripts/download_models.py
@@ -45,7 +45,7 @@ def download_and_modify_json(url, local_filename, modifications):
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

- json_url = 'https://gitee.com/myhloli/MinerU/raw/dev/magic-pdf.template.json'
+ json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
diff --git a/scripts/download_models_hf.py b/scripts/download_models_hf.py
index 5e6b8dce..e2af5a09 100644
--- a/scripts/download_models_hf.py
+++ b/scripts/download_models_hf.py
@@ -52,7 +52,7 @@ def download_and_modify_json(url, local_filename, modifications):
print(f'model_dir is: {model_dir}')
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

- json_url = 'https://github.com/opendatalab/MinerU/raw/dev/magic-pdf.template.json'
+ json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
config_file_name = 'magic-pdf.json'
home_dir = os.path.expanduser('~')
config_file = os.path.join(home_dir, config_file_name)
diff --git a/signatures/version1/cla.json b/signatures/version1/cla.json
index e3412545..c7009647 100644
--- a/signatures/version1/cla.json
+++ b/signatures/version1/cla.json
@@ -71,6 +71,30 @@
"created_at": "2024-10-26T17:39:26Z",
"repoId": 765083837,
"pullRequestNo": 793
+ },
+ {
+ "name": "hyastar",
+ "id": 117415976,
+ "comment_id": 2466539016,
+ "created_at": "2024-11-10T01:32:42Z",
+ "repoId": 765083837,
+ "pullRequestNo": 916
+ },
+ {
+ "name": "kimi360",
+ "id": 3158007,
+ "comment_id": 2472266659,
+ "created_at": "2024-11-13T02:57:34Z",
+ "repoId": 765083837,
+ "pullRequestNo": 938
+ },
+ {
+ "name": "ProseGuys",
+ "id": 45124798,
+ "comment_id": 2472990455,
+ "created_at": "2024-11-13T09:37:42Z",
+ "repoId": 765083837,
+ "pullRequestNo": 945
}
]
}
\ No newline at end of file