From cf385779435b389ef8f8463e3c43dc620d0a0e54 Mon Sep 17 00:00:00 2001
From: myhloli
Date: Tue, 8 Oct 2024 19:26:03 +0800
Subject: [PATCH 1/2] feat(docs): automate model download and configuration

- Add scripts to download models and update configuration file
- Remove manual steps for modifying model paths
- Update documentation for both ModelScope and HuggingFace model downloads
- Improve user experience by automating the entire process
---
Review revision: download_and_modify_json now reads an existing config as
UTF-8 through a context manager, passes a timeout to requests.get and uses
dict.update; the models path is built with os.path.join. Line counts are
unchanged, but the blob ids on the index lines predate this revision.

 docs/download_models.py              | 49 +++++++++++++++++++++++++---
 docs/download_models_hf.py           | 48 +++++++++++++++++++++++++--
 docs/how_to_download_models_en.md    |  5 +--
 docs/how_to_download_models_zh_cn.md |  8 ++---
 4 files changed, 94 insertions(+), 16 deletions(-)

diff --git a/docs/download_models.py b/docs/download_models.py
index 9fbaea48..7541bdd2 100644
--- a/docs/download_models.py
+++ b/docs/download_models.py
@@ -1,5 +1,46 @@
-# use modelscope sdk download models
+import os
+import requests
+import json
 from modelscope import snapshot_download
-model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
-layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
-print(f"model dir is: {model_dir}/models")
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    """Create or update the JSON config at local_filename with modifications."""
+    if os.path.exists(local_filename):
+        # Reuse the existing config; read it as UTF-8 to match the write below.
+        with open(local_filename, encoding='utf-8') as f:
+            data = json.load(f)
+    else:
+        # Download the JSON file; the timeout keeps a stalled request from hanging.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()  # Fail on an HTTP error status
+        data = response.json()
+
+    # Apply the modifications (top-level keys are overwritten)
+    data.update(modifications)
+
+    # Save the modified content
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
+    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
+    model_dir = os.path.join(model_dir, "models")
+    print(f"model_dir is: {model_dir}")
+    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
+
+    json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
+    config_file_name = "magic-pdf.json"
+    home_dir = os.path.expanduser("~")
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f"The configuration file has been configured successfully, the path is: {config_file}")
+
diff --git a/docs/download_models_hf.py b/docs/download_models_hf.py
index 0c7079e9..8bd06901 100644
--- a/docs/download_models_hf.py
+++ b/docs/download_models_hf.py
@@ -1,4 +1,46 @@
+import os
+import requests
+import json
 from huggingface_hub import snapshot_download
-model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
-layoutreader_model_dir = snapshot_download('hantian/layoutreader')
-print(f"model dir is: {model_dir}/models")
+
+
+def download_and_modify_json(url, local_filename, modifications):
+    """Create or update the JSON config at local_filename with modifications."""
+    if os.path.exists(local_filename):
+        # Reuse the existing config; read it as UTF-8 to match the write below.
+        with open(local_filename, encoding='utf-8') as f:
+            data = json.load(f)
+    else:
+        # Download the JSON file; the timeout keeps a stalled request from hanging.
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()  # Fail on an HTTP error status
+        data = response.json()
+
+    # Apply the modifications (top-level keys are overwritten)
+    data.update(modifications)
+
+    # Save the modified content
+    with open(local_filename, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
+    layoutreader_model_dir = snapshot_download('hantian/layoutreader')
+    model_dir = os.path.join(model_dir, "models")
+    print(f"model_dir is: {model_dir}")
+    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")
+
+    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
+    config_file_name = "magic-pdf.json"
+    home_dir = os.path.expanduser("~")
+    config_file = os.path.join(home_dir, config_file_name)
+
+    json_mods = {
+        'models-dir': model_dir,
+        'layoutreader-model-dir': layoutreader_model_dir,
+    }
+
+    download_and_modify_json(json_url, config_file, json_mods)
+    print(f"The configuration file has been configured successfully, the path is: {config_file}")
+
diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md
index 87611d62..da570f1b 100644
--- a/docs/how_to_download_models_en.md
+++ b/docs/how_to_download_models_en.md
@@ -10,11 +10,8 @@ pip install huggingface_hub
 wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py
 python download_models_hf.py
 ```
-After the Python script finishes executing, it will output the directory where the models are downloaded.
+The Python script will automatically download the model files and configure the model directory in the configuration file.
 
-### 2. To modify the model path address in the configuration file
-
-Additionally, in `~/magic-pdf.json`, update the model directory path to the absolute path of the `models` directory output by the previous Python script. Otherwise, you will encounter an error indicating that the model cannot be loaded.
 
 
 # How to update models previously downloaded
diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md
index 95fe1313..89046dcc 100644
--- a/docs/how_to_download_models_zh_cn.md
+++ b/docs/how_to_download_models_zh_cn.md
@@ -22,12 +22,10 @@ pip install modelscope
 wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py
 python download_models.py
 ```
-python脚本执行完毕后,会输出模型下载目录
-
-
-## 下载完成后的操作:修改magic-pdf.json中的模型路径
-在`~/magic-pdf.json`里修改模型的目录指向上一步脚本输出的models目录的绝对路径,否则会报模型无法加载的错误。
+python脚本会自动下载模型文件并配置好配置文件中的模型目录
 
+配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
+> windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
 
 # 此前下载过模型,如何更新
 
From 6c9b23c3d39c265f639163e66746aa90c8d65ede Mon Sep 17 00:00:00 2001
From: myhloli
Date: Tue, 8 Oct 2024 19:28:11 +0800
Subject: [PATCH 2/2] feat(docs): automate model download and configuration

- Add scripts to download models and update configuration file
- Remove manual steps for modifying model paths
- Update documentation for both ModelScope and HuggingFace model downloads
- Improve user experience by automating the entire process
---
 docs/how_to_download_models_en.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md
index da570f1b..fa8efd13 100644
--- a/docs/how_to_download_models_en.md
+++ b/docs/how_to_download_models_en.md
@@ -12,7 +12,7 @@ python download_models_hf.py
 ```
 The Python script will automatically download the model files and configure the model directory in the configuration file.
 
-
+The configuration file can be found in the user directory, with the filename `magic-pdf.json`.
 
 # How to update models previously downloaded
 