Skip to content

Commit

Permalink
Merge pull request #699 from myhloli/dev
Browse files Browse the repository at this point in the history
feat(docs): automate model download and configuration
  • Loading branch information
myhloli authored Oct 8, 2024
2 parents 8786d20 + 6c9b23c commit 7b78755
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 17 deletions.
49 changes: 45 additions & 4 deletions docs/download_models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,46 @@
# use modelscope sdk download models
import os
import requests
import json
from modelscope import snapshot_download
# NOTE(review): these three module-level statements repeat the downloads that
# are also performed under the `if __name__ == '__main__':` guard further down.
# The hunk header (`-1,5 +1,46`) suggests they are the pre-change lines of this
# diff rather than live code -- confirm before relying on them.
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
print(f"model dir is: {model_dir}/models")


def download_and_modify_json(url, local_filename, modifications):
    """Create or update a local JSON file and apply key overrides to it.

    If ``local_filename`` already exists it is loaded from disk; otherwise the
    JSON document is fetched from ``url``. Every key/value pair in
    ``modifications`` is then set on the top-level object and the result is
    written back to ``local_filename`` as UTF-8 with 4-space indentation.

    Args:
        url: Address of the JSON template; only used when ``local_filename``
            does not exist yet.
        local_filename: Path of the JSON file to read (if present) and write.
        modifications: Mapping of top-level keys to the values to set.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an error status.
        json.JSONDecodeError: If the existing local file is not valid JSON.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close the
        # handle deterministically instead of leaking it.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
    else:
        # Download the JSON template; the timeout keeps a stalled connection
        # from hanging the script forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail loudly on HTTP error statuses

        # Parse the JSON payload.
        data = response.json()

    # Apply the overrides on top of the loaded/downloaded content.
    data.update(modifications)

    # Persist the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    # Fetch both model repositories; the returned values are used below as
    # local directory paths.
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')
    # The configuration points at the "models" sub-directory of the snapshot.
    # os.path.join avoids mixed path separators on Windows.
    model_dir = os.path.join(model_dir, "models")
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")

    # Template used only when the user has no configuration file yet.
    json_url = 'https://gitee.com/myhloli/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = "magic-pdf.json"
    home_dir = os.path.expanduser("~")
    config_file = os.path.join(home_dir, config_file_name)

    # Keys in the configuration file that must point at the downloaded models.
    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f"The configuration file has been configured successfully, the path is: {config_file}")

48 changes: 45 additions & 3 deletions docs/download_models_hf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,46 @@
import os
import requests
import json
from huggingface_hub import snapshot_download
# NOTE(review): these three module-level statements repeat the downloads that
# are also performed under the `if __name__ == '__main__':` guard further down.
# The hunk header (`-1,4 +1,46`) suggests they are the pre-change lines of this
# diff rather than live code -- confirm before relying on them.
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
layoutreader_model_dir = snapshot_download('hantian/layoutreader')
print(f"model dir is: {model_dir}/models")


def download_and_modify_json(url, local_filename, modifications):
    """Create or update a local JSON file and apply key overrides to it.

    If ``local_filename`` already exists it is loaded from disk; otherwise the
    JSON document is fetched from ``url``. Every key/value pair in
    ``modifications`` is then set on the top-level object and the result is
    written back to ``local_filename`` as UTF-8 with 4-space indentation.

    Args:
        url: Address of the JSON template; only used when ``local_filename``
            does not exist yet.
        local_filename: Path of the JSON file to read (if present) and write.
        modifications: Mapping of top-level keys to the values to set.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an error status.
        json.JSONDecodeError: If the existing local file is not valid JSON.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding used for writing below, and close the
        # handle deterministically instead of leaking it.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
    else:
        # Download the JSON template; the timeout keeps a stalled connection
        # from hanging the script forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail loudly on HTTP error statuses

        # Parse the JSON payload.
        data = response.json()

    # Apply the overrides on top of the loaded/downloaded content.
    data.update(modifications)

    # Persist the modified content.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    # Fetch both model repositories; the returned values are used below as
    # local directory paths.
    model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
    layoutreader_model_dir = snapshot_download('hantian/layoutreader')
    # The configuration points at the "models" sub-directory of the snapshot.
    # os.path.join avoids mixed path separators on Windows.
    model_dir = os.path.join(model_dir, "models")
    print(f"model_dir is: {model_dir}")
    print(f"layoutreader_model_dir is: {layoutreader_model_dir}")

    # Template used only when the user has no configuration file yet.
    json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
    config_file_name = "magic-pdf.json"
    home_dir = os.path.expanduser("~")
    config_file = os.path.join(home_dir, config_file_name)

    # Keys in the configuration file that must point at the downloaded models.
    json_mods = {
        'models-dir': model_dir,
        'layoutreader-model-dir': layoutreader_model_dir,
    }

    download_and_modify_json(json_url, config_file, json_mods)
    print(f"The configuration file has been configured successfully, the path is: {config_file}")

7 changes: 2 additions & 5 deletions docs/how_to_download_models_en.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,9 @@ pip install huggingface_hub
wget https://github.com/opendatalab/MinerU/raw/master/docs/download_models_hf.py
python download_models_hf.py
```
After the Python script finishes executing, it will output the directory where the models are downloaded.

### 2. To modify the model path address in the configuration file

Additionally, in `~/magic-pdf.json`, update the model directory path to the absolute path of the `models` directory output by the previous Python script. Otherwise, you will encounter an error indicating that the model cannot be loaded.
The Python script will automatically download the model files and configure the model directory in the configuration file.

The configuration file is named `magic-pdf.json` and is located in your user home directory (i.e. `~/magic-pdf.json`).

# How to update models previously downloaded

Expand Down
8 changes: 3 additions & 5 deletions docs/how_to_download_models_zh_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@ pip install modelscope
wget https://gitee.com/myhloli/MinerU/raw/master/docs/download_models.py
python download_models.py
```
python脚本执行完毕后,会输出模型下载目录


## 下载完成后的操作:修改magic-pdf.json中的模型路径
`~/magic-pdf.json`里修改模型的目录指向上一步脚本输出的models目录的绝对路径,否则会报模型无法加载的错误。
python脚本会自动下载模型文件并配置好配置文件中的模型目录

配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
> windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"

# 此前下载过模型,如何更新
Expand Down

0 comments on commit 7b78755

Please sign in to comment.