update inference script

SayaSS · Dec 7, 2022 · 041b441 · 041b441
1 parent 93355bb
commit 041b441
Show file tree

Hide file tree

Showing 11 changed files with 630 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -25,9 +25,9 @@ wget -P logs/48k/ https://
 
 
 ## 数据集准备
-仅需要以以下文件结构将数据集放入raw目录即可
+仅需要以以下文件结构将数据集放入dataset_raw目录即可
 ```shell
-raw
+dataset_raw
 ├───speaker0
 │   ├───xxx1-xxx1.wav
 │   ├───...
@@ -52,7 +52,7 @@ python preprocess_flist_config.py
 ```shell
 python preprocess_hubert_f0.py
 ```
-执行完以上步骤后 dataset 目录便是预处理完成的数据，可以删除raw文件夹了
+执行完以上步骤后 dataset 目录便是预处理完成的数据，可以删除dataset_raw文件夹了
 
 ## 训练
 ```shell
@@ -61,4 +61,10 @@ python train.py -c configs/config.json -m 48k
 
 ## 推理
 
+使用[inference_main.py](inference_main.py)
++ 更改模型文件为你自己训练的最新模型记录点
++ 将待转化的音频放在raw文件夹下
++ clean_names 写待转化的音频名称
++ trans填写变调半音数量
++ spk_list填写合成的说话人名称
 
diff --git a/configs/config.json b/configs/config.json
@@ -82,10 +82,13 @@
     "use_spectral_norm": false,
     "gin_channels": 256,
     "ssl_dim": 256,
-    "n_speakers": 4
+    "n_speakers": 10
   },
   "spk": {
-    "paimon": 0,
-    "nen": 1
+    "nen": 0,
+    "paimon": 1,
+    "yunhao": 2,
+    "huiyu":3,
+    "jishuang":4
   }
 }
diff --git a/raw/wav_structure.txt → dataset_raw/wav_structure.txt b/raw/wav_structure.txt → dataset_raw/wav_structure.txt
diff --git a/flask_api.py b/flask_api.py
@@ -0,0 +1,56 @@
+import io
+import logging
+
+import soundfile
+import torch
+import torchaudio
+from flask import Flask, request, send_file
+from flask_cors import CORS
+
+from inference.infer_tool import Svc, RealTimeVC
+
+app = Flask(__name__)  # Flask application object; routes below are registered on it
+
+CORS(app)  # apply flask_cors' CORS wrapper to the whole app
+
+logging.getLogger('numba').setLevel(logging.WARNING)  # raise the 'numba' logger threshold to WARNING
+
+
+@app.route("/voiceChangeModel", methods=["POST"])
+def voice_change_model():  # POST handler: run inference on the uploaded "sample" file, reply with a WAV
+    request_form = request.form
+    wave_file = request.files.get("sample", None)
+    # Pitch-shift information (form field "fPitchChange", defaults to 0)
+    f_pitch_change = float(request_form.get("fPitchChange", 0))
+    # Sample rate required by the DAW (form field "sampleRate"; parsed via float, then truncated to int)
+    daw_sample = int(float(request_form.get("sampleRate", 0)))
+    speaker_id = int(float(request_form.get("sSpeakId", 0)))
+    # Wrap the wav bytes received over HTTP in an in-memory buffer. NOTE(review): wave_file is None if "sample" is missing, so .read() would raise
+    input_wav_path = io.BytesIO(wave_file.read())
+
+    # Model inference. raw_infer, svc_model and svc are module globals assigned only in the __main__ block of this file
+    if raw_infer:  # direct path: Svc.infer, result resampled from svc_model.target_sample to daw_sample
+        out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
+        tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
+    else:  # RealTimeVC.process path; its result goes through torch.from_numpy before the same resampling
+        out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path)
+        tar_audio = torchaudio.functional.resample(torch.from_numpy(out_audio), svc_model.target_sample, daw_sample)
+    # Return the audio: write it as WAV into an in-memory buffer and send that buffer as attachment "temp.wav"
+    out_wav_path = io.BytesIO()
+    soundfile.write(out_wav_path, tar_audio.cpu().numpy(), daw_sample, format="wav")
+    out_wav_path.seek(0)  # rewind so send_file reads from the start of the buffer
+    return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
+
+
+if __name__ == '__main__':
+    # When enabled, audio is synthesized by direct slicing; False selects the cross-fade method.
+    # Setting the VST plugin's slice time to 0.3-0.5s can lower latency; direct slicing produces pops at the joins, cross-fading produces slightly overlapping sound.
+    # Pick whichever method is acceptable to you, or set the VST maximum slice time to 1s; it is set to True here: higher latency, somewhat more stable quality.
+    raw_infer = True
+    # Each model corresponds to exactly one config
+    model_name = "logs/48k/G_174000-Copy1.pth"
+    config_name = "configs/config.json"
+    svc_model = Svc(model_name, config_name)
+    svc = RealTimeVC()
+    # This matches the VST plugin; changing it is not recommended
+    app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
diff --git a/inference/__init__.py b/inference/__init__.py