myshell-ai · zgldh · Dec 8, 2024
diff --git a/melo/api.py b/melo/api.py
@@ -122,7 +122,7 @@ def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_s
                     )[0][0, 0].data.cpu().float().numpy()
                 del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers
                 # 
-            audio_list.append(audio)
+            audio_list.append(utils.fix_loudness(audio,self.hps.data.sampling_rate))
         torch.cuda.empty_cache()
         audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
 

diff --git a/melo/utils.py b/melo/utils.py
@@ -13,11 +13,22 @@
 from melo.text.cleaner import clean_text
 from melo import commons
 
+import pyloudnorm as pyln
+
 MATPLOTLIB_FLAG = False
 
 logger = logging.getLogger(__name__)
 
+def fix_loudness(input, rate):  # Normalize audio `input` (sample rate `rate`): peak first, then loudness.
+    # Peak-normalize to -1 dB.
+    peak_normalized_audio = pyln.normalize.peak(input, -1.0)
+    # NOTE(review): very short or silent clips may make the loudness measurement below fail or yield -inf -- confirm against pyloudnorm.
+    # Measure the integrated loudness of the peak-normalized signal.
+    meter = pyln.Meter(rate)
+    loudness = meter.integrated_loudness(peak_normalized_audio)
 
+    # Loudness-normalize to -12 dB LUFS and return the result.
+    return pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0)
 
 def get_text_for_tts_infer(text, language_str, hps, device, symbol_to_id=None):
     norm_text, phone, tone, word2ph = clean_text(text, language_str)

diff --git a/requirements.txt b/requirements.txt
@@ -27,3 +27,4 @@ langid==1.1.6
 tqdm
 tensorboard==2.16.2
 loguru==0.7.2
+pyloudnorm