Skip to content

Commit cf03ef2

Browse files
authored
Merge pull request #42 from nezihtopaloglu/add_wvmos_sigmos
Add wvmos sigmos
2 parents 6cae0ce + 5e3187c commit cf03ef2

11 files changed

Lines changed: 411 additions & 8 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,4 @@ fadtk/
169169
scoreq/
170170
fairseq/
171171
UTMOSv2/
172+
wvmos/

docs/supported_metrics.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@ We include x mark if the metric is auto-installed in versa.
5555
| 48 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_bvcc | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |
5656
| 49 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_nisqa | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |
5757
| 50 | x | DNSMOS Pro: A Reduced-Size DNN for Probabilistic MOS of Speech | pseudo_mos | dnsmos_pro_vcc2018 | [DNSMOSPro](https://github.com/fcumlin/DNSMOSPro/tree/main) | [paper](https://www.isca-archive.org/interspeech_2024/cumlin24_interspeech.html) |
58-
| 51 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) |
59-
58+
| 51 | | WV-MOS (MOS score prediction by fine-tuned wav2vec2.0 model) | wvmos | wvmos | [wvmos](https://github.com/AndreevP/wvmos) | [paper](https://arxiv.org/abs/2203.13086) |
59+
| 52 | | SIG-MOS | sigmos | {SIGMOS_COL, SIGMOS_DISC, SIGMOS_LOUD, SIGMOS_REVERB, SIGMOS_SIG, SIGMOS_OVRL} | [sigmos](https://github.com/microsoft/SIG-Challenge/tree/main/ICASSP2024/sigmos) | [paper](https://arxiv.org/pdf/2309.07385) |
60+
| 53 | x | VQScore (Self-Supervised Speech Quality Estimation and Enhancement Using Only Clean Speech) | vqscore | vqscore | [VQScore](https://github.com/JasonSWFu/VQscore) | [paper](https://arxiv.org/abs/2402.16321) |
6061

6162

6263
### Dependent Metrics
@@ -68,7 +69,7 @@ We include x mark if the metric is auto-installed in versa.
6869
| 4 | x | Signal-to-interference Ratio (SIR) | signal_metric | sir | [espnet](https://github.com/espnet/espnet) | - |
6970
| 5 | x | Signal-to-artifact Ratio (SAR) | signal_metric | sar | [espnet](https://github.com/espnet/espnet) | - |
7071
| 6 | x | Signal-to-distortion Ratio (SDR) | signal_metric | sdr | [espnet](https://github.com/espnet/espnet) | - |
71-
| 7 | x | Convolutional scale-invariant signal-to-distortion ratio (CI-SDR) | signal_metric | ci-sdr | [ci_sdr](https://github.com/fgnt/ci_sdr) | [paper](https://arxiv.org/abs/2011.15003) |
72+
| 7 | x | Convolutional scale-invariant signal-to-distortion ratio (CI-SDR) | signal_metric | ci-sdr | [ci_sdr](https://github.com/fgnt/ci_sdr) | [paper](https://arxiv.org/abs/2011.15003) |
7273
| 8 | x | Scale-invariant signal-to-noise ratio (SI-SNR) | signal_metric | si-snr | [espnet](https://github.com/espnet/espnet) | [paper](https://arxiv.org/abs/1711.00541) |
7374
| 9 | x | Perceptual Evaluation of Speech Quality (PESQ) | pesq | pesq | [pesq](https://pypi.org/project/pesq/) | [paper](https://ieeexplore.ieee.org/document/941023) |
7475
| 10 | x | Short-Time Objective Intelligibility (STOI) | stoi | stoi | [pystoi](https://github.com/mpariente/pystoi) | [paper](https://ieeexplore.ieee.org/document/5495701) |

egs/separate_metrics/sigmos.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# sigmos (independent) metric
2+
3+
- name: sigmos

egs/separate_metrics/wvmos.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# wvmos (independent) metric
2+
3+
- name: wvmos

test/test_pipeline/test_sigmos.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import logging
2+
import math
3+
import os
4+
5+
import yaml
6+
7+
from versa.scorer_shared import (
8+
find_files,
9+
list_scoring,
10+
load_score_modules,
11+
load_summary,
12+
)
13+
14+
TEST_INFO = {
15+
"SIGMOS_COL": 1.3242647647857666,
16+
"SIGMOS_DISC": 1.0382881164550781,
17+
"SIGMOS_LOUD": 1.0047355890274048,
18+
"SIGMOS_REVERB": 1.0245660543441772,
19+
"SIGMOS_SIG": 1.0186278820037842,
20+
"SIGMOS_OVRL": 1.0545676946640015,
21+
}
22+
23+
24+
def info_update():
25+
# find files
26+
if os.path.isdir("test/test_samples/test2"):
27+
gen_files = find_files("test/test_samples/test2")
28+
29+
# find reference file
30+
if os.path.isdir("test/test_samples/test1"):
31+
gt_files = find_files("test/test_samples/test1")
32+
33+
logging.info("The number of utterances = %d" % len(gen_files))
34+
35+
with open("egs/separate_metrics/sigmos.yaml", "r", encoding="utf-8") as f:
36+
score_config = yaml.full_load(f)
37+
38+
score_modules = load_score_modules(
39+
score_config,
40+
use_gt=(True if gt_files is not None else False),
41+
use_gpu=False,
42+
)
43+
44+
assert len(score_config) > 0, "no scoring function is provided"
45+
46+
score_info = list_scoring(
47+
gen_files, score_modules, gt_files, output_file=None, io="soundfile"
48+
)
49+
summary = load_summary(score_info)
50+
print("Summary: {}".format(load_summary(score_info)), flush=True)
51+
52+
for key in summary:
53+
if math.isinf(TEST_INFO[key]) and math.isinf(summary[key]):
54+
# for sir
55+
continue
56+
# the plc mos is non-deterministic
57+
if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos":
58+
raise ValueError(
59+
"Value issue in the test case, might be some issue in scorer {}".format(
60+
key
61+
)
62+
)
63+
print("check successful", flush=True)
64+
65+
66+
if __name__ == "__main__":
67+
info_update()

test/test_pipeline/test_wvmos.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import logging
2+
import math
3+
import os
4+
5+
import yaml
6+
7+
from versa.scorer_shared import (
8+
find_files,
9+
list_scoring,
10+
load_score_modules,
11+
load_summary,
12+
)
13+
14+
TEST_INFO = {"wvmos": 0.621284008026123}
15+
16+
17+
def info_update():
18+
# find files
19+
if os.path.isdir("test/test_samples/test2"):
20+
gen_files = find_files("test/test_samples/test2")
21+
22+
# find reference file
23+
if os.path.isdir("test/test_samples/test1"):
24+
gt_files = find_files("test/test_samples/test1")
25+
26+
logging.info("The number of utterances = %d" % len(gen_files))
27+
28+
with open("egs/separate_metrics/wvmos.yaml", "r", encoding="utf-8") as f:
29+
score_config = yaml.full_load(f)
30+
31+
score_modules = load_score_modules(
32+
score_config,
33+
use_gt=(True if gt_files is not None else False),
34+
use_gpu=False,
35+
)
36+
37+
assert len(score_config) > 0, "no scoring function is provided"
38+
39+
score_info = list_scoring(
40+
gen_files, score_modules, gt_files, output_file=None, io="soundfile"
41+
)
42+
summary = load_summary(score_info)
43+
print("Summary: {}".format(load_summary(score_info)), flush=True)
44+
45+
for key in summary:
46+
if math.isinf(TEST_INFO[key]) and math.isinf(summary[key]):
47+
# for sir
48+
continue
49+
# the plc mos is non-deterministic
50+
if abs(TEST_INFO[key] - summary[key]) > 1e-4 and key != "plcmos":
51+
raise ValueError(
52+
"Value issue in the test case, might be some issue in scorer {}".format(
53+
key
54+
)
55+
)
56+
print("check successful", flush=True)
57+
58+
59+
if __name__ == "__main__":
60+
info_update()

tools/install_wvmos.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
3+
if [ -d "wvmos" ]; then
4+
rm -rf wvmos
5+
fi
6+
7+
# Clone and install wvmos
8+
git clone https://github.com/AndreevP/wvmos.git
9+
cd wvmos
10+
pip install -e .
11+
cd ..
12+

versa/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@
109109
from versa.utterance_metrics.squim import squim_metric, squim_metric_no_ref
110110
from versa.utterance_metrics.srmr import srmr_metric
111111
from versa.utterance_metrics.chroma_alignment import chroma_metric
112+
from versa.utterance_metrics.wvmos import wvmos_setup, wvmos_calculate
113+
from versa.utterance_metrics.sigmos import sigmos_setup, sigmos_calculate
112114
from versa.utterance_metrics.dpam_distance import dpam_metric, dpam_model_setup
113115
from versa.utterance_metrics.cdpam_distance import cdpam_metric, cdpam_model_setup
114116
from versa.utterance_metrics.vqscore import vqscore_metric, vqscore_setup

versa/scorer_shared.py

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
9797
logging.info("Initiate WARP-Q metric...")
9898

9999
elif config["name"] == "nisqa":
100-
101100
logging.info("Loading NISQA evaluation...")
102101
from versa.utterance_metrics.nisqa import nisqa_metric, nisqa_model_setup
103102

@@ -257,7 +256,6 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
257256
logging.info("Initiate singer evaluation successfully.")
258257

259258
elif config["name"] == "sheet_ssqa":
260-
261259
logging.info("Loading Sheet SSQA models for evaluation...")
262260
from versa import sheet_ssqa, sheet_ssqa_setup
263261

@@ -287,7 +285,6 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
287285
logging.info("Initiate torch squim (with reference) successfully")
288286

289287
elif config["name"] == "squim_no_ref":
290-
291288
logging.info("Loading squim metrics with reference")
292289
from versa import squim_metric_no_ref
293290

@@ -466,7 +463,6 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
466463
logging.info("Initiate se_snr successfully")
467464

468465
elif config["name"] == "pam":
469-
470466
logging.info("Loading pam metric without reference...")
471467
from versa.utterance_metrics.pam import pam_metric, pam_model_setup
472468

@@ -494,7 +490,6 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
494490
logging.info("Initiate vad metric successfully.")
495491

496492
elif config["name"] == "asvspoof_score":
497-
498493
logging.info("Loading asvspoof score metric without reference...")
499494
from versa.utterance_metrics.asvspoof_score import (
500495
asvspoof_metric,
@@ -897,6 +892,29 @@ def load_score_modules(score_config, use_gt=True, use_gt_text=False, use_gpu=Fal
897892
"scale_factor": config.get("scale_factor", 100),
898893
},
899894
}
895+
elif "wvmos" in config["name"]:
896+
logging.info("Loading WVMOS metric")
897+
from versa import wvmos_setup, wvmos_calculate
898+
899+
model = wvmos_setup(
900+
use_gpu=use_gpu,
901+
)
902+
score_modules["wvmos"] = {
903+
"module": wvmos_calculate,
904+
"args": {"model": model},
905+
}
906+
logging.info("Initiate WVMOS metric successfully")
907+
elif "sigmos" in config["name"]:
908+
logging.info("Loading SIGMOS metric")
909+
from versa import sigmos_setup, sigmos_calculate
910+
911+
model = sigmos_setup()
912+
913+
score_modules["sigmos"] = {
914+
"module": sigmos_calculate,
915+
"args": {"model": model},
916+
}
917+
logging.info("Initiate SIGMOS metric successfully")
900918
elif "vqscore" in config["name"]:
901919
logging.info("Loading VQScore model")
902920
from versa import vqscore_metric, vqscore_setup
@@ -1108,6 +1126,18 @@ def use_score_modules(score_modules, gen_wav, gt_wav, gen_sr, text=None):
11081126
gen_sr,
11091127
custom_prompt=score_modules[key]["prompt"],
11101128
)
1129+
elif key == "wvmos":
1130+
score = score_modules[key]["module"](
1131+
score_modules[key]["args"]["model"],
1132+
gen_wav,
1133+
gen_sr,
1134+
)
1135+
elif key == "sigmos":
1136+
score = score_modules[key]["module"](
1137+
score_modules[key]["args"]["model"],
1138+
gen_wav,
1139+
gen_sr,
1140+
)
11111141
elif key == "vqscore":
11121142
score = score_modules[key]["module"](
11131143
score_modules[key]["args"]["model"], gen_wav, gen_sr

0 commit comments

Comments
 (0)