Skip to content

Commit 1e68bf5

Browse files
committed
Update the _convert_pitch function in block_lpcnet. Update data IO
1 parent d676f87 commit 1e68bf5

File tree

9 files changed

+197
-42
lines changed

9 files changed

+197
-42
lines changed

README.md

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,14 @@ git clone --depth 1 https://github.com/nii-yamagishilab/project-NN-Pytorch-scrip
1616
```
1717

1818
* Latest updates:
19-
1. Code, databases, and resources for the paper below were added. Please check [project/09-asvspoof-vocoded-trn/](project/09-asvspoof-vocoded-trn/) for more details.
19+
1. Neural vocoders pretrained on VoxCeleb2 dev and other datasets are available in tutorial notebook **chapter_a3.ipynb** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing)
20+
2. Code, databases, and resources for the paper below were added. Please check [project/09-asvspoof-vocoded-trn/](project/09-asvspoof-vocoded-trn/) for more details.
2021
> Xin Wang, and Junichi Yamagishi. Spoofed training data for speech spoofing countermeasure can be efficiently created using neural vocoders. Proc. ICASSP 2023, accepted. https://arxiv.org/abs/2210.10570
21-
2. Code for the paper for the paper below were added. Please check [project/08-asvspoof-activelearn](project/08-asvspoof-activelearn) for more details.
22+
3. Code for the paper below was added. Please check [project/08-asvspoof-activelearn](project/08-asvspoof-activelearn) for more details.
2223
> Xin Wang, and Junichi Yamagishi. Investigating Active-Learning-Based Training Data Selection for Speech Spoofing Countermeasure. In Proc. SLT, accepted. 2023.
23-
3. Neural vocoders pretrained on VoxCeleb2 dev and other datasets are available in tutorial notebook **chapter_a3.ipynb** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing)
24-
3. Pointer to tutorials on neural vocoders were moved to [./tutorials/b1_neural_vocoder](./tutorials/b1_neural_vocoder/README.md).
24+
4. Pointers to tutorials on neural vocoders were moved to [./tutorials/b1_neural_vocoder](./tutorials/b1_neural_vocoder/README.md).
2525

26-
4. All pre-trained models were moved to [Zenodo](https://doi.org/10.5281/zenodo.6349636).
27-
28-
5. Move from pytorch-1.6 to pytoch-1.7
26+
5. All pre-trained models were moved to [Zenodo](https://doi.org/10.5281/zenodo.6349636).
2927

3028
## Contents
3129

core_scripts/config_parse/arg_parse.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,11 @@ def f_args_parsed(argument_input = None):
152152
mes = "External directory to store cache file dic"
153153
parser.add_argument('--path-cache-file', type=str, default="", help=mes)
154154

155+
mes = "Skip scanning data directories (by default False)"
156+
parser.add_argument('--force-skip-datadir-scanning',
157+
action='store_true', default=False, help=mes)
158+
159+
155160
######
156161
# options to save model / checkpoint
157162
parser.add_argument('--save-model-dir', type=str, \

core_scripts/data_io/default_data_io.py

Lines changed: 34 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,9 @@ def _data_len_reader(file_path):
7373
"""
7474
file_name, file_ext = os.path.splitext(file_path)
7575
if file_ext == '.wav':
76-
sr, data = nii_wav_tk.waveReadAsFloat(file_path)
77-
length = data.shape[0]
76+
#sr, data = nii_wav_tk.waveReadAsFloat(file_path)
77+
#length = data.shape[0]
78+
length = nii_wav_tk.readWaveLength(file_path)
7879
elif file_ext == '.flac':
7980
sr, data = nii_wav_tk.flacReadAsFloat(file_path)
8081
length = data.shape[0]
@@ -206,9 +207,11 @@ def _tmp_f(list2, default_value, length):
206207
if global_arg is not None:
207208
self.m_ignore_length_invalid = global_arg.ignore_length_invalid_data
208209
self.m_ignore_cached_finfo = global_arg.ignore_cached_file_infor
210+
self.m_force_skip_scanning = global_arg.force_skip_datadir_scanning
209211
else:
210212
self.m_ignore_length_invalid = False
211213
self.m_ignore_cached_finfo = False
214+
self.m_force_skip_scanning = False
212215

213216
# check augmentation functions
214217
if input_augment_funcs:
@@ -474,9 +477,9 @@ def __getitem__(self, idx_input):
474477
if in_data.shape[0] != tmp_d[s_idx:e_idx].shape[0]:
475478
mes = 'Expected length is {:d}.\n'.format(e_idx-s_idx)
476479
mes += "Loaded length "+str(tmp_d[s_idx:e_idx].shape[0])
477-
mes += 'This may be due to an incompatible cache *.dic.'
478-
mes += '\nPlease check the length in *.dic\n'
479-
mes += 'Please delete it if the cached length is wrong.'
480+
mes += '\nThis may be due to an incompatible cache *.dic.'
481+
mes += '\nPlease check the length in *.dic'
482+
mes += '\nPlease delete it if the cached length is wrong.'
480483
nii_warn.f_print(mes)
481484
nii_warn.f_die("fail to load {:s}".format(file_name))
482485
else:
@@ -820,26 +823,31 @@ def f_check_file_list(self, data_len_buf_path):
820823
return
821824

822825
# check the list of files exist in all input/output directories
823-
for tmp_d, tmp_e in zip(self.m_input_dirs, self.m_input_exts):
824-
tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e, flag_recur)
825-
tmp_new_list = nii_list_tools.common_members(tmp_list,
826-
self.m_file_list)
827-
if len(tmp_new_list) < 1:
828-
nii_warn.f_print("Possible error when scanning:", 'error')
829-
nii_warn.f_print(" {:s} for {:s}".format(tmp_d, tmp_e), 'error')
830-
nii_warn.f_print('Some file names to be scanned:', 'error')
831-
nii_warn.f_print(' ' + ' '.join(self.m_file_list[0:10]),'error')
832-
if self.m_file_list[0].endswith(tmp_e):
833-
nii_warn.f_print('Names should not have {:s}'.format(tmp_e))
834-
if os.path.isfile(self.m_file_list[0]):
835-
mes = "The above name seems not to be the data name. "
836-
mes += "It seems to be a file path. "
837-
mes += "\nPlease check test_list, trn_list, val_list."
838-
nii_warn.f_print(mes, 'error')
839-
self.m_file_list = tmp_new_list
840-
break
841-
else:
842-
self.m_file_list = tmp_new_list
826+
if not self.m_force_skip_scanning:
827+
for tmp_d, tmp_e in zip(self.m_input_dirs, self.m_input_exts):
828+
# read a file list from the input directory
829+
tmp_list = nii_list_tools.listdir_with_ext(
830+
tmp_d, tmp_e, flag_recur)
831+
# get the common set of the existing files and those in list
832+
tmp_new_list = nii_list_tools.common_members(
833+
tmp_list, self.m_file_list)
834+
835+
if len(tmp_new_list) < 1:
836+
nii_warn.f_print("Possible error when scanning:", 'error')
837+
nii_warn.f_print(" {:s} for {:s}".format(tmp_d, tmp_e), 'error')
838+
nii_warn.f_print('Some file names to be scanned:', 'error')
839+
nii_warn.f_print(' ' + ' '.join(self.m_file_list[0:10]),'error')
840+
if self.m_file_list[0].endswith(tmp_e):
841+
nii_warn.f_print('Names should not have {:s}'.format(tmp_e))
842+
if os.path.isfile(self.m_file_list[0]):
843+
mes = "The above name seems not to be the data name. "
844+
mes += "It seems to be a file path. "
845+
mes += "\nPlease check test_list, trn_list, val_list."
846+
nii_warn.f_print(mes, 'error')
847+
self.m_file_list = tmp_new_list
848+
break
849+
else:
850+
self.m_file_list = tmp_new_list
843851

844852
if len(self.m_file_list) < 1:
845853
nii_warn.f_print("\nNo input features found after scanning",'error')
@@ -853,7 +861,7 @@ def f_check_file_list(self, data_len_buf_path):
853861
nii_warn.f_die("Failed to read input features")
854862

855863
# check output files if necessary
856-
if self.m_output_dirs:
864+
if self.m_output_dirs and not self.m_force_skip_scanning:
857865
for tmp_d, tmp_e in zip(self.m_output_dirs, \
858866
self.m_output_exts):
859867
tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e,

core_scripts/data_io/wav_tools.py

Lines changed: 63 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import os
1818
import sys
1919
import numpy as np
20+
import wave
2021
import scipy.io.wavfile
2122
try:
2223
import soundfile
@@ -206,6 +207,20 @@ def flacReadAsFloat(wavFileIn):
206207
return sr, x
207208

208209

210+
def readWaveLength(wavFileIn):
    """ length = readWaveLength(wavFileIn)
    Read the length of the waveform from the file header, without
    converting the samples to float.

    Input:
         wavFileIn, str, path to the input waveform
    Return:
         length, int, length of waveform, i.e., the number of frames
                 (sampling points per channel)
    """
    try:
        # fast path: only the header is parsed by the wave module
        with wave.open(wavFileIn, 'rb') as file_ptr:
            return file_ptr.getnframes()
    except wave.Error:
        # The wave module rejects headers it cannot parse, for example
        # IEEE-float waveforms. Fall back to scipy, which reads the data
        # and therefore is slower, but supports more sample formats.
        _, data = scipy.io.wavfile.read(wavFileIn)
        return data.shape[0]
222+
223+
209224
def buffering(x, n, p=0, opt=None):
210225
"""buffering(x, n, p=0, opt=None)
211226
input
@@ -277,14 +292,16 @@ def silence_handler(wav, sr, fl=320, fs=80,
277292
shortest_len_in_ms=50,
278293
flag_output=0,
279294
flag_norm_amp=True,
280-
flag_only_startend_sil=False):
295+
flag_only_startend_sil = False,
296+
opt_silence_handler = -1):
281297
"""silence_handler(wav, sr, fl=320, fs=80,
282298
max_thres_below=30,
283299
min_thres=-55,
284300
shortest_len_in_ms=50,
285301
flag_output=0,
286302
flag_norm_amp=True,
287-
flag_only_startend_sil=False)
303+
flag_only_startend_sil = False,
304+
opt_silence_handler = -1)
288305
289306
Based on the Speech activity detector mentioned in Sec5.1 of
290307
Tomi Kinnunen, and Haizhou Li.
@@ -311,10 +328,16 @@ def silence_handler(wav, sr, fl=320, fs=80,
311328
segment less than this length is treated as speech
312329
flag_norm_amp: bool, whether normalize the waveform amplitude
313330
based on window function (default True)
314-
flag_only_startend_sil: bool, whether only consider silence in
331+
flag_only_startend_sil (obsolete): bool, whether only consider silence in
315332
the beginning and end. If False, silence within the utterance
316333
will be marked / removed (default False)
317334
335+
opt_silence_handler: int, option to silence trim handler
336+
0: equivalent to flag_only_startend_sil = False
337+
1: equivalent to flag_only_startend_sil = True
338+
2: remove only silence between words
339+
-1: not use this option, but follow flag_only_startend_sil
340+
318341
output
319342
------
320343
wav_no_sil: np.array, (length_1, ), waveform after removing silence
@@ -373,9 +396,26 @@ def ignore_short_seg(frame_tag, seg_len_thres):
373396
# remove short nonsil segments
374397
frame_process_all = ignore_short_seg(frame_process_sil, seg_len_thres)
375398
frame_tag = frame_process_all
399+
376400

377-
# if only consder silence in the front and end
378-
if flag_only_startend_sil:
401+
if opt_silence_handler < 0:
402+
# if only consider silence in the front and end
403+
if flag_only_startend_sil:
404+
tmp_nonzero = np.flatnonzero(frame_tag)
405+
406+
# start of the first nonsil segment
407+
#start_nonsil = np.asarray(frame_tag == 1).nonzero()[0]
408+
if np.any(tmp_nonzero):
409+
start_nonsil = np.flatnonzero(frame_tag)[0]
410+
# end of the last nonsil segment
411+
end_nonsil = np.flatnonzero(frame_tag)[-1]
412+
# all segments between are switched to nonsil
413+
frame_tag[start_nonsil:end_nonsil] = 1
414+
else:
415+
# no non-silence data, just let it pass
416+
pass
417+
elif opt_silence_handler == 1:
418+
# if only consider silence in the front and end
379419
tmp_nonzero = np.flatnonzero(frame_tag)
380420

381421
# start of the first nonsil segment
@@ -389,7 +429,24 @@ def ignore_short_seg(frame_tag, seg_len_thres):
389429
else:
390430
# no non-silence data, just let it pass
391431
pass
392-
432+
elif opt_silence_handler == 2:
433+
# if only consider silence between words (silence in the front and end is kept)
434+
tmp_nonzero = np.flatnonzero(frame_tag)
435+
436+
# start of the first nonsil segment
437+
#start_nonsil = np.asarray(frame_tag == 1).nonzero()[0]
438+
if np.any(tmp_nonzero):
439+
start_nonsil = np.flatnonzero(frame_tag)[0]
440+
# end of the last nonsil segment
441+
end_nonsil = np.flatnonzero(frame_tag)[-1]
442+
# frames before the first and from the last nonsil frame on are switched to nonsil
443+
frame_tag[:start_nonsil] = 1
444+
frame_tag[end_nonsil:] = 1
445+
else:
446+
# no non-silence data, just let it pass
447+
pass
448+
else:
449+
pass
393450

394451

395452
# separate non-speech and speech segments

project/05-nn-vocoders/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ This project is Pytorch re-implementation of a few neural waveform models.
77

88
* Note that the tutorial **chapter_a3_pretrained_vocoders.ipynb** includes pre-trained HiFiGAN and WaveGlow on VoxCeleb2 dev and other speech datasets [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing).
99

10+
* The code to extract the input Mel-spectrogram and F0 is included in the above tutorial and notebooks as well. This folder assumes that the input Mel-spectrogram and F0 have been prepared in advance.
11+
1012
**It is better to check the tutorials before diving into this project**.
1113

1214
## Quick start

project/05-nn-vocoders/ilpcnet/block_lpcnet.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,8 @@ def _convert_pitch(self, pitch_value):
621621
------
622622
output: tensor in int64, quantized pitch
623623
"""
624-
return torch.clamp((pitch_value - 33) // 2, 0, 256).to(torch.int64)
624+
return torch.clamp((pitch_value - 33) // 2, 0,
625+
self.m_pitch_cat-1).to(torch.int64)
625626

626627

627628
def forward(self, cond_feat, cond_feat_normed,

sandbox/block_rawnet.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,12 +380,18 @@ def _compute_score(self, emb, inference=True):
380380
381381
Score here refers to
382382
"""
383+
# we should not use logsoftmax if we will use CrossEntropyLoss
384+
flag_logsoftmax = False
385+
383386
if inference:
384387
# no softmax
385388
return self.m_output(emb)
386-
else:
389+
elif flag_logsoftmax:
387390
# Logsoftmax for training loss
391+
# this is used when the training criterion is NLLoss
388392
return self.logsoftmax(self.m_output(emb))
393+
else:
394+
return self.m_output(emb)
389395

390396
def forward(self, x):
391397
"""

sandbox/eval_asvspoof.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,84 @@ def tDCF_wrapper(bonafide_cm_scores, spoof_cm_scores,
577577
return min_tDCF, eer_cm, eer_threshold
578578

579579

580+
def tDCF_wrapper2(bonafide_score_cm, spoof_score_cm, C0, C1, C2):
    """ mintDCF, eer = tDCF_wrapper2(bonafide_score_cm,
                                     spoof_score_cm, C0, C1, C2)

    compute_tDCF can be factorized into two parts:
    C012 computation and min t-DCF computation.

    This is for min t-DCF computation, given the values of C012

    input
    -----
      bonafide_score_cm  np.array, score of bonafide data
      spoof_score_cm     np.array, score of spoofed data
      C0                 scalar, coefficient for min tDCF computation
      C1                 scalar, coefficient for min tDCF computation
      C2                 scalar, coefficient for min tDCF computation

    output
    ------
      mintDCF            scalar, value of min tDCF
                         (np.nan if any of C0, C1, C2 is nan)
      eer                scalar, value of EER

    The program exits through sys.exit if the scores contain nan or inf,
    or if there are fewer than three unique score values.

    For C0, C1, C2, see Appendix Eqs.(1-2) in evaluation plan [1],
    or Eqs.(10-11) in [2]

    References:

      [1] T. Kinnunen, H. Delgado, N. Evans,K.-A. Lee, V. Vestman,
          A. Nautsch, M. Todisco, X. Wang, M. Sahidullah, J. Yamagishi,
          and D.-A. Reynolds, "Tandem Assessment of Spoofing Countermeasures
          and Automatic Speaker Verification: Fundamentals," IEEE/ACM
          Transactions on Audio, Speech and Language Processing (TASLP).

      [2] ASVspoof 2019 challenge evaluation plan
          https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf

    """
    # Sanity check of scores
    combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))
    if np.isnan(combined_scores).any() or np.isinf(combined_scores).any():
        sys.exit('ERROR: Your scores contain nan or inf.')

    # Sanity check that inputs are scores and not decisions
    n_uniq = np.unique(combined_scores).size
    if n_uniq < 3:
        sys.exit('ERROR: You should provide soft CM scores - not binary decisions')

    # Obtain miss and false alarm rates of CM
    # (the thresholds returned by compute_det_curve are not needed here)
    Pmiss_cm, Pfa_cm, _ = compute_det_curve(
        bonafide_score_cm, spoof_score_cm)

    # =====
    # tDCF
    # =====
    if np.isnan(C0) or np.isnan(C1) or np.isnan(C2):
        # this is a case where the coefficients are not available,
        # so min t-DCF cannot be computed and only EER is meaningful
        mintDCF = np.nan
    else:
        # tDCF values at each operating point of the CM
        tDCF = C0 + C1 * Pmiss_cm + C2 * Pfa_cm
        # Obtain default t-DCF
        tDCF_default = C0 + np.minimum(C1, C2)
        # Normalized t-DCF
        tDCF_norm = tDCF / tDCF_default
        # min t-DCF
        mintDCF = tDCF_norm.min()

    # ====
    # EER
    # ====
    # operating point where miss rate and false alarm rate are closest
    abs_diffs = np.abs(Pmiss_cm - Pfa_cm)
    min_index = np.argmin(abs_diffs)
    eer = np.mean((Pmiss_cm[min_index], Pfa_cm[min_index]))

    return mintDCF, eer
655+
656+
657+
580658
def ASVspoof2019_evaluate(bonafide_cm_scores, bonafide_cm_file_names,
581659
spoof_cm_scores, spoof_cm_file_names, verbose=False,
582660
protocol_alternative=None):

sandbox/util_loss_metric.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ def rank_consistency_v3(x, metric = None):
462462
>> rank_consistency_v3(x, metric)
463463
tensor(.0)
464464
"""
465-
465+
# batch size
466466
bs = x.shape[0]
467467

468468
# loss to be accumulated

0 commit comments

Comments
 (0)