Update the _convert_pitch function in block_lpcnet. Update data IO

TonyWangX · TonyWangX · commit 1e68bf568c93 · 2023-04-22T12:00:25.000+09:00
diff --git a/README.md b/README.md
@@ -16,16 +16,14 @@ git clone --depth 1 https://github.com/nii-yamagishilab/project-NN-Pytorch-scrip
 ```
 
 * Latest updates:
-   1. Code, databases, and resources for the paper below were added. Please check [project/09-asvspoof-vocoded-trn/](project/09-asvspoof-vocoded-trn/) for more details.
+   1. Neural vocoders pretrained on VoxCeleb2 dev and other datasets are available in tutorial notebook **chapter_a3.ipynb** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing)
+   2. Code, databases, and resources for the paper below were added. Please check [project/09-asvspoof-vocoded-trn/](project/09-asvspoof-vocoded-trn/) for more details.
       > Xin Wang, and Junichi Yamagishi. Spoofed training data for speech spoofing countermeasure can be efficiently created using neural vocoders. Proc. ICASSP 2023, accepted. https://arxiv.org/abs/2210.10570
-   2. Code for the paper for the paper below were added. Please check [project/08-asvspoof-activelearn](project/08-asvspoof-activelearn) for more details.
+   3. Code for the paper below was added. Please check [project/08-asvspoof-activelearn](project/08-asvspoof-activelearn) for more details.
       > Xin Wang, and Junichi Yamagishi. Investigating Active-Learning-Based Training Data Selection for Speech Spoofing Countermeasure. In Proc. SLT, accepted. 2023.
-   3. Neural vocoders pretrained on VoxCeleb2 dev and other datasets are available in tutorial notebook **chapter_a3.ipynb** [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing)
-   3. Pointer to tutorials on neural vocoders were moved to [./tutorials/b1_neural_vocoder](./tutorials/b1_neural_vocoder/README.md).
+   4. Pointers to tutorials on neural vocoders were moved to [./tutorials/b1_neural_vocoder](./tutorials/b1_neural_vocoder/README.md).
    
-   4. All pre-trained models were moved to [Zenodo](https://doi.org/10.5281/zenodo.6349636).
-
-   5. Move from pytorch-1.6 to pytoch-1.7
+   5. All pre-trained models were moved to [Zenodo](https://doi.org/10.5281/zenodo.6349636).
 
 ## Contents
 
diff --git a/core_scripts/config_parse/arg_parse.py b/core_scripts/config_parse/arg_parse.py
@@ -152,6 +152,11 @@ def f_args_parsed(argument_input = None):
     mes = "External directory to store cache file dic"
     parser.add_argument('--path-cache-file', type=str, default="", help=mes)
 
+    mes = "Skip scanning data directories (by default False)"
+    parser.add_argument('--force-skip-datadir-scanning', 
+                        action='store_true', default=False, help=mes)
+    
+    
     ######
     # options to save model / checkpoint
     parser.add_argument('--save-model-dir', type=str, \
diff --git a/core_scripts/data_io/default_data_io.py b/core_scripts/data_io/default_data_io.py
@@ -73,8 +73,9 @@ def _data_len_reader(file_path):
     """
     file_name, file_ext = os.path.splitext(file_path)
     if file_ext == '.wav':
-        sr, data = nii_wav_tk.waveReadAsFloat(file_path)
-        length = data.shape[0]
+        #sr, data = nii_wav_tk.waveReadAsFloat(file_path)
+        #length = data.shape[0]
+        length = nii_wav_tk.readWaveLength(file_path)
     elif file_ext == '.flac':
         sr, data = nii_wav_tk.flacReadAsFloat(file_path)
         length = data.shape[0]
@@ -206,9 +207,11 @@ def _tmp_f(list2, default_value, length):
         if global_arg is not None:
             self.m_ignore_length_invalid = global_arg.ignore_length_invalid_data
             self.m_ignore_cached_finfo = global_arg.ignore_cached_file_infor
+            self.m_force_skip_scanning = global_arg.force_skip_datadir_scanning
         else:
             self.m_ignore_length_invalid = False
             self.m_ignore_cached_finfo = False
+            self.m_force_skip_scanning = False
 
         # check augmentation funcctions
         if input_augment_funcs:
@@ -474,9 +477,9 @@ def __getitem__(self, idx_input):
                     if in_data.shape[0] != tmp_d[s_idx:e_idx].shape[0]:
                         mes = 'Expected length is {:d}.\n'.format(e_idx-s_idx)
                         mes += "Loaded length "+str(tmp_d[s_idx:e_idx].shape[0])
-                        mes += 'This may be due to an incompatible cache *.dic.'
-                        mes += '\nPlease check the length in *.dic\n'
-                        mes += 'Please delete it if the cached length is wrong.'
+                        mes += '\nThis may be due to an incompatible cache *.dic.'
+                        mes += '\nPlease check the length in *.dic'
+                        mes += '\nPlease delete it if the cached length is wrong.'
                         nii_warn.f_print(mes)
                         nii_warn.f_die("fail to load {:s}".format(file_name))
                     else:
@@ -820,26 +823,31 @@ def f_check_file_list(self, data_len_buf_path):
             return
 
         # check the list of files exist in all input/output directories
-        for tmp_d, tmp_e in zip(self.m_input_dirs, self.m_input_exts):
-            tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e, flag_recur)
-            tmp_new_list = nii_list_tools.common_members(tmp_list, 
-                                                         self.m_file_list)
-            if len(tmp_new_list) < 1:
-                nii_warn.f_print("Possible error when scanning:", 'error')
-                nii_warn.f_print(" {:s} for {:s}".format(tmp_d, tmp_e), 'error')
-                nii_warn.f_print('Some file names to be scanned:', 'error')
-                nii_warn.f_print(' ' + ' '.join(self.m_file_list[0:10]),'error')
-                if self.m_file_list[0].endswith(tmp_e):
-                    nii_warn.f_print('Names should not have {:s}'.format(tmp_e))
-                if os.path.isfile(self.m_file_list[0]):
-                    mes = "The above name seems not to be the data name. "
-                    mes += "It seems to be a file path. "
-                    mes += "\nPlease check test_list, trn_list, val_list."
-                    nii_warn.f_print(mes, 'error')
-                self.m_file_list = tmp_new_list
-                break
-            else:
-                self.m_file_list = tmp_new_list
+        if not self.m_force_skip_scanning:
+            for tmp_d, tmp_e in zip(self.m_input_dirs, self.m_input_exts):
+                # read a file list from the input directory
+                tmp_list = nii_list_tools.listdir_with_ext(
+                    tmp_d, tmp_e, flag_recur)
+                # get the common set of the existing files and those in list
+                tmp_new_list = nii_list_tools.common_members(
+                    tmp_list, self.m_file_list)
+            
+                if len(tmp_new_list) < 1:
+                    nii_warn.f_print("Possible error when scanning:", 'error')
+                    nii_warn.f_print(" {:s} for {:s}".format(tmp_d, tmp_e), 'error')
+                    nii_warn.f_print('Some file names to be scanned:', 'error')
+                    nii_warn.f_print(' ' + ' '.join(self.m_file_list[0:10]),'error')
+                    if self.m_file_list[0].endswith(tmp_e):
+                        nii_warn.f_print('Names should not have {:s}'.format(tmp_e))
+                    if os.path.isfile(self.m_file_list[0]):
+                        mes = "The above name seems not to be the data name. "
+                        mes += "It seems to be a file path. "
+                        mes += "\nPlease check test_list, trn_list, val_list."
+                        nii_warn.f_print(mes, 'error')
+                    self.m_file_list = tmp_new_list
+                    break
+                else:
+                    self.m_file_list = tmp_new_list
 
         if len(self.m_file_list) < 1:
             nii_warn.f_print("\nNo input features found after scanning",'error')
@@ -853,7 +861,7 @@ def f_check_file_list(self, data_len_buf_path):
             nii_warn.f_die("Failed to read input features")
             
         # check output files if necessary
-        if self.m_output_dirs:
+        if self.m_output_dirs and not self.m_force_skip_scanning:
             for tmp_d, tmp_e in zip(self.m_output_dirs, \
                                     self.m_output_exts):
                 tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e, 
diff --git a/core_scripts/data_io/wav_tools.py b/core_scripts/data_io/wav_tools.py
@@ -17,6 +17,7 @@
 import os
 import sys
 import numpy as np
+import wave
 import scipy.io.wavfile
 try:
     import soundfile
@@ -206,6 +207,20 @@ def flacReadAsFloat(wavFileIn):
     return sr, x
 
 
+def readWaveLength(wavFileIn):
+    """ length = readWaveLength(wavFileIn)
+    Read the length of the waveform
+
+    Input: 
+             wavFileIn, str, path to the input waveform
+    Return: 
+             length, int, length of waveform
+    """
+    with wave.open(wavFileIn, 'rb') as file_ptr:
+        wavlength = file_ptr.getnframes()
+    return wavlength
+
+
 def buffering(x, n, p=0, opt=None):
     """buffering(x, n, p=0, opt=None)
     input
@@ -277,14 +292,16 @@ def silence_handler(wav, sr, fl=320, fs=80,
                     shortest_len_in_ms=50,
                     flag_output=0, 
                     flag_norm_amp=True,
-                    flag_only_startend_sil=False):
+                    flag_only_startend_sil = False,
+                    opt_silence_handler = -1):
     """silence_handler(wav, sr, fl=320, fs=80,
                     max_thres_below=30, 
                     min_thres=-55, 
                     shortest_len_in_ms=50,
                     flag_output=0, 
                     flag_norm_amp=True,
-                    flag_only_startend_sil=False)
+                    flag_only_startend_sil = False,
+                    opt_silence_handler = -1)
     
     Based on the Speech activity detector mentioned in Sec5.1 of
     Tomi Kinnunen, and Haizhou Li. 
@@ -311,10 +328,16 @@ def silence_handler(wav, sr, fl=320, fs=80,
           segment less than this length is treated as speech
       flag_norm_amp: bool, whether normalize the waveform amplitude
           based on window function (default True)
-      flag_only_startend_sil: bool, whether only consider silence in 
+      flag_only_startend_sil (obsolete): bool, whether only consider silence in 
           the begining and end. If False, silence within the utterance
           will be marked / removed (default False)
 
+      opt_silence_handler:  int, option to silence trim handler
+          0: equivalent to flag_only_startend_sil = False
+          1: equivalent to flag_only_startend_sil = True
+          2: remove only silence between words
+         -1: not use this option, but follow flag_only_startend_sil
+
     output
     ------
       wav_no_sil: np.array, (length_1, ), waveform after removing silence
@@ -373,9 +396,26 @@ def ignore_short_seg(frame_tag, seg_len_thres):
     # remove short nonsil segments
     frame_process_all = ignore_short_seg(frame_process_sil, seg_len_thres)    
     frame_tag = frame_process_all
+    
 
-    # if only consder silence in the front and end
-    if flag_only_startend_sil:
+    if opt_silence_handler < 0:
+        # if only consider silence in the front and end
+        if flag_only_startend_sil:
+            tmp_nonzero = np.flatnonzero(frame_tag)
+        
+            # start of the first nonsil segment
+            #start_nonsil = np.asarray(frame_tag == 1).nonzero()[0]
+            if np.any(tmp_nonzero):
+                start_nonsil = np.flatnonzero(frame_tag)[0]
+                # end of the last nonsil segment
+                end_nonsil = np.flatnonzero(frame_tag)[-1]
+                # all segments between are switched to nonsil
+                frame_tag[start_nonsil:end_nonsil] = 1
+            else:
+                # no non-silence data, just let it pass
+                pass
+    elif opt_silence_handler == 1:
+        # if only consider silence in the front and end
         tmp_nonzero = np.flatnonzero(frame_tag)
         
         # start of the first nonsil segment
@@ -389,7 +429,24 @@ def ignore_short_seg(frame_tag, seg_len_thres):
         else:
             # no non-silence data, just let it pass
             pass
-            
+    elif opt_silence_handler == 2:
+        # keep silence in the front and end; only silence between words is removed
+        tmp_nonzero = np.flatnonzero(frame_tag)
+        
+        # start of the first nonsil segment
+        #start_nonsil = np.asarray(frame_tag == 1).nonzero()[0]
+        if np.any(tmp_nonzero):
+            start_nonsil = np.flatnonzero(frame_tag)[0]
+            # end of the last nonsil segment
+            end_nonsil = np.flatnonzero(frame_tag)[-1]
+            # leading and trailing segments are switched to nonsil
+            frame_tag[:start_nonsil] = 1
+            frame_tag[end_nonsil:] = 1
+        else:
+            # no non-silence data, just let it pass
+            pass
+    else:
+        pass
         
 
     # separate non-speech and speech segments
diff --git a/project/05-nn-vocoders/README.md b/project/05-nn-vocoders/README.md
@@ -7,6 +7,8 @@ This project is Pytorch re-implementation of a few neural waveform models.
 
 * Note that the tutorial **chapter_a3_pretrained_vocoders.ipynb** includes pre-trained HiFiGAN and WaveGlow on VoxCeleb2 dev and other speech datasets [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1xObWejhqcdSxFAjfWI7sudwPPMoCx-vA?usp=sharing).
 
+* The code to extract the input Mel-spectrogram and F0 is included in the above tutorial and notebooks as well. This folder assumes that the input Mel-spectrogram and F0 have been prepared in advance.
+
 **It is better to check the tutorials before diving into this project**.
 
 ## Quick start
diff --git a/project/05-nn-vocoders/ilpcnet/block_lpcnet.py b/project/05-nn-vocoders/ilpcnet/block_lpcnet.py
@@ -621,7 +621,8 @@ def _convert_pitch(self, pitch_value):
         ------
           output: tensor in int64, quantized pitch
         """
-        return torch.clamp((pitch_value - 33) // 2, 0, 256).to(torch.int64)
+        return torch.clamp((pitch_value - 33) // 2, 0, 
+                           self.m_pitch_cat-1).to(torch.int64)
 
     
     def forward(self, cond_feat, cond_feat_normed, 
diff --git a/sandbox/block_rawnet.py b/sandbox/block_rawnet.py
@@ -380,12 +380,18 @@ def _compute_score(self, emb, inference=True):
           
         Score here refers to 
         """
+        # we should not use logsoftmax if we will use CrossEntropyLoss
+        flag_logsoftmax = False
+
         if inference:
             # no softmax
             return self.m_output(emb)
-        else:
+        elif flag_logsoftmax:
             # Logsoftmax for training loss
+            # this is used when the training criterion is NLLLoss
             return self.logsoftmax(self.m_output(emb))
+        else:
+            return self.m_output(emb)
     
     def forward(self, x):
         """
diff --git a/sandbox/eval_asvspoof.py b/sandbox/eval_asvspoof.py
@@ -577,6 +577,84 @@ def tDCF_wrapper(bonafide_cm_scores, spoof_cm_scores,
     return min_tDCF, eer_cm, eer_threshold
 
 
+def tDCF_wrapper2(bonafide_score_cm, spoof_score_cm, C0, C1, C2):
+    """ mintDCF, eer = tDCF_wrapper2(bonafide_score_cm, 
+                                    spoof_score_cm, C0, C1, C2)
+    
+    compute_tDCF can be factorized into two parts: 
+    C012 computation and min t-DCF computation.
+
+    This is for min t-DCF computation, given the values of C012
+    
+    input
+    -----
+      bonafide_score_cm  np.array, score of bonafide data
+      spoof_score_cm     np.array, score of spoofed data
+      C0                 scalar, coefficient for min tDCF computation
+      C1                 scalar, coefficient for min tDCF computation
+      C2                 scalar, coefficient for min tDCF computation
+    
+    output
+    ------
+      mintDCF            scalar, value of min tDCF
+      eer                scalar, value of EER
+
+    For C0, C1, C2, see Appendix Eqs.(1-2) in evaluation plan [2],
+    or Eqs.(10-11) in [1]
+
+    References:
+
+      [1] T. Kinnunen, H. Delgado, N. Evans, K.-A. Lee, V. Vestman,
+          A. Nautsch, M. Todisco, X. Wang, M. Sahidullah, J. Yamagishi,
+          and D.-A. Reynolds, "Tandem Assessment of Spoofing Countermeasures
+          and Automatic Speaker Verification: Fundamentals," IEEE/ACM Transactions on
+          Audio, Speech, and Language Processing (TASLP).
+
+      [2] ASVspoof 2019 challenge evaluation plan
+          https://www.asvspoof.org/asvspoof2019/asvspoof2019_evaluation_plan.pdf
+
+    """
+    # Sanity check of scores
+    combined_scores = np.concatenate((bonafide_score_cm, spoof_score_cm))
+    if np.isnan(combined_scores).any() or np.isinf(combined_scores).any():
+        sys.exit('ERROR: Your scores contain nan or inf.')
+
+    # Sanity check that inputs are scores and not decisions
+    n_uniq = np.unique(combined_scores).size
+    if n_uniq < 3:
+        sys.exit('ERROR: You should provide soft CM scores - not binary decisions')
+
+    # Obtain miss and false alarm rates of CM
+    Pmiss_cm, Pfa_cm, CM_thresholds = compute_det_curve(
+        bonafide_score_cm, spoof_score_cm)
+    
+    # =====
+    # tDCF
+    # =====
+    if np.isnan(C0) or np.isnan(C1) or np.isnan(C2): 
+        # at least one coefficient is NaN, so min t-DCF is undefined
+        mintDCF = np.nan
+    else:
+        # tDCF values
+        tDCF = C0 + C1 * Pmiss_cm + C2 * Pfa_cm
+        # Obtain default t-DCF
+        tDCF_default = C0 + np.minimum(C1, C2)
+        # Normalized t-DCF
+        tDCF_norm = tDCF / tDCF_default
+        # min t-DCF
+        mintDCF = tDCF_norm[tDCF_norm.argmin()]
+
+    # ====
+    # EER
+    # ====
+    abs_diffs = np.abs(Pmiss_cm - Pfa_cm)
+    min_index = np.argmin(abs_diffs)
+    eer = np.mean((Pmiss_cm[min_index], Pfa_cm[min_index]))
+
+    return mintDCF, eer
+
+
+
 def ASVspoof2019_evaluate(bonafide_cm_scores, bonafide_cm_file_names,
                           spoof_cm_scores, spoof_cm_file_names, verbose=False,
                           protocol_alternative=None):
diff --git a/sandbox/util_loss_metric.py b/sandbox/util_loss_metric.py
@@ -462,7 +462,7 @@ def rank_consistency_v3(x, metric = None):
       >> rank_consistency_v3(x, metric)
       tensor(.0)
     """
-    
+    # batch size
     bs = x.shape[0]
 
     # loss to be accumulated