diff --git a/README.md b/README.md index 9282b99..a1a0d3d 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,7 @@ Download the code. The code is based on signal processing package in Python call Dependencies: Run these lines in a terminal to install everything necessary for feature extraction. ``` -sudo apt-get install python-numpy python-scipy python-nose python-pip - -sudo pip install scikits.talkbox +sudo apt-get install python3-numpy python3-scipy python3-nose ``` Next for the installation of Torch for loading the models run this. ``` @@ -37,30 +35,31 @@ cd ~/torch; bash install-deps; ./install.sh ``` ``` -luarocks install rnn +git clone https://github.com/Element-Research/rnn.git old-rnn +cd old-rnn; luarocks make rocks/rnn-scm-1.rockspec ``` -The Estimation model can be downloaded here and because of size constraints the Tracking model can be abtained by download from this link -[tracking_model.mat] (https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc) +The Estimation model can be downloaded here and because of size constraints the Tracking model can be obtained by download from this link: +[tracking_model.mat](https://drive.google.com/open?id=0Bxkc5_D0JjpiZWx4eTU1d0hsVXc) ## How to use: For vowel formant estimation, call the main script in a terminal with the following inputs: wav file, formant output filename, and the vowel begin and end times: ``` -python formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3 +python3 formants.py data/Example.wav data/ExamplePredictions.csv --begin 1.2 --end 1.3 ``` or the vowel begin and end times can be taken from a TextGrid file (here the name of the TextGrid is Example.TextGrid and the vowel is taken from a tier called "VOWEL"): ``` -python formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \ +python3 formants.py data/Example.wav data/examplePredictions.csv --textgrid_filename data/Example.TextGrid \ --textgrid_tier VOWEL ``` For formant tracking, just call the script with the wav file and output filename: ``` -python formants.py data/Example.wav data/ExamplePredictions.csv +python3 formants.py data/Example.wav data/ExamplePredictions.csv ``` diff --git a/extract_features.py b/extract_features.py index 033e91b..3294130 100644 --- a/extract_features.py +++ b/extract_features.py @@ -9,9 +9,9 @@ import math from scipy.fftpack.realtransforms import dct from scipy.signal import lfilter, hamming -from copy import deepcopy from scipy.fftpack import fft, ifft -from scikits.talkbox.linpred import lpc +#from scikits.talkbox.linpred import lpc # obsolete +from helpers.conch_lpc import lpc import shutil from helpers.utilities import * @@ -88,9 +88,9 @@ def periodogram(x, nfft=None, fs=1): pxx = np.abs(fft(x, nfft)) ** 2 if nfft % 2 == 0: - pn = nfft / 2 + 1 + pn = nfft // 2 + 1 else: - pn = (nfft + 1 )/ 2 + pn = (nfft + 1) // 2 fgrid = np.linspace(0, fs * 0.5, pn) return pxx[:pn] / (n * fs), fgrid @@ -137,9 +137,9 @@ def arspec(x, order, nfft=None, fs=1): # This is not enough to deal correctly with even/odd size if nfft % 2 == 0: - pn = nfft / 2 + 1 + pn = nfft // 2 + 1 else: - pn = (nfft + 1 )/ 2 + pn = (nfft + 1) // 2 px = 1 / np.fft.fft(a, nfft)[:pn] pxx = np.real(np.conj(px) * px) @@ -200,7 +200,6 @@ def preemp(input, p): def arspecs(input_wav,order,Atal=False): - epsilon = 0.0000000001 data = input_wav if Atal: ar = atal(data, order, 30) @@ -211,8 +210,10 @@ def arspecs(input_wav,order,Atal=False): for k, l in zip(ars[0], ars[1]): ar.append(math.log(math.sqrt((k**2)+(l**2)))) for val in range(0,len(ar)): - if ar[val] == 0.0: - ar[val] = deepcopy(epsilon) + if ar[val] < 0.0: + ar[val] = np.nan + elif ar[val] == 0.0: + ar[val] = epsilon mspec1 = np.log10(ar) # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain) ar = dct(mspec1, type=2, norm='ortho', axis=-1) @@ -221,10 +222,10 @@ def arspecs(input_wav,order,Atal=False): def specPS(input_wav,pitch): N = len(input_wav) - samps = N/pitch + samps = N // pitch if samps == 0: samps = 1 - frames = N/samps + frames = N // samps data = input_wav[0:frames] specs = periodogram(data,nfft=4096) for i in range(1,int(samps)): @@ -236,10 +237,11 @@ def specPS(input_wav,pitch): specs[0][s] /= float(samps) peri = [] for k, l in zip(specs[0], specs[1]): - if k == 0 and l == 0: - peri.append(epsilon) - else: - peri.append(math.log(math.sqrt((k ** 2) + (l ** 2)))) + m = math.sqrt((k ** 2) + (l ** 2)) + if m > 0: m = math.log(m) + if m == 0: m = epsilon + elif m < 0: m = np.nan + peri.append(m) # Filter the spectrum through the triangle filterbank mspec = np.log10(peri) # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain) diff --git a/formants.py b/formants.py index b963de0..db5e1ff 100644 --- a/formants.py +++ b/formants.py @@ -9,19 +9,19 @@ def predict_from_times(wav_filename, preds_filename, begin, end): tmp_features_filename = tempfile._get_default_tempdir() + "/" + next(tempfile._get_candidate_names()) + ".txt" - print tmp_features_filename + print(tmp_features_filename) if begin > 0.0 or end > 0.0: features.create_features(wav_filename, tmp_features_filename, begin, end) - easy_call("th load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename) + easy_call("luajit load_estimation_model.lua " + tmp_features_filename + ' ' + preds_filename) else: features.create_features(wav_filename, tmp_features_filename) - easy_call("th load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename) + easy_call("luajit load_tracking_model.lua " + tmp_features_filename + ' ' + preds_filename) def predict_from_textgrid(wav_filename, preds_filename, textgrid_filename, textgrid_tier): - print wav_filename + print(wav_filename) if os.path.exists(preds_filename): os.remove(preds_filename) diff --git a/formants.sh b/formants.sh index 885c834..d7d37e7 100755 --- a/formants.sh +++ b/formants.sh @@ -4,12 +4,12 @@ if [ $# -eq 2 ] then tempfile=`mktemp -t txt` python extract_features.py $1 $tempfile - th load_estimation_model.lua $tempfile $2 + luajit load_estimation_model.lua $tempfile $2 elif [ $# -eq 4 ] then tempfile=`mktemp -t txt` python extract_features.py $1 $tempfile --begin $3 --end $4 - th load_estimation_model.lua $tempfile $2 + luajit load_estimation_model.lua $tempfile $2 else echo "$0 wav_filename pred_csv_filename [begin_time end_time]" fi diff --git a/helpers/conch_lpc.py b/helpers/conch_lpc.py new file mode 100644 index 0000000..1e02cf4 --- /dev/null +++ b/helpers/conch_lpc.py @@ -0,0 +1,286 @@ +# This file has been copied (with minor changes) from Michael +# McAuliffe's Conch project, to provide a compatible replacement +# implementation of the lpc() function from the obsolete Python-2-only +# scikits.talkbox library. +# +# Conch repository: https://github.com/mmcauliffe/Conch-sounds +# Source: https://github.com/mmcauliffe/Conch-sounds/blob/master/conch/analysis/formants/lpc.py + +# Copyright (c) 2015 Michael McAuliffe +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +#import librosa +import numpy as np +import scipy as sp +from scipy.signal import lfilter + +from scipy.fftpack import fft, ifft +from scipy.signal import gaussian + +#from ..helper import nextpow2 +#from ..functions import BaseAnalysisFunction + +# Source: https://github.com/mmcauliffe/Conch-sounds/blob/master/conch/analysis/helper.py +def nextpow2(x): + """Return the first integer N such that 2**N >= abs(x)""" + return np.ceil(np.log2(np.abs(x))) + +def lpc_ref(signal, order): + """Compute the Linear Prediction Coefficients. + + Return the order + 1 LPC coefficients for the signal. c = lpc(x, k) will + find the k+1 coefficients of a k order linear filter: + + xp[n] = -c[1] * x[n-2] - ... - c[k-1] * x[n-k-1] + + Such as the sum of the squared-error e[i] = xp[i] - x[i] is minimized. + + Parameters + ---------- + signal: array_like + input signal + order : int + LPC order (the output will have order + 1 items) + + Notes + ---- + This is just for reference, as it is using the direct inversion of the + toeplitz matrix, which is really slow""" + if signal.ndim > 1: + raise ValueError("Array of rank > 1 not supported yet") + if order > signal.size: + raise ValueError("Input signal must have a lenght >= lpc order") + + if order > 0: + p = order + 1 + r = np.zeros(p, 'float32') + # Number of non zero values in autocorrelation one needs for p LPC + # coefficients + nx = np.min([p, signal.size]) + x = np.correlate(signal, signal, 'full') + r[:nx] = x[signal.size - 1:signal.size + order] + phi = np.dot(sp.linalg.inv(sp.linalg.toeplitz(r[:-1])), -r[1:]) + return np.concatenate(([1.], phi)) + else: + return np.ones(1, dtype='float32') + + +# @jit +def levinson_1d(r, order): + """Levinson-Durbin recursion, to efficiently solve symmetric linear systems + with toeplitz structure. + + Parameters + --------- + r : array-like + input array to invert (since the matrix is symmetric Toeplitz, the + corresponding pxp matrix is defined by p items only). Generally the + autocorrelation of the signal for linear prediction coefficients + estimation. The first item must be a non zero real. + + Notes + ---- + This implementation is in python, hence unsuitable for any serious + computation. Use it as educational and reference purpose only. + + Levinson is a well-known algorithm to solve the Hermitian toeplitz + equation: + + _ _ + -R[1] = R[0] R[1] ... R[p-1] a[1] + : : : : * : + : : : _ * : + -R[p] = R[p-1] R[p-2] ... R[0] a[p] + _ + with respect to a ( is the complex conjugate). Using the special symmetry + in the matrix, the inversion can be done in O(p^2) instead of O(p^3). + """ + r = np.atleast_1d(r) + if r.ndim > 1: + raise ValueError("Only rank 1 are supported for now.") + + n = r.size + if n < 1: + raise ValueError("Cannot operate on empty array !") + elif order > n - 1: + raise ValueError("Order should be <= size-1") + + if not np.isreal(r[0]): + raise ValueError("First item of input must be real.") + elif not np.isfinite(1 / r[0]): + raise ValueError("First item should be != 0") + + # Estimated coefficients + a = np.empty(order + 1, 'float32') + # temporary array + t = np.empty(order + 1, 'float32') + # Reflection coefficients + k = np.empty(order, 'float32') + + a[0] = 1. + e = r[0] + + for i in range(1, order + 1): + acc = r[i] + for j in range(1, i): + acc += a[j] * r[i - j] + k[i - 1] = -acc / e + a[i] = k[i - 1] + + for j in range(order): + t[j] = a[j] + + for j in range(1, i): + a[j] += k[i - 1] * np.conj(t[i - j]) + + e *= 1 - k[i - 1] * np.conj(k[i - 1]) + + return a, e, k + + +# @jit +def _acorr_last_axis(x, nfft, maxlag): + a = np.real(ifft(np.abs(fft(x, n=nfft) ** 2))) + return a[..., :maxlag + 1] / x.shape[-1] + + +# @jit +def acorr_lpc(x, axis=-1): + """Compute autocorrelation of x along the given axis. + + This compute the biased autocorrelation estimator (divided by the size of + input signal) + + Notes + ----- + The reason why we do not use acorr directly is for speed issue.""" + if not np.isrealobj(x): + raise ValueError("Complex input not supported yet") + + maxlag = x.shape[axis] + nfft = int(2 ** nextpow2(2 * maxlag - 1)) + + if axis != -1: + x = np.swapaxes(x, -1, axis) + a = _acorr_last_axis(x, nfft, maxlag) + if axis != -1: + a = np.swapaxes(a, -1, axis) + return a + + +# @jit +def lpc(signal, order, axis=-1): + """Compute the Linear Prediction Coefficients. + + Return the order + 1 LPC coefficients for the signal. c = lpc(x, k) will + find the k+1 coefficients of a k order linear filter: + + xp[n] = -c[1] * x[n-2] - ... - c[k-1] * x[n-k-1] + + Such as the sum of the squared-error e[i] = xp[i] - x[i] is minimized. + + Parameters + ---------- + signal: array_like + input signal + order : int + LPC order (the output will have order + 1 items) + + Returns + ------- + a : array-like + the solution of the inversion. + e : array-like + the prediction error. + k : array-like + reflection coefficients. + + Notes + ----- + This uses Levinson-Durbin recursion for the autocorrelation matrix + inversion, and fft for the autocorrelation computation. + + For small order, particularly if order << signal size, direct computation + of the autocorrelation is faster: use levinson and correlate in this case.""" + n = signal.shape[axis] + if order > n: + raise ValueError("Input signal must have length >= order") + + r = acorr_lpc(signal, axis) + return levinson_1d(r, order) + + +def process_frame(X, window, num_formants, new_sr): + X = X * window + A, e, k = lpc(X, num_formants * 2) + + rts = np.roots(A) + rts = rts[np.where(np.imag(rts) >= 0)] + angz = np.arctan2(np.imag(rts), np.real(rts)) + frqs = angz * (new_sr / (2 * np.pi)) + frq_inds = np.argsort(frqs) + frqs = frqs[frq_inds] + bw = -1 / 2 * (new_sr / (2 * np.pi)) * np.log(np.abs(rts[frq_inds])) + return frqs, bw + + +def lpc_formants(signal, sr, num_formants, max_freq, time_step, + win_len, window_shape='gaussian'): + output = {} + new_sr = 2 * max_freq + alpha = np.exp(-2 * np.pi * 50 * (1 / new_sr)) + proc = lfilter([1., -alpha], 1, signal) + if sr > new_sr: + proc = librosa.resample(proc, sr, new_sr) + nperseg = int(win_len * new_sr) + nperstep = int(time_step * new_sr) + if window_shape == 'gaussian': + window = gaussian(nperseg + 2, 0.45 * (nperseg - 1) / 2)[1:nperseg + 1] + else: + window = np.hanning(nperseg + 2)[1:nperseg + 1] + indices = np.arange(int(nperseg / 2), proc.shape[0] - int(nperseg / 2) + 1, nperstep) + num_frames = len(indices) + for i in range(num_frames): + if nperseg % 2 != 0: + X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2) + 1] + else: + X = proc[indices[i] - int(nperseg / 2):indices[i] + int(nperseg / 2)] + frqs, bw = process_frame(X, window, num_formants, new_sr) + formants = [] + for j, f in enumerate(frqs): + if f < 50: + continue + if f > max_freq - 50: + continue + formants.append((np.asscalar(f), np.asscalar(bw[j]))) + missing = num_formants - len(formants) + if missing: + formants += [(None, None)] * missing + output[indices[i] / new_sr] = formants + return output + + +#class FormantTrackFunction(BaseAnalysisFunction): +# def __init__(self, num_formants=5, max_frequency=5000, +# time_step=0.01, window_length=0.025, window_shape='gaussian'): +# super(FormantTrackFunction, self).__init__() +# self.arguments = [num_formants, max_frequency, time_step, window_length, window_shape] +# self._function = lpc_formants +# self.requires_file = False