# lr_speech.py
import argparse
import glob
import math
import random
from math import exp, log

import librosa
import numpy as np
import soundfile
import torch
import torch.nn as nn
from numpy import zeros, sign
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1701)

class SpeechDataset(Dataset):
def __init__(self, data):
self.n_samples, self.n_features = data.shape
# The first column is label, the rest are the features
self.n_features -= 1
self.feature = torch.from_numpy(data[:, 1:].astype(np.float32)) # size [n_samples, n_features]
self.label = torch.from_numpy(data[:, [0]].astype(np.float32)) # size [n_samples, 1]
# support indexing such that dataset[i] can be used to get i-th sample
def __getitem__(self, index):
return self.feature[index], self.label[index]
# we can call len(dataset) to return the size
def __len__(self):
return self.n_samples
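
# For example, given a NumPy array `data` whose first column is a 0/1 label:
#   ds = SpeechDataset(data)
#   x0, y0 = ds[0]   # first (feature, label) pair as float32 tensors
#   len(ds)          # number of rows in the original array
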
def list_files(directory, vowels):
    """
    Takes in a directory location of the Hillenbrand data and a list of vowels;
    returns a dictionary mapping each vowel to a list of its sound files.
    """
    soundfile_dict = {}
for vowel in vowels:
soundfile_dict[vowel] = glob.glob(directory+'/*/*'+vowel+'.wav')
return soundfile_dict
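
# For example (hypothetical paths, assuming Hillenbrand-style 'hVd' file names):
#   list_files('./Hillenbrand', ['ih', 'eh'])
#   -> {'ih': ['./Hillenbrand/men/m01ih.wav', ...],
#       'eh': ['./Hillenbrand/men/m01eh.wav', ...]}
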
def create_dataset(soundfile_dict, vowels, num_mfccs):
"""
Read in wav files, and return a 2-D numpy array that contains your
speech dataset.
:param soundfile_dict: A dictionary that, for each vowel V, contains a list of file
paths corresponding to recordings of the utterance 'hVd'
:param vowels: The set of vowels to be used in the logistic regression
:param num_mfccs: The number of MFCCs to include as features
"""
    dataset = zeros((len(soundfile_dict[vowels[0]]) + len(soundfile_dict[vowels[1]]),
                     num_mfccs + 1))
    # The steps below sketch one possible completion of the original TODO:
    #
    # 1. Extract MFCCs for every wav file in soundfile_dict and keep the
    #    midpoint frame of each utterance as its feature vector.
    #
    # 2. If an utterance has an even number of frames, take the second of
    #    the two midpoint frames.
    #
    # 3. z-score each feature, using the column mean and the column st. dev.
    #
    # The result: the first element in each row is the label (0 for the
    # first element of 'vowels', 1 for the second) and the next num_mfccs
    # elements in each row are z-scored MFCCs.
    row = 0
    for label, vowel in enumerate(vowels):
        for filename in soundfile_dict[vowel]:
            utterance, _ = librosa.load(filename, sr=16000)
            mfccs = librosa.feature.mfcc(y=utterance, sr=16000, n_mfcc=num_mfccs,
                                         n_fft=512, win_length=400, hop_length=160)
            # Floor division picks the second of the two middle frames when
            # the frame count is even.
            dataset[row, 0] = label
            dataset[row, 1:] = mfccs[:, mfccs.shape[1] // 2]
            row += 1
    # z-score each feature column (zero mean, unit st. dev.)
    dataset[:, 1:] = (dataset[:, 1:] - dataset[:, 1:].mean(axis=0)) / dataset[:, 1:].std(axis=0)
    return dataset
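
# Midpoint check: with 7 frames (indices 0-6), 7 // 2 = 3 is the middle frame;
# with 8 frames (indices 0-7), 8 // 2 = 4 is the second of the two middle frames.
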
class SimpleLogreg(nn.Module):
def __init__(self, num_features):
"""
Initialize the parameters you'll need for the model.
:param num_features: The number of features in the linear model
"""
super(SimpleLogreg, self).__init__()
        # One possible completion of the original TODO: a single linear layer
        # mapping num_features inputs to one output score.
        self.linear = nn.Linear(num_features, 1)
def forward(self, x):
"""
Compute the model prediction for an example.
:param x: Example to evaluate
"""
        # Sigmoid of the linear score gives P(label = 1 | x).
        return torch.sigmoid(self.linear(x))
def evaluate(self, data):
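        """Report accuracy over an entire SpeechDataset (gradients disabled)."""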
with torch.no_grad():
y_predicted = self(data.feature)
y_predicted_cls = y_predicted.round()
acc = y_predicted_cls.eq(data.label).sum() / float(data.label.shape[0])
return acc
def step(epoch, ex, model, optimizer, criterion, inputs, labels):
    """Take a single step of the optimizer. This is factored into its own
    function so that it can be tested in isolation.
    :param epoch: The current epoch
    :param ex: Which example / minibatch you're on
    :param model: The model you're optimizing
    :param optimizer: The optimizer that updates the model's parameters
    :param criterion: The loss function
    :param inputs: The current set of inputs
    :param labels: The labels for those inputs
    """
    # One possible completion of the original TODO:
    optimizer.zero_grad()                  # clear gradients from the last step
    y_predicted = model(inputs)            # A) get predictions
    loss = criterion(y_predicted, labels)  # B) compute the loss from that prediction
    loss.backward()                        # C) backprop
    optimizer.step()                       # D) update the parameters
# There's additional code to print updates (for good software
# engineering practices, this should probably be logging, but
# printing is good enough for a homework).
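    # Note: `train`, `test`, and `num_epochs` are module-level names defined
    # under __main__ below, so this function only works when run as a script.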
if (ex+1) % 20 == 0:
acc_train = model.evaluate(train)
acc_test = model.evaluate(test)
print(f'Epoch: {epoch+1}/{num_epochs}, Example {ex}, loss = {loss.item():.4f}, train_acc = {acc_train.item():.4f} test_acc = {acc_test.item():.4f}')
if __name__ == "__main__":
argparser = argparse.ArgumentParser()
argparser.add_argument("--vowels", help="The two vowels to be classified, separated by a comma",
type=str, default="ih,eh")
argparser.add_argument("--directory", help="Main directory for the speech files",
type=str, default="./Hillenbrand")
argparser.add_argument("--num_mfccs", help="Number of MFCCs to use",
type=int, default=13)
argparser.add_argument("--passes", help="Number of passes through train",
type=int, default=5)
argparser.add_argument("--batch", help="Number of items in each batch",
type=int, default=1)
argparser.add_argument("--learnrate", help="Learning rate for SGD",
type=float, default=0.1)
args = argparser.parse_args()
directory = args.directory
num_mfccs = args.num_mfccs
vowels = args.vowels.split(',')
# Vowels in the dataset (we're only using a subset):
# ae, ah, aw, eh, ei, er, ih, iy, oa, oo, uh, uw
files = list_files(directory, vowels)
speechdata = create_dataset(files, vowels, num_mfccs)
train_np, test_np = train_test_split(speechdata, test_size=0.15, random_state=1234)
train, test = SpeechDataset(train_np), SpeechDataset(test_np)
print("Read in %i train and %i test" % (len(train), len(test)))
# Initialize model
logreg = SimpleLogreg(train.n_features)
num_epochs = args.passes
batch = args.batch
total_samples = len(train)
    # One reasonable choice (replacing the original placeholders): binary
    # cross-entropy on the sigmoid outputs, optimized with vanilla SGD.
    # nn.BCEWithLogitsLoss would also work if forward returned raw scores.
    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(logreg.parameters(), lr=args.learnrate)
train_loader = DataLoader(dataset=train,
batch_size=batch,
shuffle=True,
num_workers=0)
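    # Each iteration over the loader yields (inputs, labels) with shapes
    # [batch, n_features] and [batch, 1] respectively.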
# Iterations
for epoch in range(num_epochs):
for ex, (inputs, labels) in enumerate(train_loader):
# Run your training process
step(epoch, ex, logreg, optimizer, criterion, inputs, labels)
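
# Example invocation (assuming the Hillenbrand wav files are under ./Hillenbrand):
#   python lr_speech.py --vowels ih,eh --num_mfccs 13 --passes 5 --batch 1 --learnrate 0.1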