import torch.nn as nn
import torch
from layers import ParalingExtractor, TemporalExtractor, BayesMLP
import utils

##### Number of parameters = 1,643,110
class UncertaintyModel(nn.Module):
    def __init__(self, nout=2, ninp_lstm=320, nhidden_lstm=256, nlstm=2, dropout=0.5, uncertainty_samples=30, bbb_nsegments=50):
        """ Builds the uncertainty model for label-uncertainty-aware Speech Emotion Recognition.

        The model was introduced in the following papers:

        Navin Raj Prabhu, Guillaume Carbajal, Nale Lehmann-Willenbrock, Timo Gerkmann,
        "End-To-End Label Uncertainty Modeling for Speech-based Arousal Recognition Using Bayesian Neural Networks",
        Interspeech, Incheon, Korea, Sep. 2022. https://arxiv.org/abs/2110.03299

        and

        Navin Raj Prabhu, Nale Lehmann-Willenbrock, Timo Gerkmann,
        "Label Uncertainty Modeling and Prediction for Speech Emotion Recognition using t-Distributions",
        Affective Computing and Intelligent Interaction (ACII), Nara, Japan, Oct. 2022. https://arxiv.org/abs/2207.12135

        Args:
            nout (int, optional): The emotion output dimension.
                Defaults to 2, for arousal and valence prediction.
            ninp_lstm (int, optional): Input dimension of the LSTM-based TemporalExtractor layer.
                Defaults to 320.
            nhidden_lstm (int, optional): Output dimension of the TemporalExtractor, i.e. the number of temporal features.
                Defaults to 256.
            nlstm (int, optional): Number of stacked LSTM layers.
                Defaults to 2.
            dropout (float, optional): Dropout probability on the feature-extraction layers.
                Defaults to 0.5.
            uncertainty_samples (int, optional): Number of forward passes in the uncertainty layer, for stochastic outputs.
                Defaults to 30, so that the output distribution converges towards a Gaussian.
            bbb_nsegments (int, optional): Granularity of the dynamic uncertainty and stochastic weights.
                Defaults to 50, i.e. new weights are sampled every 2 s (40 ms * 50 = 2 s).
        """
        super().__init__()
        self.nout = nout
        self.uncertaintySamples = uncertainty_samples
        post_mu_init = utils.get_posterior_mu_init_range()
        post_rho_init = utils.get_posterior_rho_init_range()
        self.paralinguisticExtractor = ParalingExtractor(dropout=dropout)
        self.temporalExtractor = TemporalExtractor(ninp=ninp_lstm, nhidden=nhidden_lstm, nlstm=nlstm, dropout=dropout)
        self.uncertaintyLayer = BayesMLP(ninp=nhidden_lstm, nout=nout, bbb_nsegments=bbb_nsegments,
                                         post_mu_init=post_mu_init, post_rho_init=post_rho_init)

    def sample_uncertainty_predictions(self, x):
        """ The uncertainty model's forward-pass function.

        Use this function as the forward pass when training or testing the model.
        It calls the vanilla forward() and additionally performs stochastic output sampling.

        Args:
            x (torch.Tensor): Input features. For a batch size of 25 and a segment size of 300
                (40 ms * 300 = 12 s), the shape is torch.Size([25, 300, 320]).

        Returns:
            outputs: All stochastic outputs; dimension 0 holds the uncertainty samples. Example shape: torch.Size([uncertaintySamples, 25*300, 2]).
            outputs_mean: Mean (m_t) of the stochastic outputs. Example shape: torch.Size([25*300, 2]).
            outputs_std: Standard deviation (s_t) of the stochastic outputs. Example shape: torch.Size([25*300, 2]).
            log_post: Mean log posterior of the BayesMLP weights (summed across all MLP layers), averaged over the uncertaintySamples passes.
            log_prior: Mean log prior of the BayesMLP weights (summed across all MLP layers), averaged over the uncertaintySamples passes.
            outs_meanw: Mean emotion predictions (m_t) using the mu of the Gaussian weights and biases, reducing the randomization effect of sampling.
        """
        # We calculate the variables needed for the negative ELBO loss, which is one of the elements of the loss function.
        # Initialize the uncertainty tensors on the input's device.
        outputs = torch.zeros(self.uncertaintySamples, x.shape[0] * x.shape[1], self.nout, device=x.device)
        log_priors = torch.zeros(self.uncertaintySamples, device=x.device)
        log_posts = torch.zeros(self.uncertaintySamples, device=x.device)
        # Extract features from 'x' - reuse the Conv1D and LSTM code, where no weight uncertainty exists and hence no sampling is required.
        feat_x = self(x, mode="feat")
        # Make multiple predictions using the uncertainty layer's MLP alone,
        # and calculate the prior, posterior, and likelihood for the given number of samples.
        for i in range(self.uncertaintySamples):
            # The input to the uncertainty layer also controls the granularity of the uncertainty.
            outputs[i] = self.uncertaintyLayer(feat_x, self.training, True).reshape(x.shape[0] * x.shape[1], self.nout)  # make predictions
            log_priors[i] = self.uncertaintyLayer.log_prior()  # get log prior
            log_posts[i] = self.uncertaintyLayer.log_post()  # get log variational posterior
        # Predictions from the mu of the Gaussian weights and biases;
        # the sampling and training flags are set to False.
        outs_meanw = self.uncertaintyLayer(feat_x, False, False).reshape(x.shape[0] * x.shape[1], self.nout)
        # Below, x.shape[0]*x.shape[1] = number of samples tested.
        num_samples = outputs.shape[1]
        # Calculate the Monte Carlo estimates of the prior and posterior.
        log_prior = log_priors.mean() / num_samples
        log_post = log_posts.mean() / num_samples
        # Calculate the mean and std of the outputs (across the uncertaintySamples).
        outputs_mean = outputs.mean(0)
        outputs_std = outputs.std(0)
        return outputs, outputs_mean, outputs_std, log_post, log_prior, outs_meanw
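
    # A hedged sketch (not part of this file) of how the returned terms typically
    # enter a negative-ELBO style loss in a training script; `kl_weight` and
    # `targets` are assumed names, and the Gaussian likelihood is one common choice:
    #   kl_term = log_post - log_prior
    #   nll = torch.nn.functional.gaussian_nll_loss(outputs_mean, targets, outputs_std ** 2)
    #   loss = kl_weight * kl_term + nll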

    def forward(self, x, mode):
        paraling_feat = self.paralinguisticExtractor(x) if self.paralinguisticExtractor is not None else x
        temporal_feat = self.temporalExtractor(paraling_feat)
        if mode == "pred":
            # Forward pass including the uncertainty layer;
            # return the stochastic outputs of the model.
            uncert_out = self.uncertaintyLayer(temporal_feat, True)
            return uncert_out
        else:
            # Forward pass with only the E2E backbone;
            # return the extracted features.
            return temporal_feat
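

# Minimal usage sketch (an illustration, not from the original repository); it
# assumes the repo's `layers` and `utils` modules are importable, and the input
# shapes follow the docstring of sample_uncertainty_predictions() above.
if __name__ == "__main__":
    model = UncertaintyModel()
    x = torch.randn(25, 300, 320)  # 25 utterances, 300 segments of 40 ms, 320 features each
    with torch.no_grad():
        outputs, out_mean, out_std, log_post, log_prior, outs_meanw = model.sample_uncertainty_predictions(x)
    print(outputs.shape)   # torch.Size([30, 7500, 2]): uncertainty samples x (25*300) x nout
    print(out_mean.shape)  # torch.Size([7500, 2]): per-frame mean prediction m_t
    print(out_std.shape)   # torch.Size([7500, 2]): per-frame label-uncertainty estimate s_t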