import os
import sys
import numpy as np
import math
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Activation
from dataset import Dataset

# Class that manages the model and its state.
class Model:
    def __init__(self, logger, config, dataset=None, validation_data=None):
        self.logger = logger
        self.config = config
        self.model = None
        self.dataset = dataset
        self.validation_data = validation_data

    # Build the model.
    # If you want to experiment with the architecture, this is the place to make changes.
    def build(self, output_summary=False):
        self.logger.info("Building the model...")
        model = Sequential()
        # Number of frequency bins per STFT frame: window_size / 2 + 1.
        bins = math.ceil(self.config.getint("song", "window_size") / 2) + 1
        model.add(Conv2D(32, (3, 3), padding="same",
                         input_shape=(bins, self.config.getint("song", "sample_length"), 1),
                         activation='relu'))
        model.add(Conv2D(32, (3, 3), padding="same", activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Conv2D(64, (3, 3), padding="same", activation='relu'))
        model.add(Conv2D(64, (3, 3), padding="same", activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.5))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('elu'))
        model.add(Dropout(0.5))
        # One sigmoid output per frequency bin: a per-bin vocal/non-vocal mask.
        model.add(Dense(bins, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adadelta', metrics=['accuracy'])
        if output_summary is True:
            model.summary()
        self.model = model

    def train(self, epochs, batch=32, save_log=False, log_name="history.csv"):
        if self.model is not None:
            self.logger.info("Training the model...")
            self.logger.info("Beginning training with %i samples.", len(self.dataset.mixture_windows))
            # Save a weights checkpoint every 5 epochs.
            weights_backup = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5',
                                                             save_weights_only=True, period=5)
            # TODO: Evaluate the use of fit_generator or train_on_batch to load data from disk
            # instead of storing it all in RAM, since it takes up a lot of memory otherwise.
            training = self.model.fit(self.dataset.mixture_windows, self.dataset.labels,
                                      batch_size=batch, epochs=epochs,
                                      validation_data=(self.validation_data.mixture_windows,
                                                       self.validation_data.labels),
                                      callbacks=[weights_backup])
            self.logger.info("Training finished.")
            if save_log is True:
                self.logger.info("Exporting statistics.")
                import csv
                with open(log_name, 'w') as f:
                    w = csv.DictWriter(f, training.history.keys())
                    w.writeheader()
                    w.writerow(training.history)

    def save(self, filename='obj.save'):
        if self.model is not None:
            self.model.save_weights(filename, overwrite=True)
        else:
            self.logger.critical("Cannot save weights - model not set up")
            sys.exit(5)

    def load(self, filename='obj.save'):
        if self.model is not None and os.path.isfile(filename):
            self.model.load_weights(filename)
        else:
            self.logger.critical("Cannot load weights - model not set up or file not found")
            sys.exit(3)

    def isolate(self, mixture, output="output.wav", save_accompaniment=True,
                save_original_mask=False, save_original_probabilities=False):
        if self.model is not None:
            # TODO: For some reason the output loses ~0.004s (3*BINS+1) worth of samples.
            self.logger.info("Preparing the song...")
            split_x = np.array(mixture.split_slidingwindow(self.config.getint("song", "sample_length")))
            # Add the channel dimension expected by Conv2D: (samples, bins, frames, 1).
            split_x = split_x.reshape(len(split_x), len(split_x[0]), len(split_x[0][0]), 1)
            self.logger.info("Extracting vocals from the audio file...")
            prediction = self.model.predict(split_x)
            prediction = np.transpose(prediction)  # Transpose the mask into the format librosa uses
            if save_original_probabilities is True:
                np.savetxt('original_predicted_probabilities.out', prediction)
            self.logger.info("Calculating the binary mask...")
            # Probability-to-label conversion, as there's no other way to get the output
            # from the network in the right format. The accompaniment mask is the
            # complement of the vocal mask.
            # TODO: don't fill the accompaniment array if save_accompaniment is set to False
            accompaniment = np.zeros(np.shape(prediction))
            for x in range(0, len(prediction)):
                for y in range(0, len(prediction[x])):
                    # Higher thresholds tend to make the voice unintelligible.
                    prediction[x][y] = 1 if prediction[x][y] > 0.45 else 0
                    accompaniment[x][y] = 0 if prediction[x][y] > 0.45 else 1
            if save_original_mask is True:
                np.savetxt('predicted_mask.out', prediction)
            if save_accompaniment is True:
                spectrogram_bak = mixture.get_spectrogram()
                mixture.apply_binary_mask(accompaniment)
                mixture.reverse_stft()
                mixture.save_file("instrumental_" + output)
                mixture.set_spectrogram(spectrogram_bak)
            mixture.apply_binary_mask(prediction)
            mixture.reverse_stft()
            mixture.save_file(output)
        else:
            self.logger.critical("Model not set up, cannot attempt to isolate.")
            sys.exit(4)
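

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): shows how the Model
# class above can be driven. The "config.ini" filename is an assumption; the
# only requirement taken from this file is a [song] section with window_size
# and sample_length. The Dataset loading step is left commented out because
# its constructor is defined elsewhere.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import logging
    import configparser

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("model")

    config = configparser.ConfigParser()
    config.read("config.ini")  # assumed location of the [song] settings

    model = Model(logger, config)
    model.build(output_summary=True)
    # With a training Dataset and a validation Dataset attached, training and
    # saving would look like:
    # model.dataset, model.validation_data = train_set, validation_set
    # model.train(epochs=10, save_log=True)
    # model.save("weights.h5")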