From e0f9d50aaaa902d22ef9f62e070cf3bd1b65f384 Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Tue, 26 Jan 2021 04:28:50 -0500 Subject: [PATCH 1/9] initial integration with smartnoise LSTM layers are not supported yet --- senone_classifier/run_training.sh | 15 ++++++++------- senone_classifier/train.py | 2 ++ senone_classifier/trainer.py | 15 +++++++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 19ba549..7a97236 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -1,12 +1,12 @@ #!/bin/bash -train_scp_dir="/nfs/mercury-13/u123/dbagchi/ASR_utt" -train_scp_file="train_full_nodup_tr90/feats.scp" -train_label_scp_file="train_full_nodup_tr90/phone.ctm2.scp" +train_scp_dir="/Users/michael/whitenoise/sotto-voce-corpus/senone_labels" +train_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_tr90/feats.scp" +train_label_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_tr90/phone.ctm2.scp" -cv_scp_dir="/nfs/mercury-13/u123/dbagchi/ASR_utt" -cv_scp_file="train_full_nodup_cv10/feats.scp" -cv_label_scp_file="train_full_nodup_cv10/phone.ctm2.scp" +cv_scp_dir="/Users/michael/whitenoise/sotto-voce-corpus/senone_labels" +cv_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_cv10/feats.scp" +cv_label_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_cv10/phone.ctm2.scp" input_dim=13 output_dim=9096 @@ -27,7 +27,8 @@ mdl_path="final.pth.tar" pr_fr=1000 -/nfs/mercury-13/u123/dbagchi/anaconda3/envs/pytorch_gpu_deblin/bin/python train.py --train_scp_dir $train_scp_dir \ +workon smartnoise +python train.py --train_scp_dir $train_scp_dir \ --train_scp_file $train_scp_file \ --train_label_scp_file $train_label_scp_file \ --cv_scp_dir $cv_scp_dir \ diff --git a/senone_classifier/train.py b/senone_classifier/train.py index 33cee2a..32775de 100644 --- a/senone_classifier/train.py +++ b/senone_classifier/train.py @@ -37,6 +37,8 @@ parser.add_argument('--early_stop', dest='early_stop', default=0, type=int, help='Early stop training when halving lr but still get' 'small improvement') + parser.add_argument('--step_epsilon', default=None, type=float, + help='Set step_epsilon to enable differentially private learning') # save and load model parser.add_argument('--save_folder', default='exp/temp', diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 485a340..b50b0ad 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -1,6 +1,11 @@ import os import time import torch +try: + from opendp.smartnoise.network.optimizer import PrivacyAccountant +except ImportError as e: + print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") + raise e class Trainer(object): @@ -10,6 +15,8 @@ def __init__(self, dataloader, model, optimizer, args): self.tr_loader = dataloader['tr_loader'] self.cv_loader = dataloader['cv_loader'] + self.accountant = PrivacyAccountant(model=self.model, step_epsilon=args.step_epsilon) if args.step_epsilon else None + # Training config self.epochs = args.epochs self.half_lr = args.half_lr @@ -68,7 +75,11 @@ def train(self): print('Saving checkpoint model to %s' % file_path) print("Cross validation...") + val_loss = self._run_one_epoch(epoch, cross_valid=True) + if self.accountant: + self.accountant.increment_epoch() + print('-' * 85) print('Valid Summary | End of Epoch {0} | Time {1:.2f}s | ' 'Valid Loss {2:.3f}'.format( @@ -123,6 +134,10 @@ def _run_one_epoch(self, epoch, cross_valid=False): if not cross_valid: self.optimizer.zero_grad() loss.backward() + + if self.accountant: + self.accountant.privatize_grad() + self.optimizer.step() total_loss += loss.item() if i % self.print_freq == 0: From eb0385dde492d00b1ae6f65a1c1190bf7c5d68fb Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Thu, 11 Feb 2021 20:42:48 -0500 Subject: [PATCH 2/9] senone: use DPLSTM module if is_private --- .gitignore | 3 +++ senone_classifier/model.py | 12 ++++++++++-- senone_classifier/run_training.sh | 3 ++- senone_classifier/train.py | 10 +++++++--- senone_classifier/trainer.py | 6 +++--- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..baff6b4 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +.idea +senone_classifier/exp/ \ No newline at end of file diff --git a/senone_classifier/model.py b/senone_classifier/model.py index dc713b5..25516cd 100644 --- a/senone_classifier/model.py +++ b/senone_classifier/model.py @@ -1,8 +1,15 @@ import torch import torch.nn.functional as F +try: + from opendp.smartnoise.network.layers import DPLSTM +except ImportError as e: + print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") + raise e + + class FcNet(torch.nn.Module): - def __init__(self, D_in, H, D_out, layers): + def __init__(self, D_in, H, D_out, layers, is_private=False): """ In the constructor we instantiate two nn.Linear modules and assign them as member variables. @@ -11,9 +18,10 @@ def __init__(self, D_in, H, D_out, layers): self.hid_size = H self.out_size = D_out self.hidden_layers = layers + self.is_private = is_private super(FcNet, self).__init__() - self.lstm = torch.nn.LSTM(self.in_size, self.hid_size, self.hidden_layers) + self.lstm = (DPLSTM if is_private else torch.nn.LSTM)(self.in_size, self.hid_size, self.hidden_layers) self.linear1 = torch.nn.Linear(self.hid_size, self.out_size) diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 7a97236..56e3f30 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -42,4 +42,5 @@ python train.py --train_scp_dir $train_scp_dir \ --momentum $mom \ --epochs $ep \ --save_folder $save_fld \ - --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr + --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr \ + --step_epsilon 1.0 diff --git a/senone_classifier/train.py b/senone_classifier/train.py index 32775de..7ac81ff 100644 --- a/senone_classifier/train.py +++ b/senone_classifier/train.py @@ -8,7 +8,6 @@ resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1])) if __name__ == "__main__": - parser = argparse.ArgumentParser() # Path params parser.add_argument("--base_dir", default=os.getcwd()) @@ -65,11 +64,16 @@ dataloader = {"tr_loader":tr_dataloader, "cv_loader":cv_dataloader} - model = FcNet(args.input_dim, args.fc_nodes, args.output_dim, args.hidden_layers) # input, hidden, output + model = FcNet( + args.input_dim, # input + args.fc_nodes, # hidden + args.output_dim, # output + args.hidden_layers, + is_private=args.step_epsilon is not None) model.apply(weights_init) print(model) - model.cuda() + # model.cuda() optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate) diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index b50b0ad..8151334 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -2,7 +2,7 @@ import time import torch try: - from opendp.smartnoise.network.optimizer import PrivacyAccountant + from opendp.smartnoise.network import PrivacyAccountant except ImportError as e: print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") raise e @@ -127,8 +127,8 @@ def _run_one_epoch(self, epoch, cross_valid=False): data_loader = self.tr_loader if not cross_valid else self.cv_loader for i, sample in enumerate(data_loader): - x = sample['features'].cuda() - y = sample['labels'].cuda() + x = sample['features'] # .cuda() + y = sample['labels'] # .cuda() #print(x.shape) loss = self.model(x, y) if not cross_valid: From eaac8e156fbc269e3a24dab6ddf542cc0fb53bc0 Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Thu, 11 Feb 2021 22:16:07 -0500 Subject: [PATCH 3/9] senone: initial federation --- senone_classifier/train.py | 128 +++++++++++++++++++++++++++-------- senone_classifier/trainer.py | 8 ++- 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/senone_classifier/train.py b/senone_classifier/train.py index 7ac81ff..8b84573 100644 --- a/senone_classifier/train.py +++ b/senone_classifier/train.py @@ -1,12 +1,101 @@ import argparse +import resource +import traceback +from multiprocessing import Event +from multiprocessing.context import Process +from queue import Queue + +import torch.distributed as dist + from data_io_utt import * -import torch.utils.data as data from trainer import * -import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) resource.setrlimit(resource.RLIMIT_NOFILE, (2048, rlimit[1])) + +def run_senone_worker(args, rank=None, size=None, step_limit=None, federation_scheme='shuffle', queue=None, + end_event=None): + tr_file_details = {'scp_dir': args.train_scp_dir, 'scp_file': args.train_scp_file_name, + 'label_scp_file': args.train_label_scp_file} + tr_dataset = SenoneClassification(tr_file_details) + tr_dataloader = data.DataLoader(tr_dataset, batch_size=1, shuffle=True, num_workers=50) + + cv_file_details = {'scp_dir': args.cv_scp_dir, 'scp_file': args.cv_scp_file_name, + 'label_scp_file': args.cv_label_scp_file} + cv_dataset = SenoneClassification(cv_file_details) + cv_dataloader = data.DataLoader(cv_dataset, batch_size=1, shuffle=False, num_workers=50) + + dataloader = {"tr_loader": tr_dataloader, "cv_loader": cv_dataloader} + + model = FcNet( + args.input_dim, # input + args.fc_nodes, # hidden + args.output_dim, # output + args.hidden_layers, + is_private=args.step_epsilon is not None) + model.apply(weights_init) + + print(model) + + if rank: + args.federation = { + 'rank': rank, + 'size': size, + 'federation_scheme': federation_scheme, + 'step_limit': step_limit, + 'end_event': end_event + } + # model.cuda() + optimizer = torch.optim.Adam(model.parameters(), + lr=args.learn_rate) + + trainer = Trainer(dataloader, model, optimizer, args) + trainer.train() + + +def init_process(rank, size, fn, kwargs, backend='gloo'): + """ + Initialize the distributed environment. + """ + # use this command to kill processes: + # lsof -t -i tcp:29500 | xargs kill + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29500' + dist.init_process_group(backend, rank=rank, world_size=size) + + kwargs['rank'] = rank + kwargs['size'] = size + try: + fn(**kwargs) + except Exception: + if not kwargs['end_event'].is_set(): + traceback.print_exc() + kwargs['end_event'].set() + + +def train_multiprocess(worker, size, **kwargs): + processes = [] + queue = Queue() + + end_event = Event() + + for rank in range(size): + p = Process(target=init_process, args=(rank, size, worker, { + 'queue': queue, 'end_event': end_event, + **kwargs + })) + p.start() + processes.append(p) + + end_event.wait() + # wait for history to be queued + time.sleep(1) + + for p in processes: + p.terminate() + + if __name__ == "__main__": parser = argparse.ArgumentParser() # Path params @@ -38,6 +127,8 @@ 'small improvement') parser.add_argument('--step_epsilon', default=None, type=float, help='Set step_epsilon to enable differentially private learning') + parser.add_argument('--num_workers', default=1, type=int, + help='Federate learning if > 1') # save and load model parser.add_argument('--save_folder', default='exp/temp', @@ -54,28 +145,11 @@ args = parser.parse_args() - tr_file_details = {'scp_dir':args.train_scp_dir,'scp_file':args.train_scp_file_name,'label_scp_file':args.train_label_scp_file} - tr_dataset = SenoneClassification(tr_file_details) - tr_dataloader = data.DataLoader(tr_dataset, batch_size=1, shuffle=True, num_workers=50) - - cv_file_details = {'scp_dir':args.cv_scp_dir,'scp_file':args.cv_scp_file_name,'label_scp_file':args.cv_label_scp_file} - cv_dataset = SenoneClassification(cv_file_details) - cv_dataloader = data.DataLoader(cv_dataset, batch_size=1, shuffle=False, num_workers=50) - - dataloader = {"tr_loader":tr_dataloader, "cv_loader":cv_dataloader} - - model = FcNet( - args.input_dim, # input - args.fc_nodes, # hidden - args.output_dim, # output - args.hidden_layers, - is_private=args.step_epsilon is not None) - model.apply(weights_init) - - print(model) - # model.cuda() - optimizer = torch.optim.Adam(model.parameters(), - lr=args.learn_rate) - - trainer = Trainer(dataloader, model, optimizer, args) - trainer.train() + if args.num_workers > 1: + train_multiprocess( + worker=run_senone_worker, + size=args.num_workers, + args=args, + federation_scheme='shuffle') + else: + run_senone_worker(args) diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 8151334..476fd62 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -2,7 +2,7 @@ import time import torch try: - from opendp.smartnoise.network import PrivacyAccountant + from opendp.smartnoise.network import PrivacyAccountant, ModelCoordinator except ImportError as e: print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") raise e @@ -16,6 +16,7 @@ def __init__(self, dataloader, model, optimizer, args): self.cv_loader = dataloader['cv_loader'] self.accountant = PrivacyAccountant(model=self.model, step_epsilon=args.step_epsilon) if args.step_epsilon else None + self.coordinator = ModelCoordinator(model=self.model, **args.federation) if hasattr(args, 'federation') else None # Training config self.epochs = args.epochs @@ -55,6 +56,8 @@ def _reset(self): def train(self): for epoch in range(self.start_epoch, self.epochs): + if self.coordinator: + self.coordinator.recv() print("training...") start = time.time() tr_avg_loss = self._run_one_epoch(epoch) @@ -115,6 +118,9 @@ def train(self): file_path) print("Find better validated model, saving to %s" % file_path) + if self.coordinator: + self.coordinator.send() + From 40c1dfb1a994126b05c67a4bc99e859ae71e04ac Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Thu, 25 Feb 2021 16:14:50 -0500 Subject: [PATCH 4/9] remove DP edits to model definition PrivacyAccountant is now able to rewrite the forward pass on a copy of the model that shares weights. So modifying the model definition is not necessary --- senone_classifier/model.py | 11 ++--------- senone_classifier/train.py | 3 +-- senone_classifier/trainer.py | 8 ++++++-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/senone_classifier/model.py b/senone_classifier/model.py index 25516cd..eefd812 100644 --- a/senone_classifier/model.py +++ b/senone_classifier/model.py @@ -1,15 +1,9 @@ import torch import torch.nn.functional as F -try: - from opendp.smartnoise.network.layers import DPLSTM -except ImportError as e: - print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") - raise e - class FcNet(torch.nn.Module): - def __init__(self, D_in, H, D_out, layers, is_private=False): + def __init__(self, D_in, H, D_out, layers): """ In the constructor we instantiate two nn.Linear modules and assign them as member variables. @@ -18,10 +12,9 @@ def __init__(self, D_in, H, D_out, layers, is_private=False): self.hid_size = H self.out_size = D_out self.hidden_layers = layers - self.is_private = is_private super(FcNet, self).__init__() - self.lstm = (DPLSTM if is_private else torch.nn.LSTM)(self.in_size, self.hid_size, self.hidden_layers) + self.lstm = torch.nn.LSTM(self.in_size, self.hid_size, self.hidden_layers) self.linear1 = torch.nn.Linear(self.hid_size, self.out_size) diff --git a/senone_classifier/train.py b/senone_classifier/train.py index 8b84573..56d0ef5 100644 --- a/senone_classifier/train.py +++ b/senone_classifier/train.py @@ -32,8 +32,7 @@ def run_senone_worker(args, rank=None, size=None, step_limit=None, federation_sc args.input_dim, # input args.fc_nodes, # hidden args.output_dim, # output - args.hidden_layers, - is_private=args.step_epsilon is not None) + args.hidden_layers) model.apply(weights_init) print(model) diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 476fd62..d4ebf34 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -15,8 +15,12 @@ def __init__(self, dataloader, model, optimizer, args): self.tr_loader = dataloader['tr_loader'] self.cv_loader = dataloader['cv_loader'] - self.accountant = PrivacyAccountant(model=self.model, step_epsilon=args.step_epsilon) if args.step_epsilon else None - self.coordinator = ModelCoordinator(model=self.model, **args.federation) if hasattr(args, 'federation') else None + self.accountant = None + if args.step_epsilon: + self.accountant = PrivacyAccountant(model=model, step_epsilon=args.step_epsilon) + self.model = self.accountant.model + + self.coordinator = ModelCoordinator(model=model, **args.federation) if hasattr(args, 'federation') else None # Training config self.epochs = args.epochs From aaffef95f33feff0ada8fe3474616ab130ab4aa1 Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Wed, 3 Mar 2021 22:11:37 -0500 Subject: [PATCH 5/9] debug federated, gpu dpsgd --- senone_classifier/model.py | 8 +++++--- senone_classifier/run_training.sh | 33 +++++++++++++++++++++---------- senone_classifier/train.py | 6 ++---- senone_classifier/trainer.py | 29 +++++++++++---------------- 4 files changed, 42 insertions(+), 34 deletions(-) diff --git a/senone_classifier/model.py b/senone_classifier/model.py index eefd812..13f0d33 100644 --- a/senone_classifier/model.py +++ b/senone_classifier/model.py @@ -25,12 +25,14 @@ def forward(self, x, y): well as arbitrary operators on Tensors. """ lstm_out, _ = self.lstm(x.view(x.shape[1], 1, -1)) - y_pred = F.log_softmax(self.linear1(lstm_out.view(x.shape[1], -1)), dim=1) + # swap batch axis to front + y_pred = F.log_softmax(self.linear1(lstm_out.view(lstm_out.shape[1], lstm_out.shape[0], -1)), dim=2) y = torch.squeeze(y, 0) loss = torch.nn.NLLLoss(reduction='sum') - y = y.long() - output = loss(y_pred, y) + y = y.long().unsqueeze(0) + # always have a batch size of one + output = loss(y_pred[0], y[0]) return output def recognize(self, x): diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 56e3f30..582a1d6 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -1,16 +1,29 @@ #!/bin/bash -train_scp_dir="/Users/michael/whitenoise/sotto-voce-corpus/senone_labels" -train_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_tr90/feats.scp" -train_label_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_tr90/phone.ctm2.scp" - -cv_scp_dir="/Users/michael/whitenoise/sotto-voce-corpus/senone_labels" -cv_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_cv10/feats.scp" -cv_label_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_cv10/phone.ctm2.scp" +azure=1 + +if [ $azure -eq 1 ] +then + train_scp_dir="/data/sotto-voce/senone_classifier/" + train_scp_file="train_full_nodup_tr90/feats.scp" + train_label_scp_file="train_full_nodup_tr90/phone.ctm2.scp" + + cv_scp_dir="/data/sotto-voce/senone_classifier/" + cv_scp_file="train_full_nodup_cv10/feats.scp" + cv_label_scp_file="train_full_nodup_cv10/phone.ctm2.scp" +else + train_scp_dir="/Users/michael/whitenoise/sotto-voce-corpus/senone_labels" + train_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_tr90/feats.scp" + train_label_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_tr90/phone.ctm2.scp" + + cv_scp_dir="/Users/michael/whitenoise/sotto-voce-corpus/senone_labels" + cv_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_cv10/feats.scp" + cv_label_scp_file="/Users/michael/whitenoise/sotto-voce-corpus/libri_inp_data/train_full_nodup_cv10/phone.ctm2.scp" +fi input_dim=13 output_dim=9096 -fc_nodes=200 +fc_nodes=20 hidden_layers=2 @@ -27,7 +40,6 @@ mdl_path="final.pth.tar" pr_fr=1000 -workon smartnoise python train.py --train_scp_dir $train_scp_dir \ --train_scp_file $train_scp_file \ --train_label_scp_file $train_label_scp_file \ @@ -43,4 +55,5 @@ python train.py --train_scp_dir $train_scp_dir \ --epochs $ep \ --save_folder $save_fld \ --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr \ - --step_epsilon 1.0 + --step_epsilon 100.0 \ + --num_workers 2 diff --git a/senone_classifier/train.py b/senone_classifier/train.py index 56d0ef5..f5034b4 100644 --- a/senone_classifier/train.py +++ b/senone_classifier/train.py @@ -5,8 +5,6 @@ from multiprocessing.context import Process from queue import Queue -import torch.distributed as dist - from data_io_utt import * from trainer import * @@ -37,7 +35,7 @@ def run_senone_worker(args, rank=None, size=None, step_limit=None, federation_sc print(model) - if rank: + if rank is not None: args.federation = { 'rank': rank, 'size': size, @@ -45,7 +43,7 @@ def run_senone_worker(args, rank=None, size=None, step_limit=None, federation_sc 'step_limit': step_limit, 'end_event': end_event } - # model.cuda() + model.cuda() optimizer = torch.optim.Adam(model.parameters(), lr=args.learn_rate) diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index d4ebf34..6dad33c 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -1,8 +1,9 @@ import os import time import torch +import torch.distributed as dist try: - from opendp.smartnoise.network import PrivacyAccountant, ModelCoordinator + from opendp.smartnoise.network import PrivacyAccountant except ImportError as e: print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") raise e @@ -20,7 +21,7 @@ def __init__(self, dataloader, model, optimizer, args): self.accountant = PrivacyAccountant(model=model, step_epsilon=args.step_epsilon) self.model = self.accountant.model - self.coordinator = ModelCoordinator(model=model, **args.federation) if hasattr(args, 'federation') else None + self.federation = args.federation if hasattr(args, 'federation') else {} # Training config self.epochs = args.epochs @@ -60,8 +61,6 @@ def _reset(self): def train(self): for epoch in range(self.start_epoch, self.epochs): - if self.coordinator: - self.coordinator.recv() print("training...") start = time.time() tr_avg_loss = self._run_one_epoch(epoch) @@ -122,14 +121,6 @@ def train(self): file_path) print("Find better validated model, saving to %s" % file_path) - if self.coordinator: - self.coordinator.send() - - - - - - def _run_one_epoch(self, epoch, cross_valid=False): start = time.time() total_loss = 0 @@ -137,8 +128,8 @@ def _run_one_epoch(self, epoch, cross_valid=False): data_loader = self.tr_loader if not cross_valid else self.cv_loader for i, sample in enumerate(data_loader): - x = sample['features'] # .cuda() - y = sample['labels'] # .cuda() + x = sample['features'].cuda() + y = sample['labels'].cuda() #print(x.shape) loss = self.model(x, y) if not cross_valid: @@ -148,11 +139,15 @@ def _run_one_epoch(self, epoch, cross_valid=False): if self.accountant: self.accountant.privatize_grad() + if self.federation: + for param in self.model.parameters(): + dist.all_reduce(param.data) + self.optimizer.step() total_loss += loss.item() - if i % self.print_freq == 0: - print ('Epoch {0} | Iter {1} | Average Loss {2:.3f} |' - 'Current Loss {3:.6f} | {4:.1f} ms/batch'.format( + if self.federation.get('rank') == 0 or (not self.federation and i % self.print_freq == 0): + print('Epoch {0} | Iter {1} | Average Loss {2:.3f} |' + ' Current Loss {3:.6f} | {4:.1f} ms/batch'.format( epoch + 1, i + 1, total_loss / (i + 1), loss.item(), 1000 * (time.time() - start) / (i + 1)), flush=True) From dc80c42d1ed831fe265dda02ae2f053809301de9 Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Thu, 4 Mar 2021 12:36:06 -0500 Subject: [PATCH 6/9] sync grads, not weights --- senone_classifier/model.py | 10 ++++++---- senone_classifier/run_training.sh | 4 ++-- senone_classifier/trainer.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/senone_classifier/model.py b/senone_classifier/model.py index 13f0d33..d20a3b6 100644 --- a/senone_classifier/model.py +++ b/senone_classifier/model.py @@ -25,14 +25,16 @@ def forward(self, x, y): well as arbitrary operators on Tensors. """ lstm_out, _ = self.lstm(x.view(x.shape[1], 1, -1)) + lstm_out = lstm_out.view(lstm_out.shape[1], lstm_out.shape[0], -1) + # swap batch axis to front - y_pred = F.log_softmax(self.linear1(lstm_out.view(lstm_out.shape[1], lstm_out.shape[0], -1)), dim=2) + y_pred = F.log_softmax(self.linear1(lstm_out), dim=2) y = torch.squeeze(y, 0) - + loss = torch.nn.NLLLoss(reduction='sum') - y = y.long().unsqueeze(0) + y = y.long() # always have a batch size of one - output = loss(y_pred[0], y[0]) + output = loss(y_pred[0], y) return output def recognize(self, x): diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 582a1d6..4e03dde 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -23,7 +23,7 @@ fi input_dim=13 output_dim=9096 -fc_nodes=20 +fc_nodes=200 hidden_layers=2 @@ -55,5 +55,5 @@ python train.py --train_scp_dir $train_scp_dir \ --epochs $ep \ --save_folder $save_fld \ --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr \ - --step_epsilon 100.0 \ + --step_epsilon 200.0 \ --num_workers 2 diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 6dad33c..408c684 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -141,7 +141,7 @@ def _run_one_epoch(self, epoch, cross_valid=False): if self.federation: for param in self.model.parameters(): - dist.all_reduce(param.data) + dist.all_reduce(param.grad) self.optimizer.step() total_loss += loss.item() From 71ebb02f70caa415be6a0a7e67cf2b3f715e865c Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Sun, 18 Jul 2021 16:11:37 -0400 Subject: [PATCH 7/9] train with ManualOdometer checkpoint before re-enabling grad hooks --- run.bbn.sh | 10 +++++----- senone_classifier/run_training.sh | 6 +++--- senone_classifier/train.py | 8 ++++++-- senone_classifier/trainer.py | 20 +++++++------------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/run.bbn.sh b/run.bbn.sh index 3d42827..59ca406 100755 --- a/run.bbn.sh +++ b/run.bbn.sh @@ -87,7 +87,7 @@ if [ ${stage} -le 2 ]; then # utils/combine_data.sh --extra-files utt2num_frames $data_dir/${valid_set} $data_dir/dev_clean $data_dir/dev_other utils/combine_data.sh --extra-files utt2num_frames $data_dir/${valid_set} $data_dir/dev_clean - # compute global CMVN + # compute global CMVN - "potential issue here - mean variance normalization" compute-cmvn-stats scp:$data_dir/${train_set}/feats.scp $data_dir/${train_set}/cmvn.ark # dump features for training @@ -118,8 +118,8 @@ lmdatadir=$data_dir/lm_text if [ ${stage} -le 3 ]; then echo "Stage 3: Dictionary Preparation and Text Tokenization" mkdir -p $data_dir/lang - cut -f 2- -d" " $data_dir/${train_set}/text > $data_dir/lang/input - echo "$0: training sentencepiece model..." + cut -f 2- -d" " $data_dir/${train_set}/text > $data_dir/lang/input # Option 1: character level, Option 2: share data, create mixed model, Option 3: create 3 sentencepiece models + echo "$0: training sentencepiece model..." # works on text side, takes transcripts and breaks into segments. segments are the outermost layer of model python3 ../../scripts/spm_train.py --bos_id=-1 --pad_id=0 --eos_id=1 --unk_id=2 --input=$data_dir/lang/input \ --vocab_size=$((sentencepiece_vocabsize+3)) --character_coverage=1.0 \ --model_type=$sentencepiece_type --model_prefix=$sentencepiece_model \ @@ -145,7 +145,7 @@ if [ ${stage} -le 3 ]; then token_text=$data_dir/$dataset/token_text cut -f 2- -d" " $token_text > $lmdatadir/$dataset.tokens done - if [ ! -e $lmdatadir/librispeech-lm-norm.txt.gz ]; then + if [ ! -e $lmdatadir/librispeech-lm-norm.txt.gz ]; then # considered public, AMI is private data wget http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz -P $lmdatadir fi echo "$0: preparing extra corpus for subword LM training..." @@ -159,7 +159,7 @@ lmdict=$dict if $lm_shallow_fusion; then if [ ${stage} -le 4 ]; then echo "Stage 4: Text Binarization for subword LM Training" - mkdir -p $lmdatadir/log + mkdir -p $lmdatadir/log # unknown for dataset in $test_set; do test_paths="$test_paths $lmdatadir/$dataset.tokens"; done test_paths=$(echo $test_paths | awk '{$1=$1;print}' | tr ' ' ',') ${decode_cmd} $lmdatadir/log/preprocess.log \ diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 4e03dde..2971e04 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -38,7 +38,7 @@ ckpt=0 cont_model=0 mdl_path="final.pth.tar" -pr_fr=1000 +pr_fr=1 python train.py --train_scp_dir $train_scp_dir \ --train_scp_file $train_scp_file \ @@ -55,5 +55,5 @@ python train.py --train_scp_dir $train_scp_dir \ --epochs $ep \ --save_folder $save_fld \ --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr \ - --step_epsilon 200.0 \ - --num_workers 2 + --step_epsilon 100.0 \ + --num_workers 4 | tee output.log diff --git a/senone_classifier/train.py b/senone_classifier/train.py index f5034b4..5e07a77 100644 --- a/senone_classifier/train.py +++ b/senone_classifier/train.py @@ -4,6 +4,8 @@ from multiprocessing import Event from multiprocessing.context import Process from queue import Queue +from torch.nn.parallel import DistributedDataParallel as DDP +from torch import distributed as dist from data_io_utt import * from trainer import * @@ -17,12 +19,12 @@ def run_senone_worker(args, rank=None, size=None, step_limit=None, federation_sc tr_file_details = {'scp_dir': args.train_scp_dir, 'scp_file': args.train_scp_file_name, 'label_scp_file': args.train_label_scp_file} tr_dataset = SenoneClassification(tr_file_details) - tr_dataloader = data.DataLoader(tr_dataset, batch_size=1, shuffle=True, num_workers=50) + tr_dataloader = data.DataLoader(tr_dataset, batch_size=1, shuffle=True, num_workers=2) cv_file_details = {'scp_dir': args.cv_scp_dir, 'scp_file': args.cv_scp_file_name, 'label_scp_file': args.cv_label_scp_file} cv_dataset = SenoneClassification(cv_file_details) - cv_dataloader = data.DataLoader(cv_dataset, batch_size=1, shuffle=False, num_workers=50) + cv_dataloader = data.DataLoader(cv_dataset, batch_size=1, shuffle=False, num_workers=2) dataloader = {"tr_loader": tr_dataloader, "cv_loader": cv_dataloader} @@ -36,6 +38,8 @@ def run_senone_worker(args, rank=None, size=None, step_limit=None, federation_sc print(model) if rank is not None: + print('federating on rank', rank) + model = DDP(model) args.federation = { 'rank': rank, 'size': size, diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 408c684..74dcfa2 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -1,9 +1,9 @@ import os import time import torch -import torch.distributed as dist try: - from opendp.smartnoise.network import PrivacyAccountant + # from opendp.network.odometer_stochastic import StochasticPrivacyOdometer + from opendp.network.odometer_manual import ManualPrivacyOdometer except ImportError as e: print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") raise e @@ -18,8 +18,8 @@ def __init__(self, dataloader, model, optimizer, args): self.accountant = None if args.step_epsilon: - self.accountant = PrivacyAccountant(model=model, step_epsilon=args.step_epsilon) - self.model = self.accountant.model + self.odometer = ManualPrivacyOdometer(model=self.model, step_epsilon=args.step_epsilon) + # self.odometer.track_(self.model) self.federation = args.federation if hasattr(args, 'federation') else {} @@ -83,8 +83,8 @@ def train(self): print("Cross validation...") val_loss = self._run_one_epoch(epoch, cross_valid=True) - if self.accountant: - self.accountant.increment_epoch() + if self.odometer: + self.odometer.increment_epoch() print('-' * 85) print('Valid Summary | End of Epoch {0} | Time {1:.2f}s | ' @@ -136,13 +136,7 @@ def _run_one_epoch(self, epoch, cross_valid=False): self.optimizer.zero_grad() loss.backward() - if self.accountant: - self.accountant.privatize_grad() - - if self.federation: - for param in self.model.parameters(): - dist.all_reduce(param.grad) - + self.odometer.privatize_grad() self.optimizer.step() total_loss += loss.item() if self.federation.get('rank') == 0 or (not self.federation and i % self.print_freq == 0): From c41ba62043ad2ebf0254f750d1f672fee948291b Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Sun, 18 Jul 2021 17:27:13 -0400 Subject: [PATCH 8/9] train with full grad reconstruction and DDP --- senone_classifier/run_training.sh | 2 +- senone_classifier/trainer.py | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 2971e04..1343dc2 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -56,4 +56,4 @@ python train.py --train_scp_dir $train_scp_dir \ --save_folder $save_fld \ --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr \ --step_epsilon 100.0 \ - --num_workers 4 | tee output.log + --num_workers 2 | tee output.log diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 74dcfa2..5b8eb66 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -2,7 +2,8 @@ import time import torch try: - # from opendp.network.odometer_stochastic import StochasticPrivacyOdometer + from opendp.network.odometer import PrivacyOdometer + from opendp.network.odometer_stochastic import StochasticPrivacyOdometer from opendp.network.odometer_manual import ManualPrivacyOdometer except ImportError as e: print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") @@ -18,9 +19,17 @@ def __init__(self, dataloader, model, optimizer, args): self.accountant = None if args.step_epsilon: - self.odometer = ManualPrivacyOdometer(model=self.model, step_epsilon=args.step_epsilon) + # full grad reconstruction odometer + self.odometer = PrivacyOdometer(step_epsilon=args.step_epsilon) + self.model = self.odometer.make_tracked_view(self.model) + + # # stochastic odometer + # self.odometer = StochasticPrivacyOdometer(step_epsilon=args.step_epsilon) # self.odometer.track_(self.model) + # # manual odometer + # self.odometer = ManualPrivacyOdometer(model=self.model, step_epsilon=args.step_epsilon) + self.federation = args.federation if hasattr(args, 'federation') else {} # Training config @@ -136,7 +145,8 @@ def _run_one_epoch(self, epoch, cross_valid=False): self.optimizer.zero_grad() loss.backward() - self.odometer.privatize_grad() + # enable only for ManualPrivacyOdometer + # self.odometer.privatize_grad() self.optimizer.step() total_loss += loss.item() if self.federation.get('rank') == 0 or (not self.federation and i % self.print_freq == 0): From aadad3588ccab743ba42c3c857f5d1a016f35deb Mon Sep 17 00:00:00 2001 From: Shoeboxam Date: Fri, 15 Oct 2021 15:34:47 -0400 Subject: [PATCH 9/9] adjust for api changes --- senone_classifier/run_training.sh | 6 +++--- senone_classifier/trainer.py | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/senone_classifier/run_training.sh b/senone_classifier/run_training.sh index 1343dc2..b3732d9 100755 --- a/senone_classifier/run_training.sh +++ b/senone_classifier/run_training.sh @@ -38,7 +38,7 @@ ckpt=0 cont_model=0 mdl_path="final.pth.tar" -pr_fr=1 +pr_fr=10 python train.py --train_scp_dir $train_scp_dir \ --train_scp_file $train_scp_file \ @@ -55,5 +55,5 @@ python train.py --train_scp_dir $train_scp_dir \ --epochs $ep \ --save_folder $save_fld \ --checkpoint $ckpt --model_path $mdl_path --print_freq $pr_fr \ - --step_epsilon 100.0 \ - --num_workers 2 | tee output.log + --step_epsilon 2.38547 \ + --num_workers 4 | tee output.log diff --git a/senone_classifier/trainer.py b/senone_classifier/trainer.py index 5b8eb66..e7e7188 100644 --- a/senone_classifier/trainer.py +++ b/senone_classifier/trainer.py @@ -2,11 +2,11 @@ import time import torch try: - from opendp.network.odometer import PrivacyOdometer + from opendp.network.odometer_reconstruction import ReconstructionPrivacyOdometer from opendp.network.odometer_stochastic import StochasticPrivacyOdometer from opendp.network.odometer_manual import ManualPrivacyOdometer except ImportError as e: - print("Install smartnoise from the ms-external-sgd branch of this repository: https://github.com/opendifferentialprivacy/smartnoise-core-python") + print("Install opendp from the external-sgd branch here: https://github.com/opendp/opendp/tree/external-sgd") raise e @@ -20,12 +20,12 @@ def __init__(self, dataloader, model, optimizer, args): self.accountant = None if args.step_epsilon: # full grad reconstruction odometer - self.odometer = PrivacyOdometer(step_epsilon=args.step_epsilon) - self.model = self.odometer.make_tracked_view(self.model) + # self.odometer = ReconstructionPrivacyOdometer(step_epsilon=args.step_epsilon) + # self.model = self.odometer.make_tracked_view(self.model) # # stochastic odometer - # self.odometer = StochasticPrivacyOdometer(step_epsilon=args.step_epsilon) - # self.odometer.track_(self.model) + self.odometer = StochasticPrivacyOdometer(step_epsilon=args.step_epsilon) + self.odometer.track_(self.model) # # manual odometer # self.odometer = ManualPrivacyOdometer(model=self.model, step_epsilon=args.step_epsilon) @@ -149,7 +149,7 @@ def _run_one_epoch(self, epoch, cross_valid=False): # self.odometer.privatize_grad() self.optimizer.step() total_loss += loss.item() - if self.federation.get('rank') == 0 or (not self.federation and i % self.print_freq == 0): + if (self.federation.get('rank') == 0 or not self.federation) and i % self.print_freq == 0: print('Epoch {0} | Iter {1} | Average Loss {2:.3f} |' ' Current Loss {3:.6f} | {4:.1f} ms/batch'.format( epoch + 1, i + 1, total_loss / (i + 1),