From c7b72279ec43d8c9ba9aa2d9f5e6311527779606 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 5 Nov 2020 15:26:50 +1100 Subject: [PATCH 1/9] Add files via upload require: PyTorch NGC container 20.08 + which has PyTorch 1.7 for AMP upstream to work --- PyTorch/SpeechSynthesis/Tacotron2/Dockerfile | 2 +- PyTorch/SpeechSynthesis/Tacotron2/train.py | 56 +++++++++++++------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile b/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile index 1028e7a91..4eb5105c1 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile +++ b/PyTorch/SpeechSynthesis/Tacotron2/Dockerfile @@ -1,4 +1,4 @@ -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3 +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.10-py3 FROM ${FROM_IMAGE_NAME} ADD . /workspace/tacotron2 diff --git a/PyTorch/SpeechSynthesis/Tacotron2/train.py b/PyTorch/SpeechSynthesis/Tacotron2/train.py index b808910ec..af6689fd5 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/train.py +++ b/PyTorch/SpeechSynthesis/Tacotron2/train.py @@ -51,9 +51,9 @@ from scipy.io.wavfile import write as write_wav -from apex import amp -amp.lists.functional_overrides.FP32_FUNCS.remove('softmax') -amp.lists.functional_overrides.FP16_FUNCS.append('softmax') +#from apex import amp +#amp.lists.functional_overrides.FP32_FUNCS.remove('softmax') +#amp.lists.functional_overrides.FP16_FUNCS.append('softmax') def parse_args(parser): @@ -188,7 +188,7 @@ def init_distributed(args, world_size, rank, group_name): print("Done initializing distributed") -def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_name, +def save_checkpoint(model, optimizer, scaler, epoch, config, amp_run, output_dir, model_name, local_rank, world_size): random_rng_state = torch.random.get_rng_state().cuda() @@ -215,7 +215,10 @@ def save_checkpoint(model, optimizer, epoch, config, amp_run, output_dir, model_ 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()} if amp_run: - checkpoint['amp'] = amp.state_dict() + #checkpoint['amp'] = amp.state_dict() + checkpoint = {'model': model.state_dict(), + 'optimizer': optimizer.state_dict(), + 'scaler': scaler.state_dict()} checkpoint_filename = "checkpoint_{}_{}.pt".format(model_name, epoch) checkpoint_path = os.path.join(output_dir, checkpoint_filename) @@ -256,8 +259,10 @@ def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, local_ra optimizer.load_state_dict(checkpoint['optimizer']) if amp_run: - amp.load_state_dict(checkpoint['amp']) - + #amp.load_state_dict(checkpoint['amp']) + model.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + scaler.load_state_dict(checkpoint['scaler']) # adapted from: https://discuss.pytorch.org/t/opinion-eval-should-be-a-context-manager/18998/3 # Following snippet is licensed under MIT license @@ -384,16 +389,19 @@ def main(): cpu_run=False, uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight) - if not args.amp and distributed_run: + #if not args.amp and distributed_run: + if distributed_run: model = DDP(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) - if args.amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O1") - if distributed_run: - model = DDP(model) + scaler = torch.cuda.amp.GradScaler(enabled=args.amp) + + #if args.amp: + #model, optimizer = amp.initialize(model, optimizer, 
opt_level="O1") + #if distributed_run: + #model = DDP(model) try: sigma = args.sigma @@ -475,9 +483,11 @@ def main(): model.zero_grad() x, y, num_items = batch_to_gpu(batch) - y_pred = model(x) - loss = criterion(y_pred, y) - + #AMP upstream autocast + with torch.cuda.amp.autocast(enabled=args.amp): + y_pred = model(x) + loss = criterion(y_pred, y) + if distributed_run: reduced_loss = reduce_tensor(loss.data, world_size).item() reduced_num_items = reduce_tensor(num_items.data, 1).item() @@ -495,10 +505,16 @@ def main(): reduced_num_items_epoch += reduced_num_items if args.amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args.grad_clip_thresh) + #with amp.scale_loss(loss, optimizer) as scaled_loss: + #scaled_loss.backward() + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad(set_to_none=True) + #optimizer.zero_grad() + + #grad_norm = torch.nn.utils.clip_grad_norm_( + #amp.master_params(optimizer), args.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( @@ -532,7 +548,7 @@ def main(): batch_to_gpu) if (epoch % args.epochs_per_checkpoint == 0) and args.bench_class == "": - save_checkpoint(model, optimizer, epoch, model_config, + save_checkpoint(model, optimizer, scaler, epoch, model_config, args.amp, args.output, args.model_name, local_rank, world_size) if local_rank == 0: From ec58eebe72980362b0866f72f31ad955627536e9 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 5 Nov 2020 20:18:38 +1100 Subject: [PATCH 2/9] Add files via upload From 73d8a6bd314c0933d20e2163b801783ceed18145 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 5 Nov 2020 20:19:55 +1100 Subject: [PATCH 3/9] Add files via upload --- PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py b/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py index 4976b54c3..23710d7ab 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py +++ b/PyTorch/SpeechSynthesis/Tacotron2/tacotron2/model.py @@ -661,7 +661,6 @@ def forward(self, inputs): input_lengths, output_lengths = input_lengths.data, output_lengths.data embedded_inputs = self.embedding(inputs).transpose(1, 2) - encoder_outputs = self.encoder(embedded_inputs, input_lengths) mel_outputs, gate_outputs, alignments = self.decoder( From a7017c07515ae3210f973f5f2337a1390af65a72 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 5 Nov 2020 20:21:04 +1100 Subject: [PATCH 4/9] Add files via upload From d7a000ee159f4c8e0d929a17d9586915434e3e38 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 5 Nov 2020 21:35:56 +1100 Subject: [PATCH 5/9] Add files via upload --- PyTorch/SpeechSynthesis/Tacotron2/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/PyTorch/SpeechSynthesis/Tacotron2/train.py b/PyTorch/SpeechSynthesis/Tacotron2/train.py index af6689fd5..53d4d390f 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/train.py +++ b/PyTorch/SpeechSynthesis/Tacotron2/train.py @@ -39,7 +39,7 @@ import torch.distributed as dist from torch.utils.data.distributed import DistributedSampler -from apex.parallel import DistributedDataParallel as DDP +#from apex.parallel import DistributedDataParallel as 
DDP import models import loss_functions @@ -391,7 +391,8 @@ def main(): #if not args.amp and distributed_run: if distributed_run: - model = DDP(model) + #model = DDP(model) + model = nn.parallel.DistributedDataParallel(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) From fa4f1492320025148725b17a998b06de7ead571e Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 5 Nov 2020 22:06:08 +1100 Subject: [PATCH 6/9] Add files via upload --- PyTorch/SpeechSynthesis/Tacotron2/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/PyTorch/SpeechSynthesis/Tacotron2/train.py b/PyTorch/SpeechSynthesis/Tacotron2/train.py index 53d4d390f..d8fa8efd3 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/train.py +++ b/PyTorch/SpeechSynthesis/Tacotron2/train.py @@ -40,6 +40,7 @@ from torch.utils.data.distributed import DistributedSampler #from apex.parallel import DistributedDataParallel as DDP +from torch.nn.parallel import DistributedDataParallel as DDP import models import loss_functions @@ -391,8 +392,7 @@ def main(): #if not args.amp and distributed_run: if distributed_run: - #model = DDP(model) - model = nn.parallel.DistributedDataParallel(model) + model = DDP(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) From ceb12bf5f7c2802a73051bd98aab3ff7afc2dda6 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Tue, 10 Nov 2020 22:07:58 +1100 Subject: [PATCH 7/9] Add files via upload --- PyTorch/SpeechSynthesis/Tacotron2/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PyTorch/SpeechSynthesis/Tacotron2/train.py b/PyTorch/SpeechSynthesis/Tacotron2/train.py index d8fa8efd3..190186320 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/train.py +++ b/PyTorch/SpeechSynthesis/Tacotron2/train.py @@ -392,7 +392,7 @@ def main(): #if not args.amp and distributed_run: if distributed_run: - model = DDP(model) + model = DDP(model,device_ids=[local_rank],output_device=local_rank) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) From 46e6371539e52c67c85a437bffa7049cca3ceb1a Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Wed, 11 Nov 2020 20:01:55 +1100 Subject: [PATCH 8/9] Add files via upload --- PyTorch/SpeechSynthesis/Tacotron2/train.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/PyTorch/SpeechSynthesis/Tacotron2/train.py b/PyTorch/SpeechSynthesis/Tacotron2/train.py index 190186320..344f44bb7 100644 --- a/PyTorch/SpeechSynthesis/Tacotron2/train.py +++ b/PyTorch/SpeechSynthesis/Tacotron2/train.py @@ -39,7 +39,6 @@ import torch.distributed as dist from torch.utils.data.distributed import DistributedSampler -#from apex.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP import models @@ -52,11 +51,6 @@ from scipy.io.wavfile import write as write_wav -#from apex import amp -#amp.lists.functional_overrides.FP32_FUNCS.remove('softmax') -#amp.lists.functional_overrides.FP16_FUNCS.append('softmax') - - def parse_args(parser): """ Parse commandline arguments. 
@@ -216,7 +210,6 @@ def save_checkpoint(model, optimizer, scaler, epoch, config, amp_run, output_dir 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()} if amp_run: - #checkpoint['amp'] = amp.state_dict() checkpoint = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'scaler': scaler.state_dict()} @@ -260,7 +253,6 @@ def load_checkpoint(model, optimizer, epoch, config, amp_run, filepath, local_ra optimizer.load_state_dict(checkpoint['optimizer']) if amp_run: - #amp.load_state_dict(checkpoint['amp']) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) scaler.load_state_dict(checkpoint['scaler']) @@ -390,7 +382,6 @@ def main(): cpu_run=False, uniform_initialize_bn_weight=not args.disable_uniform_initialize_bn_weight) - #if not args.amp and distributed_run: if distributed_run: model = DDP(model,device_ids=[local_rank],output_device=local_rank) @@ -398,11 +389,6 @@ def main(): weight_decay=args.weight_decay) scaler = torch.cuda.amp.GradScaler(enabled=args.amp) - - #if args.amp: - #model, optimizer = amp.initialize(model, optimizer, opt_level="O1") - #if distributed_run: - #model = DDP(model) try: sigma = args.sigma @@ -506,16 +492,11 @@ def main(): reduced_num_items_epoch += reduced_num_items if args.amp: - #with amp.scale_loss(loss, optimizer) as scaled_loss: - #scaled_loss.backward() scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() optimizer.zero_grad(set_to_none=True) - #optimizer.zero_grad() - - #grad_norm = torch.nn.utils.clip_grad_norm_( - #amp.master_params(optimizer), args.grad_clip_thresh) + else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( From 7f5db4546c7d7c48e19350d04155b6d467e70635 Mon Sep 17 00:00:00 2001 From: maggiezha <42832776+maggiezha@users.noreply.github.com> Date: Thu, 12 Nov 2020 19:28:49 +1100 Subject: [PATCH 9/9] Add files via upload --- PyTorch/SpeechSynthesis/FastPitch/Dockerfile | 2 +- PyTorch/SpeechSynthesis/FastPitch/train.py | 52 +++++++++++++------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/PyTorch/SpeechSynthesis/FastPitch/Dockerfile b/PyTorch/SpeechSynthesis/FastPitch/Dockerfile index 9dfaf3920..4cad0dc05 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/Dockerfile +++ b/PyTorch/SpeechSynthesis/FastPitch/Dockerfile @@ -1,4 +1,4 @@ -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.09-py3 +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.10-py3 FROM ${FROM_IMAGE_NAME} ADD requirements.txt . 
diff --git a/PyTorch/SpeechSynthesis/FastPitch/train.py b/PyTorch/SpeechSynthesis/FastPitch/train.py index 1a962e85a..0f5e52b40 100644 --- a/PyTorch/SpeechSynthesis/FastPitch/train.py +++ b/PyTorch/SpeechSynthesis/FastPitch/train.py @@ -46,7 +46,7 @@ from torch.utils.data.distributed import DistributedSampler import common.tb_dllogger as logger -from apex import amp +#from apex import amp from apex.optimizers import FusedAdam, FusedLAMB import common @@ -172,8 +172,7 @@ def corrupted(fpath): return None -def save_checkpoint(local_rank, model, ema_model, optimizer, epoch, total_iter, - config, amp_run, filepath): +def save_checkpoint(local_rank, model, ema_model, optimizer, scaler, epoch, total_iter, config, amp_run, filepath): if local_rank != 0: return @@ -186,11 +185,15 @@ def save_checkpoint(local_rank, model, ema_model, optimizer, epoch, total_iter, 'ema_state_dict': ema_dict, 'optimizer': optimizer.state_dict()} if amp_run: - checkpoint['amp'] = amp.state_dict() + #checkpoint['amp'] = amp.state_dict() + checkpoint = {"model": model.state_dict(), + "optimizer": optimizer.state_dict(), + "scaler": scaler.state_dict()} + torch.save(checkpoint, filepath) -def load_checkpoint(local_rank, model, ema_model, optimizer, epoch, total_iter, +def load_checkpoint(local_rank, model, ema_model, optimizer, scaler, epoch, total_iter, config, amp_run, filepath, world_size): if local_rank == 0: print(f'Loading model and optimizer state from {filepath}') @@ -205,7 +208,10 @@ def load_checkpoint(local_rank, model, ema_model, optimizer, epoch, total_iter, optimizer.load_state_dict(checkpoint['optimizer']) if amp_run: - amp.load_state_dict(checkpoint['amp']) + #amp.load_state_dict(checkpoint['amp']) + model.load_state_dict(checkpoint["model"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + scaler.load_state_dict(checkpoint["scaler"]) if ema_model is not None: ema_model.load_state_dict(checkpoint['ema_state_dict']) @@ -336,8 +342,10 @@ def main(): else: raise ValueError - if args.amp: - model, optimizer = amp.initialize(model, optimizer, opt_level="O1") + scaler = torch.cuda.amp.GradScaler(enabled=args.amp) + + #if args.amp: + #model, optimizer = amp.initialize(model, optimizer, opt_level="O1") if args.ema_decay > 0: ema_model = copy.deepcopy(model) @@ -426,16 +434,20 @@ def main(): model.zero_grad() x, y, num_frames = batch_to_gpu(batch) - y_pred = model(x, use_gt_durations=True) - loss, meta = criterion(y_pred, y) - loss /= args.gradient_accumulation_steps + #AMP upstream autocast + with torch.cuda.amp.autocast(enabled=args.amp): + y_pred = model(x, use_gt_durations=True) + loss, meta = criterion(y_pred, y) + + loss /= args.gradient_accumulation_steps meta = {k: v / args.gradient_accumulation_steps for k, v in meta.items()} if args.amp: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + #with amp.scale_loss(loss, optimizer) as scaled_loss: + #scaled_loss.backward() + scaler.scale(loss).backward() else: loss.backward() @@ -458,13 +470,17 @@ def main(): logger.log_grads_tb(total_iter, model) if args.amp: - torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), args.grad_clip_thresh) + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_thresh) + scaler.step(optimizer) + scaler.update() + #optimizer.zero_grad(set_to_none=True) + optimizer.zero_grad() else: torch.nn.utils.clip_grad_norm_( model.parameters(), args.grad_clip_thresh) - optimizer.step() + optimizer.step() apply_ema_decay(model, ema_model, args.ema_decay) 
iter_time = time.perf_counter() - iter_start_time @@ -517,7 +533,7 @@ def main(): checkpoint_path = os.path.join( args.output, f"FastPitch_checkpoint_{epoch}.pt") - save_checkpoint(args.local_rank, model, ema_model, optimizer, epoch, + save_checkpoint(args.local_rank, model, ema_model, optimizer, scaler, epoch, total_iter, model_config, args.amp, checkpoint_path) logger.flush() @@ -538,7 +554,7 @@ def main(): (epoch % args.epochs_per_checkpoint != 0) and args.local_rank == 0): checkpoint_path = os.path.join( args.output, f"FastPitch_checkpoint_{epoch}.pt") - save_checkpoint(args.local_rank, model, ema_model, optimizer, epoch, + save_checkpoint(args.local_rank, model, ema_model, optimizer, scaler, epoch, total_iter, model_config, args.amp, checkpoint_path)
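Taken together, the patches above swap `apex.amp` and `apex.parallel.DistributedDataParallel` for the AMP and DDP implementations that ship with PyTorch 1.7 (NGC 20.10 containers). A minimal, self-contained sketch of the training step they converge on — autocast forward, scaled backward, unscale-then-clip (as in the FastPitch change; the Tacotron2 patch drops clipping under AMP), then `scaler.step`/`scaler.update` — using a placeholder model and random data rather than the Tacotron2/FastPitch code:

```python
import torch
import torch.nn as nn

def train_amp_steps(use_amp=True, grad_clip_thresh=1.0, steps=10):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_amp = use_amp and torch.cuda.is_available()   # native AMP targets CUDA

    model = nn.Linear(80, 80).to(device)              # placeholder model, not Tacotron2/FastPitch
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    # enabled=False turns autocast/GradScaler into no-ops, so one code path serves FP32 and AMP.
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for _ in range(steps):
        x = torch.randn(16, 80, device=device)
        y = torch.randn(16, 80, device=device)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=use_amp):
            y_pred = model(x)                          # forward pass runs in mixed precision
            loss = criterion(y_pred, y)

        scaler.scale(loss).backward()                  # backward on the scaled loss
        scaler.unscale_(optimizer)                     # unscale grads before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
        scaler.step(optimizer)                         # skips the step if grads overflowed
        scaler.update()                                # adjust the loss scale for the next step

if __name__ == '__main__':
    train_amp_steps()
```

With `enabled=args.amp` on both autocast and the GradScaler, the same loop runs in plain FP32 when AMP is off, which is why the patched scripts no longer need the separate apex code paths.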