Get DDP to work

Author: Anthony Mercurio, 2022-11-29 22:06:21 -07:00 (committed by GitHub)
parent 8decb0bc7d
commit b0cec788be
1 changed file with 7 additions and 4 deletions

```diff
@@ -743,7 +743,8 @@ def main():
         print(f"Completed resize and migration to '{args.dataset}_cropped' please relaunch the trainer without the --resize argument and train on the migrated dataset.")
         exit(0)
+    #unet = torch.nn.parallel.DistributedDataParallel(unet, device_ids=[rank], output_device=rank, gradient_as_bucket_view=True)
+    dist_unet = torch.nn.parallel.DistributedDataParallel(unet, device_ids=[rank], gradient_as_bucket_view=True)
+    unet = dist_unet.module
     # create ema
     if args.use_ema:
```
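
For context, this hunk wraps the raw UNet in DistributedDataParallel (DDP) and keeps a reference to the underlying module. Below is a minimal, self-contained sketch of that pattern, assuming a torchrun-style launch; the `setup_ddp_unet` helper and the environment handling are illustrative, not this trainer's actual code:

```python
import os
import torch
import torch.distributed as dist

def setup_ddp_unet(unet: torch.nn.Module):
    # Hypothetical helper; torchrun sets RANK/WORLD_SIZE/LOCAL_RANK.
    local_rank = int(os.environ["LOCAL_RANK"])
    dist.init_process_group(backend="nccl")
    unet = unet.cuda(local_rank)
    # gradient_as_bucket_view=True lets DDP point gradients at its
    # communication buckets, avoiding one gradient copy per bucket.
    dist_unet = torch.nn.parallel.DistributedDataParallel(
        unet, device_ids=[local_rank], gradient_as_bucket_view=True
    )
    # .module is the original unwrapped model, useful for checkpointing
    # or building an EMA copy without the DDP wrapper.
    return dist_unet, dist_unet.module
```

Note that DDP only synchronizes gradients when the forward pass runs through the wrapper itself, so the unwrapped `.module` handle is normally reserved for checkpoint saving and EMA rather than the training forward pass.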
```diff
@@ -835,9 +836,7 @@ def main():
             loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="mean")
-            # All-reduce loss, backprop, and update weights
-            torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM)
-            loss = loss / world_size
+            # backprop and update
             scaler.scale(loss).backward()
             torch.nn.utils.clip_grad_norm_(unet.parameters(), 1.0)
             scaler.step(optimizer)
```
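
The deleted all-reduce before backward is redundant under DDP, which already averages gradients across ranks during `backward()`. Here is a hedged sketch of the surrounding mixed-precision step under that assumption; `ddp_model`, `inputs`, `target`, and `optimizer` are placeholders, and the `scaler.unscale_` call is the idiomatic extra step before clipping, which the trainer's own snippet does not show:

```python
import torch

scaler = torch.cuda.amp.GradScaler()

def train_step(ddp_model, inputs, target, optimizer):
    # Hypothetical step; ddp_model is the DistributedDataParallel wrapper.
    optimizer.zero_grad(set_to_none=True)
    with torch.autocast("cuda"):
        pred = ddp_model(inputs)  # forward through the wrapper so backward syncs grads
        loss = torch.nn.functional.mse_loss(pred.float(), target.float(), reduction="mean")
    scaler.scale(loss).backward()   # DDP averages gradients across ranks here
    scaler.unscale_(optimizer)      # unscale so clipping sees true gradient norms
    torch.nn.utils.clip_grad_norm_(ddp_model.parameters(), 1.0)
    scaler.step(optimizer)
    scaler.update()
    return loss
```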
```diff
@@ -857,6 +856,10 @@ def main():
             world_images_per_second = rank_images_per_second * world_size
             samples_seen = global_step * args.batch_size * world_size
+            # get global loss for logging
+            torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM)
+            loss = loss / world_size
             if rank == 0:
                 progress_bar.update(1)
                 global_step += 1
```
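
The loss each rank computes is local to its own shard of the batch, so the added all-reduce averages it across ranks purely for logging. A minimal sketch of that pattern, with `global_mean_loss` as an illustrative helper name and the process group assumed to be initialized:

```python
import torch
import torch.distributed as dist

def global_mean_loss(loss: torch.Tensor) -> float:
    # Detach and clone so the logging reduction never mutates the
    # tensor still referenced by the training step.
    reduced = loss.detach().clone()
    dist.all_reduce(reduced, op=dist.ReduceOp.SUM)  # sum across all ranks
    reduced /= dist.get_world_size()                # sum -> mean
    return reduced.item()
```

Summing then dividing by world size matches the diff; recent PyTorch builds with the NCCL backend also offer `dist.ReduceOp.AVG`, which fuses the two steps.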