runpod notebook

This commit is contained in:
Victor Hall 2022-11-07 22:31:50 -05:00
parent 9d113f5eda
commit bde3c3808c
4 changed files with 1980 additions and 1 deletion

4
.gitignore

@@ -10,4 +10,6 @@ logs/**
 training_samples/**
 outputs/**
 *.ckpt
-src/**
+src/**
+.vscode/**
+./test/**

1846
Train-Runpod.ipynb Normal file

File diff suppressed because one or more lines are too long

23
autoprune_all.py Normal file

@@ -0,0 +1,23 @@
import argparse
import glob
import os

import prune_ckpt

parser = argparse.ArgumentParser(description='prune all ckpt files under the logs folder')
parser.add_argument('--delete', action='store_true', help='delete the ~11GB unpruned files after pruning')
args = parser.parse_args()

# path to logs folder
logs_path = "logs"

# recursively search the logs folder for ckpt files and prune each one
for path in glob.glob(logs_path + "/**/*.ckpt", recursive=True):
    # move the checkpoint into the working directory so the pruned copy lands here
    path_here = os.path.basename(path)
    os.rename(path, path_here)
    prune_ckpt.prune_it(path_here, keep_only_ema=False)
    if args.delete:
        os.remove(path_here)
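
For reference, the pruning step exists because Lightning training checkpoints carry optimizer and trainer state on top of the model weights, which is what pushes them to ~11GB. Below is a minimal sketch of that idea in plain PyTorch; it assumes the common "keep only the state_dict" approach, and prune_sketch is an illustrative name, not the repo's prune_ckpt API.

import torch

# Sketch of checkpoint pruning (assumed approach, not necessarily
# prune_ckpt.prune_it's exact implementation): keep only the model
# weights and drop optimizer/trainer state, which is the bulk of an
# ~11GB training checkpoint.
def prune_sketch(in_path, out_path):
    ckpt = torch.load(in_path, map_location="cpu")
    state_dict = ckpt.get("state_dict", ckpt)  # weights live under "state_dict"
    torch.save({"state_dict": state_dict}, out_path)

# usage: prune_sketch("epoch=03-step=01000.ckpt", "pruned.ckpt")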

108

@@ -0,0 +1,108 @@
model:
  base_learning_rate: 1.0e-6
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    unfreeze_model: True
    model_lr: 1.0e-6

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 512
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 4
    num_workers: 8
    wrap: false
    train:
      target: ldm.data.every_dream.EveryDreamBatch
      params:
        repeats: 50 # Adjust how much training to do. Fewer training images need more repeats. This is multiplied by max_epochs for the "amount" of training.
        flip_p: 0 # use 0.5 to randomly flip images each repeat; not recommended unless training data is very limited (<20 images)
        debug_level: 1 # 1 to print when images are dropped by multi-aspect-ratio image bucketing
    validation:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
        repeats: 10
    test:
      target: ldm.data.ed_validate.EDValidateBatch
      params:
        repeats: 1

lightning:
  modelcheckpoint:
    params:
      every_n_epochs: 1 # produce a ckpt every epoch, leave at 1!
      #every_n_train_steps: 1400 # checkpoints can be triggered by epoch or by train step, not both; use this instead of every_n_epochs if you prefer, but epochs are suggested
      save_top_k: 2 # *** How many checkpoints you get to try out; it automatically keeps what it judges the best. ** REQUIRES ~15GB+ of VOLUME storage per checkpoint!!! ***
      # The line above is important. It costs a lot of VOLUME storage but keeps you from having to start over if you overtrain, by leaving you a few checkpoints to try out.
      save_last: False
      filename: "{epoch:02d}-{step:05d}"
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 150
        max_images: 16
        increase_log_steps: False

  trainer:
    benchmark: True
    max_epochs: 4 # suggest 3-4+ and adjust repeats above; only the save_top_k number (above) of checkpoints are kept
    max_steps: 99000 # better to end on epochs than on steps, especially with >500 images, to ensure even distribution, but you can set this if you really want
    check_val_every_n_epoch: 1
    gpus: 0,
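
As a sanity check on run length: with the settings above, each epoch visits roughly num_images x repeats samples, so total optimizer steps land near num_images x repeats x max_epochs / batch_size; and at save_top_k: 2 with ~15GB per checkpoint, budget about 30GB of volume storage. A back-of-the-envelope helper under that assumption (estimated_steps is illustrative, not part of the trainer):

# Rough step count for the config above, assuming one epoch visits
# num_images * repeats samples (illustrative; actual counts can differ
# slightly when aspect-ratio bucketing drops images).
def estimated_steps(num_images, repeats=50, batch_size=4, max_epochs=4):
    steps_per_epoch = (num_images * repeats) // batch_size
    return steps_per_epoch * max_epochs

print(estimated_steps(20))  # 20 images -> 250 steps/epoch -> 1000 steps total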