From 822597db49218de17e105e62075096284dfcfd41 Mon Sep 17 00:00:00 2001
From: catboxanon <122327233+catboxanon@users.noreply.github.com>
Date: Sun, 13 Aug 2023 04:16:48 -0400
Subject: [PATCH] Encode batches separately

Significantly reduces VRAM. This makes encoding more in line with how
decoding currently functions.
---
 modules/sd_samplers_common.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/modules/sd_samplers_common.py b/modules/sd_samplers_common.py
index 09d1e11e3..f9d034ca1 100644
--- a/modules/sd_samplers_common.py
+++ b/modules/sd_samplers_common.py
@@ -92,7 +92,15 @@ def images_tensor_to_samples(image, approximation=None, model=None):
             model = shared.sd_model
         image = image.to(shared.device, dtype=devices.dtype_vae)
         image = image * 2 - 1
-        x_latent = model.get_first_stage_encoding(model.encode_first_stage(image))
+        if len(image) > 1:
+            x_latent = torch.stack([
+                model.get_first_stage_encoding(
+                    model.encode_first_stage(torch.unsqueeze(img, 0))
+                )[0]
+                for img in image
+            ])
+        else:
+            x_latent = model.get_first_stage_encoding(model.encode_first_stage(image))

     return x_latent

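Note (not part of the patch): below is a minimal, self-contained sketch of
the technique the patch applies. The first-stage VAE encoder allocates
activation memory proportional to batch size, so encoding images one at a
time and stacking the resulting latents caps peak VRAM at the single-image
cost, in exchange for len(image) sequential encoder passes. DummyFirstStage
is a hypothetical stand-in for the real LDM wrapper; only the two methods
the patch calls are modeled, with latent shapes mimicked rather than
computed by a real encoder.

import torch


class DummyFirstStage:
    """Hypothetical stand-in for the model used in sd_samplers_common."""

    def encode_first_stage(self, image):
        # The real method runs the VAE encoder; here we only mimic the
        # output shape: 8x spatial downsample, 4 latent channels.
        b, _, h, w = image.shape
        return torch.randn(b, 4, h // 8, w // 8)

    def get_first_stage_encoding(self, z):
        # The real method samples/scales the latent distribution;
        # identity is enough for a shape-level sketch.
        return z


model = DummyFirstStage()
image = torch.rand(4, 3, 512, 512) * 2 - 1  # batch of 4 images in [-1, 1]

# Batched: one call, peak activation memory scales with len(image).
batched = model.get_first_stage_encoding(model.encode_first_stage(image))

# Per-image, as in the patch: peak memory scales with batch size 1.
separate = torch.stack([
    model.get_first_stage_encoding(
        model.encode_first_stage(torch.unsqueeze(img, 0))
    )[0]
    for img in image
])

assert batched.shape == separate.shape  # (4, 4, 64, 64)

The same trade-off already applies on the decode side, which is why the
commit message describes this as bringing encoding in line with how
decoding currently functions.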