From ec98a79b1e01a14fc66feadccd651a3c0ed48a83 Mon Sep 17 00:00:00 2001
From: Victor Hall <victor.charles.hall@gmail.com>
Date: Sat, 25 Nov 2023 17:21:34 -0500
Subject: [PATCH] add tokenizer tester script

---
 scripts/test_tokenizer.py | 22 ++++++++++++++++++++++
 scripts/txt2img.py        |  3 ---
 2 files changed, 22 insertions(+), 3 deletions(-)
 create mode 100644 scripts/test_tokenizer.py

diff --git a/scripts/test_tokenizer.py b/scripts/test_tokenizer.py
new file mode 100644
index 0000000..ea86e70
--- /dev/null
+++ b/scripts/test_tokenizer.py
@@ -0,0 +1,22 @@
+import json
+from transformers import CLIPTokenizer
+
+# Tokenize a sample prompt with the CLIP tokenizer and print its token ids, token strings and length vs. the 75-token limit.
+tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+
+text = "αβ, Γ γ, Δ δ, Ε ε, Ζ ζ, Η η, Θ θ, Ι ι, Κ κ, Λ λ, Μ μ, Ν ν, Ξ ξ, Ο ο, Π π, Ρ ρ, Σ σ/ς, Τ τ, Υ υ, Φ φ, Χ χ, Ψ ψ, Ω ω."
+
+with open("SD15/tokenizer/vocab.json", "r", encoding="utf-8") as vocab_file:  # explicit UTF-8; handle is closed after loading
+    token_dict = json.load(vocab_file)
+# reverse keys and values in token_dict because the token ids are the values in vocab.json
+token_dict = {v: k for k, v in token_dict.items()}
+
+tokens = tokenizer(text, truncation=False, padding="max_length", return_tensors="pt").input_ids
+tokens = tokens.tolist()[0]  # input_ids is batched; take the single row as a plain list of ints
+tokens = [t for t in tokens if t not in [49406, 49407]]  # remove start/end/pad tokens
+tokens2 = [token_dict[t] for t in tokens]
+print(f"text: {text}")
+print(f"token ids:{tokens}")
+print(f"tokens: {tokens2}")
+print(f"length (special tokens removed, max 75): {len(tokens)}, over limit: {len(tokens) > 75}")
+
diff --git a/scripts/txt2img.py b/scripts/txt2img.py
index 9ec7039..71ef0e1 100644
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
@@ -24,9 +24,6 @@ from diffusers import StableDiffusionPipeline, AutoencoderKL, UNet2DConditionMod
 from torch.cuda.amp import autocast
 from transformers import CLIPTextModel, CLIPTokenizer
 
-
-# from diffusers.models import AttentionBlock
-
 def __generate_sample(pipe: StableDiffusionPipeline, prompt: str, cfg: float, height: int, width: int, gen,
                       steps: int = 30, batch_size: int = 1):
     """