From acac3f154752eedd48cb0c226f8a0e034d497e2e Mon Sep 17 00:00:00 2001 From: Victor Hall Date: Fri, 30 Jun 2023 00:37:49 -0400 Subject: [PATCH] update caption doc --- caption_fl.py | 9 +++++---- doc/CAPTION.md | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/caption_fl.py b/caption_fl.py index baec83e..1376631 100644 --- a/caption_fl.py +++ b/caption_fl.py @@ -84,8 +84,8 @@ def main(args): lang_encoder_path="anas-awadalla/mpt-7b" tokenizer_path="anas-awadalla/mpt-7b" elif "mpt1b" in args.model: - lang_encoder_path="anas-awadalla/mpt-1b" - tokenizer_path="anas-awadalla/mpt-1b" + lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b" + tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b" model, image_processor, tokenizer = create_model_and_transforms( clip_vision_encoder_path="ViT-L-14", @@ -141,7 +141,7 @@ def main(args): input_ids = lang_x["input_ids"].to(device) - with torch.cuda.amp.autocast(dtype=dtype): + with torch.cuda.amp.autocast(dtype=dtype), torch.no_grad(): generated_text = model.generate( vision_x=vision_x, lang_x=input_ids, @@ -171,6 +171,7 @@ def main(args): if not os.path.exists(name): with open(f"{name}.txt", "w") as f: f.write(generated_text) + print("Done!") if __name__ == "__main__": print(f"Available models:") @@ -188,7 +189,7 @@ if __name__ == "__main__": parser.add_argument("--prompt", type=str, default="Output: ", help="prompt to use for generation, default is 'Output: '") parser.add_argument("--temperature", type=float, default=1.0, help="temperature for sampling, 1.0 is default") parser.add_argument("--top_k", type=int, default=0, help="top_k sampling, 0 is default") - parser.add_argument("--top_p", type=float, default=0.9, help="top_p sampling, 0.9 is default") + parser.add_argument("--top_p", type=float, default=1.0, help="top_p sampling, 1.0 is default") parser.add_argument("--repetition_penalty", type=float, default=1.0, help="repetition penalty, 1.0 is default") 
parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty, 1.0 is default") args = parser.parse_args() diff --git a/doc/CAPTION.md b/doc/CAPTION.md index d28a024..50ab754 100644 --- a/doc/CAPTION.md +++ b/doc/CAPTION.md @@ -4,6 +4,21 @@ `python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"` +This script uses two example image/caption pairs located in the example folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file of the same base filename with the caption in the same folder. + +This script currently requires an AMPERE or newer GPU due to using bfloat16. + +**Trying out different example image/caption pairs will influence how the system captions the input images.** + +Supported models: +`openflamingo/OpenFlamingo-3B-vitl-mpt1b` Small model, requires 8 GB VRAM at num_beams 3, or 12 GB at num_beams 16 +`openflamingo/OpenFlamingo-9B-vitl-mpt7b` Large model, requires 24 GB VRAM at num_beams 3 + +The small model with more beams (ex. 16) performs well with details and should not be immediately discounted. + +The larger model is more accurate with proper names (i.e. identifying well-known celebrities, objects, or locations) and seems to exhibit a larger vocabulary. + +Primary params: `--num_beams 3` increasing uses more VRAM but may improve detail, also can increase hallucinations `--min_new_tokens 20` and `--max_new_tokens 35` control the length of the caption