update caption doc

Victor Hall 2023-06-30 00:37:49 -04:00
parent 41eaa09938
commit acac3f1547
2 changed files with 20 additions and 4 deletions


@@ -84,8 +84,8 @@ def main(args):
         lang_encoder_path="anas-awadalla/mpt-7b"
         tokenizer_path="anas-awadalla/mpt-7b"
     elif "mpt1b" in args.model:
-        lang_encoder_path="anas-awadalla/mpt-1b"
-        tokenizer_path="anas-awadalla/mpt-1b"
+        lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b"
+        tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b"
 
     model, image_processor, tokenizer = create_model_and_transforms(
         clip_vision_encoder_path="ViT-L-14",
@@ -141,7 +141,7 @@ def main(args):
     input_ids = lang_x["input_ids"].to(device)
 
-    with torch.cuda.amp.autocast(dtype=dtype):
+    with torch.cuda.amp.autocast(dtype=dtype), torch.no_grad():
         generated_text = model.generate(
             vision_x=vision_x,
             lang_x=input_ids,
@@ -171,6 +171,7 @@ def main(args):
     if not os.path.exists(name):
         with open(f"{name}.txt", "w") as f:
             f.write(generated_text)
+    print("Done!")
 
 if __name__ == "__main__":
     print(f"Available models:")
@@ -188,7 +189,7 @@ if __name__ == "__main__":
     parser.add_argument("--prompt", type=str, default="Output: ", help="prompt to use for generation, default is 'Output: '")
     parser.add_argument("--temperature", type=float, default=1.0, help="temperature for sampling, 1.0 is default")
     parser.add_argument("--top_k", type=int, default=0, help="top_k sampling, 0 is default")
-    parser.add_argument("--top_p", type=float, default=0.9, help="top_p sampling, 0.9 is default")
+    parser.add_argument("--top_p", type=float, default=1.0, help="top_p sampling, 1.0 is default")
     parser.add_argument("--repetition_penalty", type=float, default=1.0, help="repetition penalty, 1.0 is default")
     parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty, 1.0 is default")
     args = parser.parse_args()


@@ -4,6 +4,21 @@
`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"`
This script uses two example image/caption pairs located in the example folder to prime the system to caption, then captions the images in the input folder. It will save a `.txt` file with the same base filename, containing the caption, in the same folder.
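For example, captioning `input/photo1.jpg` writes `input/photo1.txt` alongside it (the filename here is just illustrative).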
This script currently requires an Ampere or newer GPU because it uses bfloat16.
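If you are unsure whether your GPU supports bfloat16, a quick check from a Python shell (a minimal sketch; it relies only on PyTorch's standard `torch.cuda.is_bf16_supported()` helper):

```python
import torch

# bfloat16 requires an Ampere (compute capability 8.x) or newer NVIDIA GPU
print(torch.cuda.is_available() and torch.cuda.is_bf16_supported())
```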
**Trying out different example image/caption pairs will influence how the system captions the input images.**
Supported models:
`openflamingo/OpenFlamingo-3B-vitl-mpt1b` Small model, requires 8 GB VRAM at num_beams 3, or 12 GB at num_beams 16
`openflamingo/OpenFlamingo-9B-vitl-mpt7b` Large model, requires 24 GB VRAM at num_beams 3
The small model with more beams (e.g. 16) performs well on details and should not be immediately discounted.
The larger model is more accurate with proper names (i.e. identifying well-known celebrities, objects, or locations) and seems to exhibit a larger vocabulary.
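For example, to try the small model with a higher beam count, reuse the command from the top of this doc with only the model and beam count changed:

`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 16 --model "openflamingo/OpenFlamingo-3B-vitl-mpt1b"`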
Primary params:
`--num_beams 3` increasing this uses more VRAM but may improve detail; it can also increase hallucinations
`--min_new_tokens 20` and `--max_new_tokens 35` control the length of the caption
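The script also accepts sampling flags (`--temperature`, `--top_k`, `--top_p`, `--repetition_penalty`, `--length_penalty`; visible in the argparse section of the diff above). A combined example, with illustrative rather than recommended values:

`python caption_fl.py --data_root input --num_beams 3 --min_new_tokens 20 --max_new_tokens 35 --top_p 0.9 --repetition_penalty 1.2 --model "openflamingo/OpenFlamingo-3B-vitl-mpt1b"`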