update caption doc

parent 41eaa09938
commit acac3f1547
@@ -84,8 +84,8 @@ def main(args):
        lang_encoder_path="anas-awadalla/mpt-7b"
        tokenizer_path="anas-awadalla/mpt-7b"
    elif "mpt1b" in args.model:
-       lang_encoder_path="anas-awadalla/mpt-1b"
-       tokenizer_path="anas-awadalla/mpt-1b"
+       lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b"
+       tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b"

    model, image_processor, tokenizer = create_model_and_transforms(
        clip_vision_encoder_path="ViT-L-14",
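For context, a minimal sketch of how these paths plug into model construction, assuming the published OpenFlamingo `create_model_and_transforms` API; the `clip_vision_encoder_pretrained` and `cross_attn_every_n_layers` values below are assumptions based on the OpenFlamingo-3B-vitl-mpt1b release, not taken from this diff:

```python
# Sketch only: building the 3B model with the updated language-encoder paths.
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",   # assumed, not in this diff
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,               # assumed 3B setting
)
```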

@@ -141,7 +141,7 @@ def main(args):
    input_ids = lang_x["input_ids"].to(device)

-   with torch.cuda.amp.autocast(dtype=dtype):
+   with torch.cuda.amp.autocast(dtype=dtype), torch.no_grad():
        generated_text = model.generate(
            vision_x=vision_x,
            lang_x=input_ids,
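The change above adds `torch.no_grad()` alongside autocast; a minimal sketch of that inference pattern (variable names are illustrative and `attention_mask` handling is omitted):

```python
import torch

# bfloat16 autocast keeps activations small; no_grad() skips building the
# autograd graph, which reduces VRAM during beam search.
with torch.cuda.amp.autocast(dtype=torch.bfloat16), torch.no_grad():
    generated_ids = model.generate(
        vision_x=vision_x,      # preprocessed image batch
        lang_x=input_ids,       # tokenized prompt
        max_new_tokens=30,
        num_beams=3,
    )
generated_text = tokenizer.decode(generated_ids[0])
```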

@@ -171,6 +171,7 @@ def main(args):
    if not os.path.exists(name):
        with open(f"{name}.txt", "w") as f:
            f.write(generated_text)
+   print("Done!")

if __name__ == "__main__":
    print(f"Available models:")

@@ -188,7 +189,7 @@ if __name__ == "__main__":
    parser.add_argument("--prompt", type=str, default="Output: ", help="prompt to use for generation, default is 'Output: '")
    parser.add_argument("--temperature", type=float, default=1.0, help="temperature for sampling, 1.0 is default")
    parser.add_argument("--top_k", type=int, default=0, help="top_k sampling, 0 is default")
-   parser.add_argument("--top_p", type=float, default=0.9, help="top_p sampling, 0.9 is default")
+   parser.add_argument("--top_p", type=float, default=1.0, help="top_p sampling, 1.0 is default")
    parser.add_argument("--repetition_penalty", type=float, default=1.0, help="repetition penalty, 1.0 is default")
    parser.add_argument("--length_penalty", type=float, default=1.0, help="length penalty, 1.0 is default")
    args = parser.parse_args()
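Since the `--top_p` default moves from 0.9 to 1.0 here, the previous sampling behaviour can still be requested on the command line; an illustrative invocation (flag names come from the argparse block above, values are placeholders):

`python caption_fl.py --data_root input --model "openflamingo/OpenFlamingo-3B-vitl-mpt1b" --top_p 0.9 --temperature 0.7`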

@@ -4,6 +4,21 @@

`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 30 --num_beams 3 --model "openflamingo/OpenFlamingo-9B-vitl-mpt7b"`

This script uses two example image/caption pairs located in the example folder to prime the system to caption, then captions the images in the input folder. It saves a `.txt` file with the same base filename, containing the caption, in the same folder.
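The exact priming text is not part of this diff, but OpenFlamingo-style few-shot prompting typically interleaves the example pairs and the query roughly as below; the captions in this sketch are placeholders, and the trailing "Output: " corresponds to the `--prompt` default:

```python
# Hypothetical two-shot priming prompt; the real example captions come from
# the example folder, not from this string.
prompt = (
    "<image>Output: a dog running on a beach.<|endofchunk|>"
    "<image>Output: a bowl of fruit on a wooden table.<|endofchunk|>"
    "<image>Output: "
)
```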

This script currently requires an Ampere or newer GPU because it uses bfloat16.
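If you are unsure whether a GPU qualifies, a quick generic PyTorch check (not something the script itself runs) is:

```python
import torch

# Ampere (compute capability 8.x) and newer GPUs report native bfloat16 support.
print(torch.cuda.get_device_capability())
print(torch.cuda.is_bf16_supported())
```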

**Trying out different example image/caption pairs will influence how the system captions the input images.**

Supported models:
`openflamingo/OpenFlamingo-3B-vitl-mpt1b` Small model, requires 8 GB VRAM at num_beams 3, or 12 GB at num_beams 16
`openflamingo/OpenFlamingo-9B-vitl-mpt7b` Large model, requires 24 GB VRAM at num_beams 3

The small model with more beams (e.g. 16) performs well with details and should not be immediately discounted.

The larger model is more accurate with proper names (e.g. identifying well-known celebrities, objects, or locations) and seems to exhibit a larger vocabulary.

Primary params:
`--num_beams 3` increasing this uses more VRAM and may improve detail, but can also increase hallucinations
`--min_new_tokens 20` and `--max_new_tokens 35` control the length of the caption
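For example, a plausible run of the small model with more beams and a slightly longer caption budget (illustrative values, mirroring the command at the top of this document):

`python caption_fl.py --data_root input --min_new_tokens 20 --max_new_tokens 35 --num_beams 16 --model "openflamingo/OpenFlamingo-3B-vitl-mpt1b"`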