This commit is contained in:
Mohit Sharma 2024-09-11 06:52:30 +00:00
parent 3f2dc61500
commit f3bc038430
1 changed file with 4 additions and 0 deletions

View File

@@ -1162,6 +1162,10 @@ class FlashCausalLM(Model):
f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
)
torch.cuda.tunable.set_filename(
tunableop_filepath, insert_device_ordinal=False
)
if os.path.isfile(tunableop_filepath):
log_master(
logger.info,