From fb0840944c4deb0b7d32feebd35a607af0cde550 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 6 Jun 2023 11:56:10 +0000 Subject: [PATCH] Reducing number of reps while autotuning. --- server/text_generation_server/quant/custom_autotune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/text_generation_server/quant/custom_autotune.py b/server/text_generation_server/quant/custom_autotune.py index 875c832e..85b6115d 100644 --- a/server/text_generation_server/quant/custom_autotune.py +++ b/server/text_generation_server/quant/custom_autotune.py @@ -69,7 +69,7 @@ class Autotuner(triton.KernelInterface): try: # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default - return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40) + return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=10) except triton.compiler.OutOfResources: return (float('inf'), float('inf'), float('inf'))