From fb0840944c4deb0b7d32feebd35a607af0cde550 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-41-161.ec2.internal>
Date: Tue, 6 Jun 2023 11:56:10 +0000
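Subject: [PATCH] Reduce autotune benchmark reps from 40 to 10

Lower the `rep` argument passed to triton.testing.do_bench in
Autotuner (custom_autotune.py) from 40 to 10.

Note: the comment directly above the call still says that 40 reps are
used; it is left untouched by this patch and should be updated in a
follow-up.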

---
 server/text_generation_server/quant/custom_autotune.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/text_generation_server/quant/custom_autotune.py b/server/text_generation_server/quant/custom_autotune.py
index 875c832e..85b6115d 100644
--- a/server/text_generation_server/quant/custom_autotune.py
+++ b/server/text_generation_server/quant/custom_autotune.py
@@ -69,7 +69,7 @@ class Autotuner(triton.KernelInterface):
         try:
             # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses
             # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default
-            return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=40)
+            return triton.testing.do_bench(kernel_call, percentiles=(0.5, 0.2, 0.8), rep=10)
         except triton.compiler.OutOfResources:
             return (float('inf'), float('inf'), float('inf'))