Update server tests

- Default to throughput test in k6
- Use TGI_WIGGLE_ROOM to adjust wiggle room
This commit is contained in:
parent 12325564dc
commit 8d01848370
@@ -33,13 +33,13 @@ export function get_options() {
       // rate: 20,
       // timeUnit: '1s',
       // },
-      load_test: {
-        executor: 'constant-arrival-rate',
-        duration: '60s',
-        preAllocatedVUs: 100,
-        rate: 1,
-        timeUnit: '1s',
-      },
+      // load_test: {
+      //   executor: 'constant-arrival-rate',
+      //   duration: '60s',
+      //   preAllocatedVUs: 100,
+      //   rate: 1,
+      //   timeUnit: '1s',
+      // },
       // breakpoint: {
       //   executor: 'ramping-arrival-rate', //Assure load increase if the system slows
       //   preAllocatedVUs: 300,
@@ -47,12 +47,12 @@ export function get_options() {
       //   { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
       // ],
       // },
-      // throughput: {
-      //   executor: 'shared-iterations',
-      //   vus: 100,
-      //   iterations: 200,
-      //   maxDuration: '40s',
-      // },
+      throughput: {
+        executor: 'shared-iterations',
+        vus: 100,
+        iterations: 200,
+        maxDuration: '40s',
+      },
     },
   };
 }
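
The net effect in the k6 script is that `throughput` becomes the default scenario: a `shared-iterations` executor in which 100 virtual users cooperatively drain a fixed budget of 200 iterations, stopping early once `maxDuration` ('40s') elapses. The previous constant-arrival-rate `load_test` scenario is kept commented out for reference. As a rough illustration of the `shared-iterations` semantics (this is not TGI code; `claim`, `send_request`, and `virtual_user` are invented names, and the worker count is scaled down), a minimal Python sketch:

import threading
import time

ITERATIONS = 200     # shared budget, mirrors `iterations: 200`
VUS = 10             # concurrent workers, mirrors `vus: 100` (scaled down here)
MAX_DURATION = 40.0  # seconds, mirrors `maxDuration: '40s'`

_lock = threading.Lock()
_next = 0
_deadline = time.monotonic() + MAX_DURATION


def claim() -> int:
    """Atomically claim the next iteration from the shared budget."""
    global _next
    with _lock:
        i = _next
        _next += 1
        return i


def send_request(i: int) -> None:
    # Stand-in for the HTTP request the real k6 script would issue.
    time.sleep(0.01)


def virtual_user() -> None:
    # Each "VU" keeps pulling iterations until the budget or the clock runs out.
    while time.monotonic() < _deadline:
        i = claim()
        if i >= ITERATIONS:
            return
        send_request(i)


threads = [threading.Thread(target=virtual_user) for _ in range(VUS)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(f"completed {min(_next, ITERATIONS)} iterations")

Shared iterations suit a throughput benchmark because the total amount of work is fixed: the measured quantity is how fast the pool of VUs can finish it, rather than whether the server keeps up with a fixed arrival rate.
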
@@ -2,8 +2,8 @@ import pytest
 import os
 from text_generation_server.pb import generate_pb2

-os.environ["USE_PREFIX_CACHING"] = "0"
-os.environ["ATTENTION"] = "flashdecoding"
+os.environ["USE_PREFIX_CACHING"] = "1"
+os.environ["ATTENTION"] = "flashinfer"


 @pytest.fixture
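
The server tests now opt into prefix caching with the flashinfer backend. These assignments sit above any `text_generation_server` import on purpose: the globals module shown further down reads the environment while it is being imported, so values exported after that point are never seen. A minimal, runnable sketch of that capture-at-import behavior (`SETTINGS_SOURCE` and `import_settings` are invented stand-ins, not TGI code):

import os

# Stand-in for a module like text_generation_server.models.globals:
# the environment is read once, when the module body executes (i.e. at import).
SETTINGS_SOURCE = 'import os\nATTENTION = os.environ.get("ATTENTION", "flashinfer")'


def import_settings() -> dict:
    namespace: dict = {}
    exec(SETTINGS_SOURCE, namespace)  # simulates `import settings`
    return namespace


os.environ.pop("ATTENTION", None)
assert import_settings()["ATTENTION"] == "flashinfer"  # default applies

os.environ["ATTENTION"] = "flashdecoding"  # must happen BEFORE the import
assert import_settings()["ATTENTION"] == "flashdecoding"
print("env vars exported before import are the ones the server code sees")
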
@@ -43,6 +43,7 @@ from text_generation_server.models.globals import (
     ATTENTION,
     BLOCK_SIZE,
     CUDA_GRAPHS,
+    TGI_WIGGLE_ROOM,
     get_adapter_to_index,
 )
 from text_generation_server.layers.attention import Seqlen
@@ -1283,7 +1284,7 @@ class FlashCausalLM(Model):

         num_blocks = (
             # Leave 5% for some wiggle room
-            int((free_memory * 0.95) // total_cache_size)
+            int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size)
             # Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
             + batch_num_blocks
         )
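
This swaps the hard-coded 95% safety margin on free GPU memory for the configurable TGI_WIGGLE_ROOM factor introduced below. The arithmetic stays the same: discount the free memory, divide by the byte cost of one KV-cache block, and add back the blocks the warmup batch already occupies (they were allocated when free memory was measured, so they count toward the peak). A standalone sketch with made-up sizes chosen for round numbers:

import os

# Made-up example values; in TGI, free_memory and total_cache_size
# are computed during the warmup pass.
free_memory = 20 * 1024**3       # bytes free while the warmup batch is still allocated
total_cache_size = 2 * 1024**2   # byte cost of ONE KV-cache block across all layers
batch_num_blocks = 512           # blocks already held by the warmup batch

wiggle_room = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))

num_blocks = int((free_memory * wiggle_room) // total_cache_size) + batch_num_blocks
print(num_blocks)  # 9728 + 512 = 10240 with the default 0.95

Exporting TGI_WIGGLE_ROOM=0.9 before launch would shrink this example to 9216 + 512 = 9728 blocks, trading KV-cache capacity for extra memory headroom.
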
@@ -14,10 +14,13 @@ assert (
 ), f"Attention is not valid {ATTENTION}, expected {_expected}"
 log_master(logger.info, f"Using Attention = {ATTENTION}")

-# if PREFIX_CACHING and ATTENTION != "flashinfer":
-#     raise RuntimeError("Prefix caching is only supported with flashinfer")
+if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}:
+    raise RuntimeError("Prefix caching is only supported with flashinfer")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
+assert TGI_WIGGLE_ROOM > 0
+assert TGI_WIGGLE_ROOM < 1

 # This is overridden by the cli
 BLOCK_SIZE: int
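
Finally, the globals module both widens the prefix-caching check to accept flashdecoding alongside flashinfer and sources the wiggle-room factor from the environment, rejecting anything outside the open interval (0, 1) as soon as the module is imported. A small self-contained restatement of that validation (a plain dict stands in for os.environ; `parse_wiggle_room` is an invented helper, not a TGI function):

def parse_wiggle_room(env: dict) -> float:
    # Mirrors the globals logic: default 0.95, must lie strictly between 0 and 1.
    value = float(env.get("TGI_WIGGLE_ROOM", "0.95"))
    assert value > 0
    assert value < 1
    return value


assert parse_wiggle_room({}) == 0.95
assert parse_wiggle_room({"TGI_WIGGLE_ROOM": "0.9"}) == 0.9

for bad in ("0", "1", "1.5", "-0.1"):
    try:
        parse_wiggle_room({"TGI_WIGGLE_ROOM": bad})
    except AssertionError:
        pass  # expected: out-of-range factors are rejected at import time
    else:
        raise AssertionError(f"{bad} should have been rejected")

In practice an operator can export, say, TGI_WIGGLE_ROOM=0.90 before starting the server to keep more memory in reserve, and a typo such as 9.5 fails fast at startup instead of silently over-allocating the KV cache.
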