Update server tests

- Default to throughput test in k6
- Use TGI_WIGGLE_ROOM to adjust wiggle room
Nicolas Patry 2024-08-28 15:42:05 +02:00
parent 12325564dc
commit 8d01848370
4 changed files with 22 additions and 18 deletions

View File

@@ -33,13 +33,13 @@ export function get_options() {
// rate: 20,
// timeUnit: '1s',
// },
-load_test: {
-executor: 'constant-arrival-rate',
-duration: '60s',
-preAllocatedVUs: 100,
-rate: 1,
-timeUnit: '1s',
-},
+// load_test: {
+// executor: 'constant-arrival-rate',
+// duration: '60s',
+// preAllocatedVUs: 100,
+// rate: 1,
+// timeUnit: '1s',
+// },
// breakpoint: {
// executor: 'ramping-arrival-rate', //Assure load increase if the system slows
// preAllocatedVUs: 300,
@@ -47,12 +47,12 @@ export function get_options() {
// { duration: '60s', target: 100 }, // just slowly ramp-up to a HUGE load
// ],
// },
-// throughput: {
-// executor: 'shared-iterations',
-// vus: 100,
-// iterations: 200,
-// maxDuration: '40s',
-// },
+throughput: {
+executor: 'shared-iterations',
+vus: 100,
+iterations: 200,
+maxDuration: '40s',
+},
},
};
}

View File

@@ -2,8 +2,8 @@ import pytest
import os
from text_generation_server.pb import generate_pb2
-os.environ["USE_PREFIX_CACHING"] = "0"
-os.environ["ATTENTION"] = "flashdecoding"
+os.environ["USE_PREFIX_CACHING"] = "1"
+os.environ["ATTENTION"] = "flashinfer"
@pytest.fixture
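
globals.py evaluates these variables once, at import time, so the assignments have to run before any text_generation_server module that pulls in globals is imported. A minimal ordering sketch (hypothetical, not part of this diff):

import os

# Set the knobs first: globals.py reads them when it is imported.
os.environ["USE_PREFIX_CACHING"] = "1"
os.environ["ATTENTION"] = "flashinfer"

# Importing afterwards picks the values up; importing earlier would freeze
# whatever happened to be in the environment instead.
from text_generation_server.models.globals import ATTENTION  # noqa: E402

assert ATTENTION == "flashinfer"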

View File

@@ -43,6 +43,7 @@ from text_generation_server.models.globals import (
ATTENTION,
BLOCK_SIZE,
CUDA_GRAPHS,
+TGI_WIGGLE_ROOM,
get_adapter_to_index,
)
from text_generation_server.layers.attention import Seqlen
@@ -1283,7 +1284,7 @@ class FlashCausalLM(Model):
num_blocks = (
# Leave 5% for some wiggle room
-int((free_memory * 0.95) // total_cache_size)
+int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size)
# Add batch.num_blocks as we allocated it above, so it is included in the peak memory.
+ batch_num_blocks
)
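
For a sense of what the new knob changes here, a small standalone sketch of the same block-budget arithmetic; free_memory, total_cache_size and batch_num_blocks are made-up example values, only the formula mirrors the line above:

import os

TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))

free_memory = 20 * 1024**3      # example: bytes reported free after warmup
total_cache_size = 2 * 1024**2  # example: bytes needed per KV-cache block
batch_num_blocks = 64           # example: blocks already allocated for the warmup batch

num_blocks = (
    # Only the TGI_WIGGLE_ROOM fraction (default 95%) of free memory is budgeted.
    int((free_memory * TGI_WIGGLE_ROOM) // total_cache_size)
    # The warmup batch's blocks were allocated before free_memory was measured,
    # so they are added back on top.
    + batch_num_blocks
)
print(f"KV cache budget: {num_blocks} blocks")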

View File

@@ -14,10 +14,13 @@ assert (
), f"Attention is not valid {ATTENTION}, expected {_expected}"
log_master(logger.info, f"Using Attention = {ATTENTION}")
-# if PREFIX_CACHING and ATTENTION != "flashinfer":
-#     raise RuntimeError("Prefix caching is only supported with flashinfer")
+if PREFIX_CACHING and ATTENTION not in {"flashinfer", "flashdecoding"}:
+    raise RuntimeError("Prefix caching is only supported with flashinfer")
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
+assert TGI_WIGGLE_ROOM > 0
+assert TGI_WIGGLE_ROOM < 1
# This is overridden by the cli
BLOCK_SIZE: int
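
A minimal usage sketch for the new override; the 0.90 value is only an example, anything outside the open interval (0, 1) fails the asserts above, and the variable has to be set before this module is imported:

import os

# Example: keep 10% of free GPU memory as headroom instead of the default 5%.
os.environ["TGI_WIGGLE_ROOM"] = "0.90"

wiggle = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
assert 0 < wiggle < 1, f"TGI_WIGGLE_ROOM must be in (0, 1), got {wiggle}"
print(wiggle)  # 0.9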