diff --git a/.github/workflows/load_test.yaml b/.github/workflows/load_test.yaml
index 88e80b90..0f9407d8 100644
--- a/.github/workflows/load_test.yaml
+++ b/.github/workflows/load_test.yaml
@@ -21,7 +21,7 @@ jobs:
       group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
       cancel-in-progress: true
     runs-on:
-      group: aws-g6-12xlarge-plus-priv
+      group: aws-g6-12xl-plus-priv-cache
     env:
       DOCKER_VOLUME: /cache
     steps:
@@ -41,8 +41,10 @@ jobs:

       - name: Run bench test
         run: |
+          export PATH="$HOME/.local/bin:$PATH"
           cd load_tests
-          python benchmarks.py
+          poetry install
+          poetry run python benchmarks.py
         shell: bash
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
diff --git a/load_tests/benchmarks.py b/load_tests/benchmarks.py
index 5697585f..6c681a67 100644
--- a/load_tests/benchmarks.py
+++ b/load_tests/benchmarks.py
@@ -1,6 +1,7 @@
 import json
 import os
 import traceback
+from typing import Dict, Tuple, List

 import GPUtil
 import docker
@@ -13,7 +14,7 @@ class InferenceEngineRunner:
     def __init__(self, model: str):
         self.model = model

-    def run(self, parameters: list[tuple]):
+    def run(self, parameters: list[tuple], gpus: int = 0):
         NotImplementedError("This method should be implemented by the subclass")

     def stop(self):
@@ -32,7 +33,7 @@ class TGIDockerRunner(InferenceEngineRunner):
         self.image = image
         self.volumes = volumes

-    def run(self, parameters: list[tuple]):
+    def run(self, parameters: list[tuple], gpus: int = 0):
         params = f"--model-id {self.model} --port 8080"
         for p in parameters:
             params += f" --{p[0]} {str(p[1])}"
@@ -43,7 +44,10 @@ class TGIDockerRunner(InferenceEngineRunner):
         self.container = run_docker(self.image, params,
                                     "Connected",
                                     "ERROR",
-                                    volumes=volumes)
+                                    volumes=volumes,
+                                    gpus=gpus,
+                                    ports={"8080/tcp": 8080}
+                                    )

     def stop(self):
         if self.container:
@@ -53,15 +57,15 @@
 class BenchmarkRunner:
     def __init__(self,
                  image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
-                 volumes=None):
+                 volumes: List[Tuple[str, str]] = None):
         if volumes is None:
             volumes = []
         self.container = None
         self.image = image
         self.volumes = volumes

-    def run(self, parameters: list[tuple]):
-        params = ""
+    def run(self, parameters: list[tuple], network_mode):
+        params = "text-generation-inference-benchmark"
         for p in parameters:
             params += f" --{p[0]} {str(p[1])}" if p[1] is not None else f" --{p[0]}"
         logger.info(f"Running text-generation-inference-benchmarks with parameters: {params}")
@@ -70,8 +74,11 @@ class BenchmarkRunner:
         volumes = {}
         for v in self.volumes:
             volumes[v[0]] = {"bind": v[1], "mode": "rw"}
         self.container = run_docker(self.image, params,
                                     "Benchmark finished",
-                                    "Error",
-                                    volumes=volumes)
+                                    "Fatal:",
+                                    volumes=volumes,
+                                    extra_env={"RUST_LOG": "text_generation_inference_benchmark=info",
+                                               "RUST_BACKTRACE": "full"},
+                                    network_mode=network_mode)

     def stop(self):
         if self.container:
@@ -79,23 +86,31 @@
 def run_docker(image: str, args: str, success_sentinel: str,
-               error_sentinel: str, volumes=None, gpus: int = 0) -> Container:
+               error_sentinel: str, ports: Dict[str, int] = None, volumes=None, network_mode: str = "bridge",
+               gpus: int = 0, extra_env: Dict[str, str] = None) -> Container:
+    if ports is None:
+        ports = {}
     if volumes is None:
         volumes = {}
-    client = docker.from_env()
+    if extra_env is None:
+        extra_env = {}
+    client = docker.from_env(timeout=300)
     # retrieve the GPU devices from CUDA_VISIBLE_DEVICES
     devices = [f"{i}" for i in range(get_num_gpus())][:gpus]
+    environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
+    environment.update(extra_env)
     container = client.containers.run(image, args,
                                       detach=True,
                                       device_requests=[
                                           docker.types.DeviceRequest(device_ids=devices,
-                                                                     capabilities=[['gpu']]) if gpus > 0 else None
-                                      ],
+                                                                     capabilities=[['gpu']])
+                                      ] if gpus > 0 else None,
                                       volumes=volumes,
                                       shm_size="1g",
-                                      ports={"8080/tcp": 8080},
-                                      environment={"HF_TOKEN": os.environ.get("HF_TOKEN")}, )
+                                      ports=ports,
+                                      network_mode=network_mode,
+                                      environment=environment, )
     for line in container.logs(stream=True):
         print(line.decode("utf-8"), end="")
         if success_sentinel.encode("utf-8") in line:
@@ -145,6 +160,8 @@ def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:

 def main():
     results_dir = 'results'
+    # get absolute path
+    results_dir = os.path.join(os.path.dirname(__file__), results_dir)
     logger.info('Starting benchmark')
     models = [
         ('meta-llama/Llama-3.1-8B-Instruct', 1),
@@ -152,15 +169,17 @@ def main():
         # ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
     ]
     sha = os.environ.get('GITHUB_SHA')
-    # create results directory
-    os.makedirs(results_dir, exist_ok=True)
+    success = True
     for model in models:
         tgi_runner = TGIDockerRunner(model[0])
+        # create results directory
+        model_dir = os.path.join(results_dir, f'{model[0].replace("/", "_").replace(".", "_")}')
+        os.makedirs(model_dir, exist_ok=True)
         runner = BenchmarkRunner(
-            volumes=['results', '/opt/text-generation-inference-benchmark/results']
+            volumes=[(model_dir, '/opt/text-generation-inference-benchmark/results')]
         )
         try:
-            tgi_runner.run([('max-concurrent-requests', 512)])
+            tgi_runner.run([('max-concurrent-requests', 512)], gpus=model[1])
             logger.info(f'TGI started for model {model[0]}')
             parameters = [
                 ('tokenizer-name', model[0]),
@@ -171,27 +190,38 @@ def main():
                 ('benchmark-kind', 'rate'),
                 ('prompt-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
                 ('decode-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
-                ('extra-meta', f'engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}'),
-                ('--no-console', None)
+                ('extra-meta', f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"'),
+                ('no-console', None)
             ]
-            runner.run(parameters)
+            rates = [('rates', f'{r / 10.}') for r in list(range(8, 248, 8))]
+            parameters.extend(rates)
+            runner.run(parameters, f'container:{tgi_runner.container.id}')
         except Exception as e:
             logger.error(f'Error running benchmark for model {model[0]}: {e}')
             # print the stack trace
             print(traceback.format_exc())
+            success = False
         finally:
             tgi_runner.stop()
             runner.stop()
-    # list json files in results directory
-    data_files = {}
+    if not success:
+        logger.error('Some benchmarks failed')
+        exit(1)
+
     df = pd.DataFrame()
-    for filename in os.listdir(results_dir):
-        if filename.endswith('.json'):
-            data_files[filename.split('.')[-2]] = f'{results_dir}/{filename}'
-    df = pd.concat([df, build_df(results_dir.split('/')[-1], data_files)])
+    # list recursively directories
+    directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')]
+    logger.info(f'Found result directories: {directories}')
+    for directory in directories:
+        data_files = {}
+        for filename in os.listdir(directory):
+            if filename.endswith('.json'):
+                data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
+        logger.info(f'Processing directory {directory}')
+        df = pd.concat([df, build_df(directory.split('/')[-1], data_files)])
     df['device'] = get_gpu_name()
     df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
-    df.to_parquet('s3://text-generation-inference-ci/benchmarks/ci/')
+    df.to_parquet(f's3://text-generation-inference-ci/benchmarks/ci/{sha}.parquet')


 if __name__ == "__main__":