fix: Update runners group

Hugo Larcher 2024-09-30 18:00:54 +02:00
parent fc7dcb0ba6
commit 2980720af4
2 changed files with 62 additions and 30 deletions


@@ -21,7 +21,7 @@ jobs:
group: ${{ github.workflow }}-${{ github.job }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
runs-on:
group: aws-g6-12xlarge-plus-priv
group: aws-g6-12xl-plus-priv-cache
env:
DOCKER_VOLUME: /cache
steps:
@@ -41,8 +41,10 @@ jobs:
- name: Run bench test
run: |
export PATH="$HOME/.local/bin:$PATH"
cd load_tests
python benchmarks.py
poetry install
poetry run python benchmarks.py
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN_BENCHMARK }}
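
Note: the step now installs the benchmark's dependencies with poetry before running it. A rough local equivalent of the updated step, as a sketch only (the token handling is an assumption; the workflow injects secrets.HF_TOKEN_BENCHMARK):

    import os
    import subprocess

    # stand-in for the HF_TOKEN the workflow provides from its secrets
    env = dict(os.environ, HF_TOKEN="<your token>")
    subprocess.run(["poetry", "install"], cwd="load_tests", check=True, env=env)
    subprocess.run(["poetry", "run", "python", "benchmarks.py"], cwd="load_tests", check=True, env=env)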


@@ -1,6 +1,7 @@
import json
import os
import traceback
from typing import Dict, Tuple, List
import GPUtil
import docker
@@ -13,7 +14,7 @@ class InferenceEngineRunner:
def __init__(self, model: str):
self.model = model
def run(self, parameters: list[tuple]):
def run(self, parameters: list[tuple], gpus: int = 0):
raise NotImplementedError("This method should be implemented by the subclass")
def stop(self):
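
Note: the base interface now threads a gpus count through run(). A minimal sketch of what a subclass is expected to honour (the class name is hypothetical, not part of this commit):

    class DummyRunner(InferenceEngineRunner):
        def run(self, parameters: list[tuple], gpus: int = 0):
            # no-op engine: just record what it was asked to do
            self.last_call = (parameters, gpus)

        def stop(self):
            pass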
@@ -32,7 +33,7 @@ class TGIDockerRunner(InferenceEngineRunner):
self.image = image
self.volumes = volumes
def run(self, parameters: list[tuple]):
def run(self, parameters: list[tuple], gpus: int = 0):
params = f"--model-id {self.model} --port 8080"
for p in parameters:
params += f" --{p[0]} {str(p[1])}"
@@ -43,7 +44,10 @@ class TGIDockerRunner(InferenceEngineRunner):
self.container = run_docker(self.image, params,
"Connected",
"ERROR",
volumes=volumes)
volumes=volumes,
gpus=gpus,
ports={"8080/tcp": 8080}
)
def stop(self):
if self.container:
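
Note: TGIDockerRunner.run() now takes the GPU count explicitly and publishes port 8080 itself. A minimal usage sketch, mirroring the call made in main() below (the parameter list is illustrative):

    tgi = TGIDockerRunner("meta-llama/Llama-3.1-8B-Instruct")
    try:
        # one GPU; run_docker maps 8080/tcp in the container to 8080 on the host
        tgi.run([("max-concurrent-requests", 512)], gpus=1)
    finally:
        tgi.stop()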
@@ -53,15 +57,15 @@ class TGIDockerRunner(InferenceEngineRunner):
class BenchmarkRunner:
def __init__(self,
image: str = "ghcr.io/huggingface/text-generation-inference-benchmark:latest",
volumes=None):
volumes: List[Tuple[str, str]] = None):
if volumes is None:
volumes = []
self.container = None
self.image = image
self.volumes = volumes
def run(self, parameters: list[tuple]):
params = ""
def run(self, parameters: list[tuple], network_mode):
params = "text-generation-inference-benchmark"
for p in parameters:
params += f" --{p[0]} {str(p[1])}" if p[1] is not None else f" --{p[0]}"
logger.info(f"Running text-generation-inference-benchmarks with parameters: {params}")
@@ -70,8 +74,11 @@ class BenchmarkRunner:
volumes[v[0]] = {"bind": v[1], "mode": "rw"}
self.container = run_docker(self.image, params,
"Benchmark finished",
"Error",
volumes=volumes)
"Fatal:",
volumes=volumes,
extra_env={"RUST_LOG": "text_generation_inference_benchmark=info",
"RUST_BACKTRACE": "full"},
network_mode=network_mode)
def stop(self):
if self.container:
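
Note: BenchmarkRunner.run() now requires a network_mode, so the benchmark container can join the TGI container's network namespace instead of relying on a published port. A sketch of the intended call (the host volume path is a placeholder):

    benchmark = BenchmarkRunner(
        volumes=[("/tmp/results", "/opt/text-generation-inference-benchmark/results")]  # placeholder host path
    )
    # "container:<id>" shares the TGI container's network namespace,
    # so the benchmark reaches the server on localhost:8080
    benchmark.run([("tokenizer-name", "meta-llama/Llama-3.1-8B-Instruct")],
                  f"container:{tgi.container.id}")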
@@ -79,23 +86,31 @@ class BenchmarkRunner:
def run_docker(image: str, args: str, success_sentinel: str,
error_sentinel: str, volumes=None, gpus: int = 0) -> Container:
error_sentinel: str, ports: Dict[str, int] = None, volumes=None, network_mode: str = "bridge",
gpus: int = 0, extra_env: Dict[str, str] = None) -> Container:
if ports is None:
ports = {}
if volumes is None:
volumes = {}
client = docker.from_env()
if extra_env is None:
extra_env = {}
client = docker.from_env(timeout=300)
# retrieve the GPU devices from CUDA_VISIBLE_DEVICES
devices = [f"{i}" for i in
range(get_num_gpus())][:gpus]
environment = {"HF_TOKEN": os.environ.get("HF_TOKEN")}
environment.update(extra_env)
container = client.containers.run(image, args,
detach=True,
device_requests=[
docker.types.DeviceRequest(device_ids=devices,
capabilities=[['gpu']]) if gpus > 0 else None
],
capabilities=[['gpu']])
] if gpus > 0 else None,
volumes=volumes,
shm_size="1g",
ports={"8080/tcp": 8080},
environment={"HF_TOKEN": os.environ.get("HF_TOKEN")}, )
ports=ports,
network_mode=network_mode,
environment=environment, )
for line in container.logs(stream=True):
print(line.decode("utf-8"), end="")
if success_sentinel.encode("utf-8") in line:
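
Note: run_docker() gains ports, network_mode, gpus and extra_env keyword arguments, and only builds a DeviceRequest when gpus > 0. A hedged sketch of a call using the new signature (image tag, sentinels and values are placeholders):

    container = run_docker(
        "ghcr.io/huggingface/text-generation-inference:latest",  # placeholder image tag
        "--model-id meta-llama/Llama-3.1-8B-Instruct --port 8080",
        "Connected",                      # success sentinel looked for in the container logs
        "ERROR",                          # error sentinel
        ports={"8080/tcp": 8080},
        volumes={"/cache": {"bind": "/data", "mode": "rw"}},
        gpus=1,                           # requests the first visible GPU via a DeviceRequest
        extra_env={"RUST_LOG": "info"},
        network_mode="bridge",
    )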
@@ -145,6 +160,8 @@ def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
def main():
results_dir = 'results'
# get absolute path
results_dir = os.path.join(os.path.dirname(__file__), results_dir)
logger.info('Starting benchmark')
models = [
('meta-llama/Llama-3.1-8B-Instruct', 1),
@@ -152,15 +169,17 @@ def main():
# ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2),
]
sha = os.environ.get('GITHUB_SHA')
# create results directory
os.makedirs(results_dir, exist_ok=True)
success = True
for model in models:
tgi_runner = TGIDockerRunner(model[0])
# create per-model results directory
model_dir = os.path.join(results_dir, f'{model[0].replace("/", "_").replace(".", "_")}')
os.makedirs(model_dir, exist_ok=True)
runner = BenchmarkRunner(
volumes=['results', '/opt/text-generation-inference-benchmark/results']
volumes=[(model_dir, '/opt/text-generation-inference-benchmark/results')]
)
try:
tgi_runner.run([('max-concurrent-requests', 512)])
tgi_runner.run([('max-concurrent-requests', 512)], gpus=model[1])
logger.info(f'TGI started for model {model[0]}')
parameters = [
('tokenizer-name', model[0]),
@@ -171,27 +190,38 @@ def main():
('benchmark-kind', 'rate'),
('prompt-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
('decode-options', 'num_tokens=200,max_tokens=220,min_tokens=180,variance=10'),
('extra-meta', f'engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}'),
('--no-console', None)
('extra-meta', f'"engine=TGI,tp={model[1]},version={sha},gpu={get_gpu_name()}"'),
('no-console', None)
]
runner.run(parameters)
rates = [('rates', f'{r / 10.}') for r in list(range(8, 248, 8))]
parameters.extend(rates)
runner.run(parameters, f'container:{tgi_runner.container.id}')
except Exception as e:
logger.error(f'Error running benchmark for model {model[0]}: {e}')
# print the stack trace
print(traceback.format_exc())
success = False
finally:
tgi_runner.stop()
runner.stop()
# list json files in results directory
data_files = {}
if not success:
logger.error('Some benchmarks failed')
exit(1)
df = pd.DataFrame()
for filename in os.listdir(results_dir):
if filename.endswith('.json'):
data_files[filename.split('.')[-2]] = f'{results_dir}/{filename}'
df = pd.concat([df, build_df(results_dir.split('/')[-1], data_files)])
# recursively list per-model result directories
directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')]
logger.info(f'Found result directories: {directories}')
for directory in directories:
data_files = {}
for filename in os.listdir(directory):
if filename.endswith('.json'):
data_files[filename.split('.')[-2]] = f'{directory}/{filename}'
logger.info(f'Processing directory {directory}')
df = pd.concat([df, build_df(directory.split('/')[-1], data_files)])
df['device'] = get_gpu_name()
df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0
df.to_parquet('s3://text-generation-inference-ci/benchmarks/ci/')
df.to_parquet(f's3://text-generation-inference-ci/benchmarks/ci/{sha}.parquet')
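
Note: two quick checks of the arithmetic introduced above. The rate sweep passed to the benchmark expands to 30 rates from 0.8 to 24.0 requests per second in steps of 0.8, and error_rate is the percentage of failed requests. A minimal sketch (the request counts below are made up):

    import pandas as pd

    rates = [r / 10. for r in range(8, 248, 8)]
    assert len(rates) == 30 and rates[0] == 0.8 and rates[-1] == 24.0

    toy = pd.DataFrame({"failed_requests": [25], "successful_requests": [75]})
    toy["error_rate"] = toy["failed_requests"] / (toy["failed_requests"] + toy["successful_requests"]) * 100.0
    assert toy["error_rate"].iloc[0] == 25.0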
if __name__ == "__main__":