From 83634dc1227d24624514a9577050886e712ffb13 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Tue, 25 Jun 2024 16:15:46 +0800
Subject: [PATCH] use xpu-smi to dump used memory (#2047)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* use xpu-smi to dump used memory

xpu use "ZE_AFFINITY_MASK" to control card, usage is like CUDA_VISIBLE_DEVICES

Signed-off-by: Wang, Yi A

* Update server/text_generation_server/utils/import_utils.py

Co-authored-by: Daniël de Kok

---------

Signed-off-by: Wang, Yi A
Co-authored-by: Daniël de Kok
---
 Dockerfile_intel                                     | 2 +-
 launcher/src/main.rs                                 | 9 ++++++---
 server/text_generation_server/utils/import_utils.py | 9 +++++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/Dockerfile_intel b/Dockerfile_intel
index 131f49ba..f09614d4 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
 
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 14d03429..2e06c1ef 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -759,7 +759,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        }
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
@@ -832,9 +835,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index d79e36c2..c3929392 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess
 
 
 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
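
Note (editorial, not part of the patch): a minimal standalone sketch of the free-memory logic the diff introduces, assuming the output of "xpu-smi dump -d <id> -m 18 -n 1" is a CSV header line followed by one sample whose last column is used memory in MiB, which is what the patched code relies on. The helper name estimate_xpu_free_memory and its signature are illustrative only.

    import subprocess

    import torch


    def estimate_xpu_free_memory(device: torch.device) -> int:
        # Total device memory as reported by the XPU runtime.
        total_memory = torch.xpu.get_device_properties(device).total_memory
        # One sample of metric 18 (used memory, per the patch) for this card via xpu-smi.
        query = f"xpu-smi dump -d {device.index} -m 18 -n 1"
        output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
        # output[0] is the CSV header; the sample's last column is MiB used, convert to bytes.
        used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
        # Keep 5% of total memory as headroom, mirroring the 0.95 factor in the patch.
        return int(total_memory * 0.95 - used_memory)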