From 83634dc1227d24624514a9577050886e712ffb13 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Tue, 25 Jun 2024 16:15:46 +0800
Subject: [PATCH] use xpu-smi to dump used memory (#2047)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* use xpu-smi to dump used memory

xpu use "ZE_AFFINITY_MASK" to control card, usage is like CUDA_VISIBLE_DEVICES

Signed-off-by: Wang, Yi A

* Update server/text_generation_server/utils/import_utils.py

Co-authored-by: Daniël de Kok

---------

Signed-off-by: Wang, Yi A
Co-authored-by: Daniël de Kok
---
 Dockerfile_intel                                     | 2 +-
 launcher/src/main.rs                                 | 9 ++++++---
 server/text_generation_server/utils/import_utils.py | 9 +++++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/Dockerfile_intel b/Dockerfile_intel
index 131f49ba..f09614d4 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
 
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index 14d03429..2e06c1ef 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -759,7 +759,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        }
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
@@ -832,9 +835,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index d79e36c2..c3929392 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess
 
 
 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
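
Note (editorial, not part of the patch): a minimal standalone sketch of the free-memory logic the diff introduces, assuming the output of "xpu-smi dump -d <id> -m 18 -n 1" is a CSV header line followed by one sample whose last column is used memory in MiB, which is what the patched code relies on. The helper name estimate_xpu_free_memory and its signature are illustrative only.

    import subprocess

    import torch


    def estimate_xpu_free_memory(device: torch.device) -> int:
        # Total device memory as reported by the XPU runtime.
        total_memory = torch.xpu.get_device_properties(device).total_memory
        # One sample of metric 18 (used memory, per the patch) for this card via xpu-smi.
        query = f"xpu-smi dump -d {device.index} -m 18 -n 1"
        output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
        # output[0] is the CSV header; the sample's last column is MiB used, convert to bytes.
        used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
        # Keep 5% of total memory as headroom, mirroring the 0.95 factor in the patch.
        return int(total_memory * 0.95 - used_memory)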