use xpu-smi to dump used memory (#2047)

* use xpu-smi to dump used memory
XPU uses "ZE_AFFINITY_MASK" to control which cards are visible; its usage is similar to CUDA_VISIBLE_DEVICES.

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* Update server/text_generation_server/utils/import_utils.py

Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

---------

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
This commit is contained in:
Wang, Yi 2024-06-25 16:15:46 +08:00 committed by GitHub
parent 5b2155b0f8
commit 83634dc122
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 14 additions and 6 deletions

View File

@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
# Text Generation Inference base env
ENV HUGGINGFACE_HUB_CACHE=/data \

View File

@ -759,7 +759,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
fn num_cuda_devices() -> Option<usize> {
let devices = match env::var("CUDA_VISIBLE_DEVICES") {
Ok(devices) => devices,
Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
Ok(devices) => devices,
Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
}
};
let n_devices = devices.split(',').count();
Some(n_devices)
@ -832,9 +835,9 @@ fn find_num_shards(
let num_shard = match (sharded, num_shard) {
(Some(true), None) => {
// try to default to the number of available GPUs
tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
let n_devices = num_cuda_devices()
.expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
.expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
if n_devices <= 1 {
return Err(LauncherError::NotEnoughCUDADevices(format!(
"`sharded` is true but only found {n_devices} CUDA devices"

View File

@ -1,5 +1,6 @@
import torch
from loguru import logger
import subprocess
def is_xpu_available():
@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
def get_xpu_free_memory(device, memory_fraction):
total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
free_memory = int(total_gpu_memory * 0.5)
total_memory = torch.xpu.get_device_properties(device).total_memory
device_id = device.index
query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
free_memory = int(total_memory * 0.95 - used_memory)
return free_memory