use xpu-smi to dump used memory (#2047)

* use xpu-smi to dump used memory

  XPU uses "ZE_AFFINITY_MASK" to select which cards are visible; its usage
  is analogous to CUDA_VISIBLE_DEVICES.

  Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>

* Update server/text_generation_server/utils/import_utils.py

  Co-authored-by: Daniël de Kok <me@github.danieldk.eu>

---------

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Daniël de Kok <me@github.danieldk.eu>
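To illustrate the fallback the launcher change below implements, here is a minimal Python sketch of counting visible devices across the three masks (CUDA_VISIBLE_DEVICES, then NVIDIA_VISIBLE_DEVICES, then ZE_AFFINITY_MASK). The function name and structure are illustrative only; the actual logic lives in the Rust launcher.

import os

def num_visible_devices() -> int | None:
    """Count devices from the first visibility mask that is set.

    Mirrors the launcher's num_cuda_devices(): XPU's ZE_AFFINITY_MASK is
    checked last, and all three masks share the comma-separated format,
    e.g. ZE_AFFINITY_MASK="0,1" exposes two cards.
    """
    for var in ("CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "ZE_AFFINITY_MASK"):
        devices = os.environ.get(var)
        if devices is not None:
            return len(devices.split(","))
    return None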
This commit is contained in:
parent 5b2155b0f8
commit 83634dc122
@@ -49,7 +49,7 @@ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dea
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
 | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
-RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
+RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build pciutils
 
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
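A note on the pciutils addition: xpu-smi enumerates GPUs over the PCI bus, so lspci from pciutils is plausibly a runtime dependency of the xpu-smi call introduced below. That motivation is an assumption; the diff itself does not record it.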
@@ -759,7 +759,10 @@ fn shutdown_shards(shutdown: Arc<AtomicBool>, shutdown_receiver: &mpsc::Receiver
 fn num_cuda_devices() -> Option<usize> {
     let devices = match env::var("CUDA_VISIBLE_DEVICES") {
         Ok(devices) => devices,
-        Err(_) => env::var("NVIDIA_VISIBLE_DEVICES").ok()?,
+        Err(_) => match env::var("NVIDIA_VISIBLE_DEVICES") {
+            Ok(devices) => devices,
+            Err(_) => env::var("ZE_AFFINITY_MASK").ok()?,
+        }
     };
     let n_devices = devices.split(',').count();
     Some(n_devices)
@@ -832,9 +835,9 @@ fn find_num_shards(
     let num_shard = match (sharded, num_shard) {
         (Some(true), None) => {
             // try to default to the number of available GPUs
-            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES");
+            tracing::info!("Parsing num_shard from CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK");
             let n_devices = num_cuda_devices()
-                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES are not set");
+                .expect("--num-shard and CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES/ZE_AFFINITY_MASK are not set");
             if n_devices <= 1 {
                 return Err(LauncherError::NotEnoughCUDADevices(format!(
                     "`sharded` is true but only found {n_devices} CUDA devices"
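Worth noting: the log and expect messages gain ZE_AFFINITY_MASK, but the NotEnoughCUDADevices error above still says "CUDA devices" even when the count came from an XPU mask; the commit widens the parsing, not the error naming.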
@@ -1,5 +1,6 @@
 import torch
 from loguru import logger
+import subprocess
 
 
 def is_xpu_available():
@@ -19,8 +20,12 @@ def get_cuda_free_memory(device, memory_fraction):
 
 
 def get_xpu_free_memory(device, memory_fraction):
-    total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
-    free_memory = int(total_gpu_memory * 0.5)
+    total_memory = torch.xpu.get_device_properties(device).total_memory
+    device_id = device.index
+    query = f"xpu-smi dump -d {device_id} -m 18 -n 1"
+    output = subprocess.check_output(query.split()).decode("utf-8").split("\n")
+    used_memory = float(output[1].split(",")[-1]) * 1024 * 1024
+    free_memory = int(total_memory * 0.95 - used_memory)
     return free_memory
 
 
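For context on the parsing above: xpu-smi dump -m 18 samples a per-device used-memory metric and -n 1 takes a single sample. The code assumes CSV output with a header row whose last column is used memory in MiB, hence the * 1024 * 1024 conversion to bytes. Below is a minimal standalone sketch of the same query; the example output shown in the docstring is illustrative, not captured from a real run.

import subprocess

def xpu_used_memory_bytes(device_id: int) -> float:
    """Query used memory for one XPU via xpu-smi (metric 18, one sample).

    Assumed output shape (CSV, header then one sample row), e.g.:
        Timestamp, DeviceId, GPU Memory Used (MiB)
        06:00:00.000, 0, 4096.00
    """
    cmd = ["xpu-smi", "dump", "-d", str(device_id), "-m", "18", "-n", "1"]
    lines = subprocess.check_output(cmd).decode("utf-8").split("\n")
    used_mib = float(lines[1].split(",")[-1])  # last column of the sample row
    return used_mib * 1024 * 1024  # MiB -> bytes

The 0.95 factor replaces the previous flat 50% guess: instead of assuming half of total memory is free, the new code reserves 5% of total memory as headroom and subtracts what xpu-smi reports as used.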