From 59922f9bc16afee9efcc7ee1c5f9d753ef314ffa Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Tue, 13 Aug 2024 21:33:55 +0800 Subject: [PATCH] add numa to improve cpu inference perf (#2330) Signed-off-by: Wang, Yi A --- Dockerfile_intel | 12 +++---- .../models/flash_causal_lm.py | 31 +++++++++++++++++++ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/Dockerfile_intel b/Dockerfile_intel index 158c5a89..12480c70 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -106,7 +106,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins g++ \ git \ wget \ - cmake + cmake \ + libnuma-dev ENV HUGGINGFACE_HUB_CACHE=/data \ HF_HUB_ENABLE_HF_TRANSFER=1 \ @@ -135,7 +136,7 @@ RUN conda install -c conda-forge gperftools mkl RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl -RUN pip install triton +RUN pip install triton numa WORKDIR /usr/src @@ -147,16 +148,11 @@ RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install . -ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so:/opt/conda/lib/libiomp5.so +ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib -ENV KMP_BLOCKTIME=1 -ENV KMP_TPAUSE=0 -ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist -ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist -ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist # Install server COPY proto proto diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 21b66a68..42d93a12 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -74,6 +74,36 @@ def get_sliding_windows() -> int: return SLIDING_WINDOW +def init_cpu_threads_env(rank_id: int, world_size: int): + import importlib.util + + if importlib.util.find_spec("numa") is not None: + import numa + import psutil + + nodes = numa.get_max_node() + 1 + rank_per_node = math.ceil(world_size / nodes) + num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) + node_id = int(rank_id / rank_per_node) + rank_offset_per_node = rank_id % rank_per_node + if os.getenv("OMP_NUM_THREADS") is None: + num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) + else: + num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) + if len(numa.get_membind()) == nodes: + numa.set_membind([node_id]) + torch.set_num_threads(num_cpus_per_rank) + if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True): + cpu_start = num_cpus_per_rank * rank_offset_per_node + numa.set_affinity( + 0, + list(numa.node_to_cpus(node_id))[ + cpu_start : cpu_start + num_cpus_per_rank + ], + ) + logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}") + + @dataclass class FlashCausalLMBatch(Batch): batch_id: int @@ -854,6 +884,7 @@ class FlashCausalLM(Model): device = torch.device("cpu") # Float16 doesn't exist on target. dtype = torch.bfloat16 if dtype is None else dtype + init_cpu_threads_env(rank_id=rank, world_size=world_size) else: raise NotImplementedError(f"{model_class} is only available on GPU")