add numa to improve cpu inference perf (#2330)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent cd9b15d17f
commit 59922f9bc1
```diff
@@ -106,7 +106,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     g++ \
     git \
     wget \
-    cmake
+    cmake \
+    libnuma-dev
 
 ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
```
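The new `libnuma-dev` package supplies the native libnuma shared library; the Python `numa` bindings installed in the next hunk wrap it. As a quick build-time sanity check (a sketch, not part of this commit; it assumes only libnuma's standard `numa_available()` entry point):

```python
# Confirm libnuma is loadable inside the image before relying on the bindings.
import ctypes

libnuma = ctypes.CDLL("libnuma.so.1")  # raises OSError if the library is absent
# numa_available() returns -1 when the kernel exposes no NUMA support.
print("NUMA usable:", libnuma.numa_available() != -1)
```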
```diff
@@ -135,7 +136,7 @@ RUN conda install -c conda-forge gperftools mkl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
 RUN pip install https://download.pytorch.org/whl/nightly/cpu/torchaudio-2.4.0.dev20240612%2Bcpu-cp310-cp310-linux_x86_64.whl
-RUN pip install triton
+RUN pip install triton numa
 
 WORKDIR /usr/src
 
```
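`pip install numa` adds the Python bindings that `init_cpu_threads_env` (added below) probes with `importlib.util.find_spec`. A minimal topology check, using only the calls the commit itself relies on:

```python
# Inspect the topology through the same bindings the server will use.
import importlib.util

if importlib.util.find_spec("numa") is not None:
    import numa

    nodes = numa.get_max_node() + 1  # node ids are zero-based
    for node_id in range(nodes):
        print(f"node {node_id}: cpus {sorted(numa.node_to_cpus(node_id))}")
else:
    print("numa bindings missing; NUMA pinning will be skipped")
```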
```diff
@@ -147,16 +148,11 @@ RUN cd intel-extension-for-pytorch && git submodule sync && git submodule update
 
 RUN cd torch-ccl && git submodule sync && git submodule update --init --recursive && pip install .
 
-ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so:/opt/conda/lib/libiomp5.so
+ENV LD_PRELOAD=/opt/conda/lib/libtcmalloc.so
 ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
 ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
-ENV KMP_BLOCKTIME=1
-ENV KMP_TPAUSE=0
-ENV KMP_FORKJOIN_BARRIER_PATTERN=dist,dist
-ENV KMP_PLAIN_BARRIER_PATTERN=dist,dist
-ENV KMP_REDUCTION_BARRIER_PATTERN=dist,dist
 
 # Install server
 COPY proto proto
```
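The image now preloads only tcmalloc, and the global `KMP_*` barrier tuning is dropped, consistent with delegating thread placement to the per-rank NUMA binding added below rather than baking one OpenMP configuration into the image. If a deployment still wants the old tuning, it can be restored at launch without rebuilding; a sketch reusing exactly the values the diff removes (note that `KMP_*` variables must be set before the OpenMP runtime initializes):

```python
# Re-apply the removed OpenMP tuning at process start-up.
# setdefault keeps any values the operator already exported.
import os

REMOVED_KMP_TUNING = {
    "KMP_BLOCKTIME": "1",
    "KMP_TPAUSE": "0",
    "KMP_FORKJOIN_BARRIER_PATTERN": "dist,dist",
    "KMP_PLAIN_BARRIER_PATTERN": "dist,dist",
    "KMP_REDUCTION_BARRIER_PATTERN": "dist,dist",
}
for key, value in REMOVED_KMP_TUNING.items():
    os.environ.setdefault(key, value)
```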
```diff
@@ -74,6 +74,36 @@ def get_sliding_windows() -> int:
     return SLIDING_WINDOW
 
 
+def init_cpu_threads_env(rank_id: int, world_size: int):
+    import importlib.util
+
+    if importlib.util.find_spec("numa") is not None:
+        import numa
+        import psutil
+
+        nodes = numa.get_max_node() + 1
+        rank_per_node = math.ceil(world_size / nodes)
+        num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes)
+        node_id = int(rank_id / rank_per_node)
+        rank_offset_per_node = rank_id % rank_per_node
+        if os.getenv("OMP_NUM_THREADS") is None:
+            num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1)
+        else:
+            num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS"))
+        if len(numa.get_membind()) == nodes:
+            numa.set_membind([node_id])
+        torch.set_num_threads(num_cpus_per_rank)
+        if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True):
+            cpu_start = num_cpus_per_rank * rank_offset_per_node
+            numa.set_affinity(
+                0,
+                list(numa.node_to_cpus(node_id))[
+                    cpu_start : cpu_start + num_cpus_per_rank
+                ],
+            )
+        logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}")
+
+
 @dataclass
 class FlashCausalLMBatch(Batch):
     batch_id: int
```
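`init_cpu_threads_env` splits each NUMA node's physical cores evenly among the ranks placed on that node: memory is bound with `set_membind`, the thread count is sized to the slice (unless `OMP_NUM_THREADS` overrides it), and the process is pinned with `set_affinity` to a disjoint range of the node's CPU list. Both `set_membind` and `set_affinity` are guarded so that pinning already applied from outside (e.g. via numactl) is left untouched. A worked example of the arithmetic, on an assumed machine with 2 NUMA nodes and 64 physical cores serving 4 ranks:

```python
# Mirror of the partitioning arithmetic in init_cpu_threads_env,
# for an assumed topology: 2 NUMA nodes, 64 physical cores, 4 ranks.
import math

nodes, physical_cores, world_size = 2, 64, 4
rank_per_node = math.ceil(world_size / nodes)                    # 2 ranks per node
num_cpus_per_node = physical_cores // nodes                      # 32 cores per node
num_cpus_per_rank = max(num_cpus_per_node // rank_per_node, 1)   # 16 threads per rank

for rank_id in range(world_size):
    node_id = rank_id // rank_per_node    # node this rank binds memory and CPUs to
    offset = rank_id % rank_per_node      # rank's slot within its node
    cpu_start = num_cpus_per_rank * offset
    print(f"rank {rank_id}: node {node_id}, "
          f"node-local cpus [{cpu_start}:{cpu_start + num_cpus_per_rank}]")
```

Each rank therefore gets a disjoint 16-core slice of a single socket, with its memory allocations bound to the same node, avoiding cross-socket traffic during inference.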
```diff
@@ -854,6 +884,7 @@ class FlashCausalLM(Model):
             device = torch.device("cpu")
             # Float16 doesn't exist on target.
             dtype = torch.bfloat16 if dtype is None else dtype
+            init_cpu_threads_env(rank_id=rank, world_size=world_size)
         else:
             raise NotImplementedError(f"{model_class} is only available on GPU")
 
```
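The helper runs once per worker when a `FlashCausalLM` is constructed on CPU, before the model weights are loaded. For reference, a hypothetical standalone call (the import path is inferred from the class names in the hunk, not stated in this diff):

```python
# Hypothetical direct use: pin the current worker before loading a model.
from text_generation_server.models.flash_causal_lm import init_cpu_threads_env

init_cpu_threads_env(rank_id=0, world_size=2)
# An exported OMP_NUM_THREADS overrides the computed per-rank thread count.
```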