From 4c50b6d04bbf4db0d61ae6a04c9f44662b608c52 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 8 Jul 2024 17:52:10 +0200 Subject: [PATCH] Fix nccl regression on PyTorch 2.3 upgrade (#2099) * fix nccl issue * add note in dockerfile * use v2.22.3 that also fixes @samsamoa's repro * poetry actually can't handle the conflict between torch and nccl * set LD_PRELOAD --- Dockerfile | 7 ++++++- server/Makefile | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index d4772b4a..3f2e8ef0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,7 +40,9 @@ RUN cargo build --profile release-opt # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install +# NOTE: When updating the PyTorch version, remember to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099 ARG PYTORCH_VERSION=2.3.0 + ARG PYTHON_VERSION=3.10 # Keep in sync with `server/pyproject.toml ARG CUDA_VERSION=12.1 @@ -241,7 +243,10 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_cuda.txt && \ - pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir + pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \ + pip install nvidia-nccl-cu12==2.22.3 + +ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2 # Deps before the binaries # The binaries change on every build given we burn the SHA into them diff --git a/server/Makefile b/server/Makefile index 0099c56a..d701c819 100644 --- a/server/Makefile +++ b/server/Makefile @@ -35,5 +35,5 @@ run-dev: SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded export-requirements: - poetry export -o requirements_cuda.txt 
--without-hashes + poetry export -o requirements_cuda.txt --without-hashes --with cuda poetry export -o requirements_rocm.txt --without-hashes