From 4c50b6d04bbf4db0d61ae6a04c9f44662b608c52 Mon Sep 17 00:00:00 2001
From: fxmarty <9808326+fxmarty@users.noreply.github.com>
Date: Mon, 8 Jul 2024 17:52:10 +0200
Subject: [PATCH] Fix nccl regression on PyTorch 2.3 upgrade (#2099)

* fix nccl issue

* add note in dockerfile

* use v2.22.3 that also fixes @samsamoa's repro

* poetry actually can't handle the conflict between torch and nccl

* set LD_PRELOAD
---
 Dockerfile      | 7 ++++++-
 server/Makefile | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index d4772b4a..3f2e8ef0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -40,7 +40,9 @@ RUN cargo build --profile release-opt
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
 FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
 
+# NOTE: When updating the PyTorch version, remember to remove the `pip install nvidia-nccl-cu12==2.22.3` step further down in this Dockerfile, and revisit the `ENV LD_PRELOAD` line that points at it. Context: https://github.com/huggingface/text-generation-inference/pull/2099
 ARG PYTORCH_VERSION=2.3.0
+
 ARG PYTHON_VERSION=3.10
 # Keep in sync with `server/pyproject.toml
 ARG CUDA_VERSION=12.1
@@ -241,7 +243,10 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
+    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
+    pip install nvidia-nccl-cu12==2.22.3 --no-cache-dir
+
+ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
 
 # Deps before the binaries
 # The binaries change on every build given we burn the SHA into them
diff --git a/server/Makefile b/server/Makefile
index 0099c56a..d701c819 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -35,5 +35,5 @@ run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
 
 export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
+	poetry export -o requirements_cuda.txt --without-hashes --with cuda
 	poetry export -o requirements_rocm.txt --without-hashes