Fix nccl regression on PyTorch 2.3 upgrade (#2099)
* fix nccl issue * add note in dockerfile * use v2.22.3 that also fixes @samsamoa's repro * poetry actually can't handle the conflict between torch and nccl * set LD_PRELOAD
This commit is contained in:
parent
87ebb6477b
commit
4c50b6d04b
|
@ -40,7 +40,9 @@ RUN cargo build --profile release-opt
|
|||
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
|
||||
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
|
||||
|
||||
# NOTE: When updating the PyTorch version, remember to remove the `pip install nvidia-nccl-cu12==2.22.3` step further down in this Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
|
||||
ARG PYTORCH_VERSION=2.3.0
|
||||
|
||||
ARG PYTHON_VERSION=3.10
|
||||
# Keep in sync with `server/pyproject.toml`
|
||||
ARG CUDA_VERSION=12.1
|
||||
|
@ -241,7 +243,10 @@ COPY server/Makefile server/Makefile
|
|||
RUN cd server && \
|
||||
make gen-server && \
|
||||
pip install -r requirements_cuda.txt && \
|
||||
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
|
||||
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \
|
||||
pip install nvidia-nccl-cu12==2.22.3
|
||||
|
||||
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
|
||||
|
||||
# Deps before the binaries
|
||||
# The binaries change on every build given we burn the SHA into them
|
||||
|
|
|
@ -35,5 +35,5 @@ run-dev:
|
|||
SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
|
||||
|
||||
export-requirements:
|
||||
poetry export -o requirements_cuda.txt --without-hashes
|
||||
poetry export -o requirements_cuda.txt --without-hashes --with cuda
|
||||
poetry export -o requirements_rocm.txt --without-hashes
|
||||
|
|
Loading…
Reference in New Issue