cuda nn
This commit is contained in:
parent
b2b6cdabaa
commit
f213b9a3ae
|
@ -55,7 +55,16 @@ ARG TARGETPLATFORM
|
||||||
|
|
||||||
ENV PATH /opt/conda/bin:$PATH
|
ENV PATH /opt/conda/bin:$PATH
|
||||||
|
|
||||||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
RUN apt-get update
|
||||||
|
|
||||||
|
# Add new repo to install gcc 11 on Ubuntu 20.04
|
||||||
|
RUN echo "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main" >> /etc/apt/sources.list && \
|
||||||
|
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 60C317803A41BA51845E371A1E9377A2BA9EF27F && \
|
||||||
|
apt-get update
|
||||||
|
|
||||||
|
RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
gcc-11 \
|
||||||
|
g++-11 \
|
||||||
build-essential \
|
build-essential \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
ccache \
|
ccache \
|
||||||
|
@ -70,6 +79,9 @@ RUN /usr/sbin/update-ccache-symlinks
|
||||||
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
|
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
|
||||||
ENV PATH /opt/conda/bin:$PATH
|
ENV PATH /opt/conda/bin:$PATH
|
||||||
|
|
||||||
|
# Set gcc path to new gcc version
|
||||||
|
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
|
||||||
|
|
||||||
# Install conda
|
# Install conda
|
||||||
# translating Docker's TARGETPLATFORM into mamba arches
|
# translating Docker's TARGETPLATFORM into mamba arches
|
||||||
RUN case ${TARGETPLATFORM} in \
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
@ -88,30 +100,89 @@ RUN git clone --recursive https://github.com/pytorch/pytorch && \
|
||||||
|
|
||||||
WORKDIR /pytorch
|
WORKDIR /pytorch
|
||||||
|
|
||||||
# Write the Pytorch version into the version.txt file because it isn't always the same as the tag
|
# Write the Pytorch version into the version.txt file because it isn't always the same as the tag we checked out
|
||||||
RUN echo $PYTORCH_VERSION > version.txt
|
RUN echo $PYTORCH_VERSION > version.txt
|
||||||
|
|
||||||
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake ninja conda-build pyyaml numpy ipython mkl mkl-include && \
|
RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake ninja conda-build pyyaml numpy ipython mkl mkl-include cudnn && \
|
||||||
/opt/conda/bin/conda install -c pytorch magma-cuda118 && \
|
/opt/conda/bin/conda install -c pytorch magma-cuda118 && \
|
||||||
/opt/conda/bin/python -mpip install -r requirements.txt && \
|
/opt/conda/bin/python -mpip install -r requirements.txt && \
|
||||||
/opt/conda/bin/conda clean -ya
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/reqs/train-conda-build.requirements.txt
|
||||||
|
RUN /opt/conda/bin/conda install -y \
|
||||||
|
jemalloc \
|
||||||
|
astunparse \
|
||||||
|
ccache \
|
||||||
|
cmake \
|
||||||
|
expecttest \
|
||||||
|
filelock \
|
||||||
|
fsspec \
|
||||||
|
git \
|
||||||
|
hypothesis \
|
||||||
|
jinja2 \
|
||||||
|
libjpeg-turbo \
|
||||||
|
libpng \
|
||||||
|
networkx \
|
||||||
|
ninja \
|
||||||
|
numpy \
|
||||||
|
psutil \
|
||||||
|
pyyaml \
|
||||||
|
requests \
|
||||||
|
setuptools \
|
||||||
|
sympy \
|
||||||
|
types-dataclasses \
|
||||||
|
typing-extensions
|
||||||
|
|
||||||
|
# Use Intel OpenMP with optimizations. See the documentation for details.
|
||||||
|
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
||||||
|
# Intel OpenMP thread blocking time in ms.
|
||||||
|
ENV KMP_BLOCKTIME=0
|
||||||
|
# Configure CPU thread affinity.
|
||||||
|
# ENV KMP_AFFINITY="granularity=fine,compact,1,0"
|
||||||
|
ENV LD_PRELOAD=/opt/conda/lib/libiomp5.so:${LD_PRELOAD}
|
||||||
|
|
||||||
|
# Use Jemalloc for efficient memory management.
|
||||||
|
ENV LD_PRELOAD=/opt/conda/lib/libjemalloc.so:${LD_PRELOAD}
|
||||||
|
ENV MALLOC_CONF="background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000"
|
||||||
|
|
||||||
# Install PyTorch without AVX2
|
# Install PyTorch without AVX2
|
||||||
RUN python setup.py clean && \
|
# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/docker-compose.yaml#L124
|
||||||
USE_CUDA=1 \
|
# print(torch.__config__.show().split("\n"), sep="\n")
|
||||||
TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
|
RUN --mount=type=cache,target=/opt/ccache \
|
||||||
|
python setup.py clean && \
|
||||||
|
|
||||||
|
BLAS_INFO=mklBUILD_TYPE=Release \
|
||||||
|
CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow" \
|
||||||
|
LAPACK_INFO=mkl \
|
||||||
|
PERF_WITH_AVX=1 \
|
||||||
|
PERF_WITH_AVX2=0 \
|
||||||
|
PERF_WITH_AVX512=0 \
|
||||||
|
TORCH_DISABLE_GPU_ASSERTS=ON \
|
||||||
|
TORCH_VERSION=${PYTORCH_VERSION} \
|
||||||
|
USE_CUDA=ON \
|
||||||
|
USE_CUDNN=ON \
|
||||||
|
USE_EXCEPTION_PTR=1 \
|
||||||
|
USE_GFLAGS=OFF \
|
||||||
|
USE_GLOG=OFF \
|
||||||
|
USE_MKL=ON \
|
||||||
|
USE_MKLDNN=ON \
|
||||||
|
USE_MPI=OFF \
|
||||||
|
USE_NCCL=1 \
|
||||||
|
USE_NNPACK=ON \
|
||||||
|
USE_OPENMP=ON \
|
||||||
|
USE_ROCM=OFF \
|
||||||
|
BUILD_TEST=0 \
|
||||||
|
TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
|
||||||
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
|
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
|
||||||
CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
|
CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
|
||||||
python setup.py install && \
|
python setup.py install && \
|
||||||
cd .. && \
|
cd .. && \
|
||||||
rm -rf pytorch
|
rm -rf pytorch
|
||||||
|
|
||||||
# BUILD_TEST=0 \
|
|
||||||
|
|
||||||
# Make sure we built everything properly. Build will fail if CUDA isn't available.
|
# Make sure we built everything properly. Build will fail if CUDA isn't available.
|
||||||
RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
|
RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
|
||||||
|
|
||||||
# RUN pip freeze | grep "torch"
|
RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
@ -165,7 +236,7 @@ WORKDIR /usr/src
|
||||||
COPY server/custom_kernels/ .
|
COPY server/custom_kernels/ .
|
||||||
|
|
||||||
# Build specific version of transformers
|
# Build specific version of transformers
|
||||||
RUN BUILD_EXTENSIONS=True MAX_JOBS=5 python setup.py build
|
RUN MAX_JOBS=5 python setup.py build
|
||||||
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
@ -190,7 +261,8 @@ ENV PATH=/opt/conda/bin:$PATH \
|
||||||
# Text Generation Inference base env
|
# Text Generation Inference base env
|
||||||
ENV HUGGINGFACE_HUB_CACHE=/data \
|
ENV HUGGINGFACE_HUB_CACHE=/data \
|
||||||
HF_HUB_ENABLE_HF_TRANSFER=1 \
|
HF_HUB_ENABLE_HF_TRANSFER=1 \
|
||||||
PORT=80
|
PORT=80 \
|
||||||
|
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.8/targets/x86_64-linux/lib:/opt/conda/lib/python3.9/site-packages/torch/lib
|
||||||
|
|
||||||
WORKDIR /usr/src
|
WORKDIR /usr/src
|
||||||
|
|
||||||
|
@ -220,8 +292,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /
|
||||||
# Copy builds artifacts from vllm builder
|
# Copy builds artifacts from vllm builder
|
||||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
|
||||||
|
|
||||||
RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
|
|
||||||
|
|
||||||
# Install flash-attention dependencies
|
# Install flash-attention dependencies
|
||||||
RUN pip install einops --no-cache-dir
|
RUN pip install einops --no-cache-dir
|
||||||
|
|
||||||
|
@ -232,15 +302,41 @@ COPY server/Makefile server/Makefile
|
||||||
|
|
||||||
RUN cd server && \
|
RUN cd server && \
|
||||||
make gen-server && \
|
make gen-server && \
|
||||||
sed -i '/torch/d' requirements.txt && \
|
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
RUN pip freeze | grep torch
|
|
||||||
|
|
||||||
RUN cd server && \
|
RUN cd server && \
|
||||||
pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \
|
pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \
|
||||||
pip install optimum auto-gptq
|
pip install optimum auto-gptq
|
||||||
|
|
||||||
|
# Fix the error
|
||||||
|
# /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
|
||||||
|
RUN cp /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
RUN ldd /opt/conda/lib/python3.9/site-packages/exllama_kernels.cpython-39-x86_64-linux-gnu.so
|
||||||
|
|
||||||
|
RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
|
||||||
|
|
||||||
|
RUN python3 -c "import torch; import text_generation_server.utils.gptq.exllama"
|
||||||
|
|
||||||
|
# RUN ls /opt/conda/lib/python3.9/site-packages/bitsandbytes/
|
||||||
|
# RUN find / -name libcudart.so 2>/dev/null
|
||||||
|
# RUN find / -name "*libc10.so*"
|
||||||
|
# RUN find / -name libtorch.so
|
||||||
|
|
||||||
|
|
||||||
|
# Make sure our special dependencies were compiled and copied correctly
|
||||||
|
RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
|
||||||
|
RUN python -c "import torch; torch.cuda.is_available()"
|
||||||
|
RUN python -c "import torch; import flash_attn_2_cuda"
|
||||||
|
RUN python -c "import torch; import flash_attn_cuda"
|
||||||
|
RUN python -c "import torch; import vllm_cache_ops"
|
||||||
|
RUN python -c "import torch; import vllm_attention_ops"
|
||||||
|
RUN python -c "import torch; import custom_kernels"
|
||||||
|
RUN python -c "import torch; import text_generation_server.utils.gptq.exllama"
|
||||||
|
|
||||||
# Install benchmarker
|
# Install benchmarker
|
||||||
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
|
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
|
||||||
# Install router
|
# Install router
|
||||||
|
|
Reference in New Issue