From f213b9a3ae31888dac3aa4162ad20df162dc4c23 Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Wed, 6 Sep 2023 22:27:48 -0600
Subject: [PATCH] cuda nn

---
Review notes (text between the "---" line and the diff is ignored by git am):
- One diff line per line has been restored. Continuation-line indentation is
  reconstructed, so use `git apply --ignore-whitespace` if context lines do
  not match the target file exactly.
- In the PyTorch build step, BLAS_INFO and BUILD_TYPE are now two separate
  assignments. They had been fused into the single word
  "BLAS_INFO=mklBUILD_TYPE=Release", preceded by an empty line inside the RUN
  continuation. The two assignments occupy those two added lines, so hunk
  sizes and the diffstat are unchanged; the blob id on the "index" line still
  names the original post-image.

 other/non-avx tgi docker/Dockerfile | 130 ++++++++++++++++++++++++----
 1 file changed, 113 insertions(+), 17 deletions(-)

diff --git a/other/non-avx tgi docker/Dockerfile b/other/non-avx tgi docker/Dockerfile
index 4ed39b8..528e54b 100644
--- a/other/non-avx tgi docker/Dockerfile
+++ b/other/non-avx tgi docker/Dockerfile
@@ -55,7 +55,16 @@ ARG TARGETPLATFORM
 
 ENV PATH /opt/conda/bin:$PATH
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+RUN apt-get update
+
+# Add new repo to install gcc 11 on Ubuntu 20.04
+RUN echo "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main" >> /etc/apt/sources.list && \
+    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 60C317803A41BA51845E371A1E9377A2BA9EF27F && \
+    apt-get update
+
+RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        gcc-11 \
+        g++-11 \
         build-essential \
         ca-certificates \
         ccache \
@@ -70,6 +79,9 @@ RUN /usr/sbin/update-ccache-symlinks
 RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
 ENV PATH /opt/conda/bin:$PATH
 
+# Set gcc path to new gcc version
+RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
+
 # Install conda
 # translating Docker's TARGETPLATFORM into mamba arches
 RUN case ${TARGETPLATFORM} in \
@@ -88,30 +100,89 @@ RUN git clone --recursive https://github.com/pytorch/pytorch && \
 
 WORKDIR /pytorch
 
-# Write the Pytorch version into the version.txt file because it isn't always the same as the tag
+# Write the Pytorch version into the version.txt file because it isn't always the same as the tag we checked out
 RUN echo $PYTORCH_VERSION > version.txt
 
-RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake ninja conda-build pyyaml numpy ipython mkl mkl-include && \
-    /opt/conda/bin/conda install -c pytorch magma-cuda118 && \
+RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake ninja conda-build pyyaml numpy ipython mkl mkl-include cudnn && \
+    /opt/conda/bin/conda install -c pytorch magma-cuda118 && \
     /opt/conda/bin/python -mpip install -r requirements.txt && \
     /opt/conda/bin/conda clean -ya
 
+# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/reqs/train-conda-build.requirements.txt
+RUN /opt/conda/bin/conda install -y \
+    jemalloc \
+    astunparse \
+    ccache \
+    cmake \
+    expecttest \
+    filelock \
+    fsspec \
+    git \
+    hypothesis \
+    jinja2 \
+    libjpeg-turbo \
+    libpng \
+    networkx \
+    ninja \
+    numpy \
+    psutil \
+    pyyaml \
+    requests \
+    setuptools \
+    sympy \
+    types-dataclasses \
+    typing-extensions
+
+# Use Intel OpenMP with optimizations. See the documentation for details.
+# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
+# Intel OpenMP thread blocking time in ms.
+ENV KMP_BLOCKTIME=0
+# Configure CPU thread affinity.
+# ENV KMP_AFFINITY="granularity=fine,compact,1,0"
+ENV LD_PRELOAD=/opt/conda/lib/libiomp5.so:${LD_PRELOAD}
+
+# Use Jemalloc for efficient memory management.
+ENV LD_PRELOAD=/opt/conda/lib/libjemalloc.so:${LD_PRELOAD}
+ENV MALLOC_CONF="background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000"
+
 # Install PyTorch without AVX2
-RUN python setup.py clean && \
-    USE_CUDA=1 \
-    TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
+# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/docker-compose.yaml#L124
+# print(torch.__config__.show().split("\n"), sep="\n")
+RUN --mount=type=cache,target=/opt/ccache \
+    python setup.py clean && \
+    BLAS_INFO=mkl \
+    BUILD_TYPE=Release \
+    CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow" \
+    LAPACK_INFO=mkl \
+    PERF_WITH_AVX=1 \
+    PERF_WITH_AVX2=0 \
+    PERF_WITH_AVX512=0 \
+    TORCH_DISABLE_GPU_ASSERTS=ON \
+    TORCH_VERSION=${PYTORCH_VERSION} \
+    USE_CUDA=ON \
+    USE_CUDNN=ON \
+    USE_EXCEPTION_PTR=1 \
+    USE_GFLAGS=OFF \
+    USE_GLOG=OFF \
+    USE_MKL=ON \
+    USE_MKLDNN=ON \
+    USE_MPI=OFF \
+    USE_NCCL=1 \
+    USE_NNPACK=ON \
+    USE_OPENMP=ON \
+    USE_ROCM=OFF \
+    BUILD_TEST=0 \
+    TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
     CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
     python setup.py install && \
     cd .. && \
     rm -rf pytorch
 
-# BUILD_TEST=0 \
-
 # Make sure we built everything properly. Build will fail if CUDA isn't available.
 RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
 
-# RUN pip freeze | grep "torch"
+RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
 
 # ==============================================================================
 
@@ -165,7 +236,7 @@ WORKDIR /usr/src
 COPY server/custom_kernels/ .
 
 # Build specific version of transformers
-RUN BUILD_EXTENSIONS=True MAX_JOBS=5 python setup.py build
+RUN MAX_JOBS=5 python setup.py build
 
 # ==============================================================================
 
@@ -190,7 +261,8 @@ ENV PATH=/opt/conda/bin:$PATH \
 # Text Generation Inference base env
 ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
-    PORT=80
+    PORT=80 \
+    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.8/targets/x86_64-linux/lib:/opt/conda/lib/python3.9/site-packages/torch/lib
 
 WORKDIR /usr/src
 
@@ -220,8 +292,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
-RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
-
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
 
@@ -232,15 +302,41 @@ COPY server/Makefile server/Makefile
 
 RUN cd server && \
     make gen-server && \
-    sed -i '/torch/d' requirements.txt && \
     pip install -r requirements.txt
 
-RUN pip freeze | grep torch
-
 RUN cd server && \
     pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \
     pip install optimum auto-gptq
 
+# Fix the error
+# /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
+RUN cp /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so
+
+
+
+
+RUN ldd /opt/conda/lib/python3.9/site-packages/exllama_kernels.cpython-39-x86_64-linux-gnu.so
+
+RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
+
+RUN python3 -c "import torch; import text_generation_server.utils.gptq.exllama"
+
+# RUN ls /opt/conda/lib/python3.9/site-packages/bitsandbytes/
+# RUN find / -name libcudart.so 2>/dev/null
+# RUN find / -name "*libc10.so*"
+# RUN find / -name libtorch.so
+
+
+# Make sure our special dependencies were compiled and copied correctly
+RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
+RUN python -c "import torch; torch.cuda.is_available()"
+RUN python -c "import torch; import flash_attn_2_cuda"
+RUN python -c "import torch; import flash_attn_cuda"
+RUN python -c "import torch; import vllm_cache_ops"
+RUN python -c "import torch; import vllm_attention_ops"
+RUN python -c "import torch; import custom_kernels"
+RUN python -c "import torch; import text_generation_server.utils.gptq.exllama"
+
 # Install benchmarker
 COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
 # Install router