diff --git a/llm_server/routes/openai/chat_completions.py b/llm_server/routes/openai/chat_completions.py index ee7b615..9d6a0af 100644 --- a/llm_server/routes/openai/chat_completions.py +++ b/llm_server/routes/openai/chat_completions.py @@ -1,8 +1,11 @@ +import traceback + from flask import jsonify, request from . import openai_bp +from ..helpers.client import format_sillytavern_err from ..helpers.http import validate_json -from ..openai_request_handler import OpenAIRequestHandler +from ..openai_request_handler import OpenAIRequestHandler, build_openai_response @openai_bp.route('/chat/completions', methods=['POST']) @@ -16,4 +19,6 @@ def openai_chat_completions(): return OpenAIRequestHandler(request).handle_request() except Exception as e: print(f'EXCEPTION on {request.url}!!!', f'{e.__class__.__name__}: {e}') + print(traceback.format_exc()) print(request.data) + return build_openai_response('', format_sillytavern_err('Server encountered exception', 'error')), 200 diff --git a/llm_server/routes/v1/generate.py b/llm_server/routes/v1/generate.py index 5294259..eff516a 100644 --- a/llm_server/routes/v1/generate.py +++ b/llm_server/routes/v1/generate.py @@ -1,6 +1,9 @@ +import traceback + from flask import jsonify, request from . 
import bp +from ..helpers.client import format_sillytavern_err from ..helpers.http import validate_json from ..ooba_request_handler import OobaRequestHandler @@ -15,4 +18,6 @@ def generate(): return OobaRequestHandler(request).handle_request() except Exception as e: print(f'EXCEPTION on {request.url}!!!', f'{e.__class__.__name__}: {e}') + print(traceback.format_exc()) print(request.data) + return format_sillytavern_err('Server encountered exception', 'error'), 200 diff --git a/other/non-avx tgi docker/Dockerfile b/other/non-avx tgi docker/Dockerfile deleted file mode 100644 index 56ef16f..0000000 --- a/other/non-avx tgi docker/Dockerfile +++ /dev/null @@ -1,375 +0,0 @@ -# syntax = docker/dockerfile:experimental - -# DOCKER_BUILDKIT=1 DOCKER_CLI_EXPERIMENTAL=enabled - -# Rust builder -FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef -WORKDIR /usr/src - -ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse - -FROM chef as planner -COPY Cargo.toml Cargo.toml -COPY rust-toolchain.toml rust-toolchain.toml -COPY proto proto -COPY benchmark benchmark -COPY router router -COPY launcher launcher -RUN cargo chef prepare --recipe-path recipe.json - -FROM chef AS builder - -ARG GIT_SHA -ARG DOCKER_LABEL - -RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \ - curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \ - unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \ - unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \ - rm -f $PROTOC_ZIP - -COPY --from=planner /usr/src/recipe.json recipe.json -RUN cargo chef cook --release --recipe-path recipe.json - -COPY Cargo.toml Cargo.toml -COPY rust-toolchain.toml rust-toolchain.toml -COPY proto proto -COPY benchmark benchmark -COPY router router -COPY launcher launcher -RUN cargo build --release - -# ============================================================================== -# Build PyTorch - -# Python builder -# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile 
-FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 as dev-base -# FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 as pytorch-build -# ubuntu:20.04 - -ARG PYTORCH_VERSION=2.0.1 -ARG PYTHON_VERSION=3.9 -# Keep in sync with `server/pyproject.toml -ARG CUDA_VERSION=11.8 - -# Automatically set by buildx -ARG TARGETPLATFORM - -ENV PATH /opt/conda/bin:$PATH - -RUN apt-get update - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gnupg2 - -# Add new repo to install gcc 11 on Ubuntu 20.04 -RUN echo "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main" >> /etc/apt/sources.list && \ - apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 60C317803A41BA51845E371A1E9377A2BA9EF27F && \ - apt-get update - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - gcc-11 \ - g++-11 \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - ninja-build \ - libjpeg-dev \ - libpng-dev && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -# Set gcc path to new gcc version -RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11 - -# Install conda -FROM dev-base as pytorch-build -# Automatically set by buildx -ARG TARGETPLATFORM -# translating Docker's TARGETPLATFORM into miniconda arches -RUN case ${TARGETPLATFORM} in \ - "linux/arm64") MINICONDA_ARCH=aarch64 ;; \ - *) MINICONDA_ARCH=x86_64 ;; \ - esac && \ - curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh" - -# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431 -RUN chmod +x ~/miniconda.sh && \ - bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh - -RUN git clone --recursive https://github.com/pytorch/pytorch && \ - cd pytorch 
&& \ - git checkout v${PYTORCH_VERSION} - -WORKDIR /pytorch - -RUN git submodule update --init --recursive - -# Write the Pytorch version into the version.txt file because it isn't always the same as the tag we checked out -RUN echo $PYTORCH_VERSION > version.txt - -# Install things for building PyTorch -RUN /opt/conda/bin/conda install -y mkl mkl-include cudnn libgcc && \ - /opt/conda/bin/conda install -c conda-forge libstdcxx-ng=12 && \ - /opt/conda/bin/conda install -c pytorch magma-cuda118 && \ - /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8 - -RUN find / -name libcudart_static* - -RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \ - /opt/conda/bin/python -mpip install -r requirements.txt - -# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/reqs/train-conda-build.requirements.txt -# TODO: remove what we don't need -RUN /opt/conda/bin/conda install -y \ - jemalloc \ - ccache \ - cmake \ - expecttest \ - filelock \ - fsspec \ - git \ - hypothesis \ - jinja2 \ - libpng \ - networkx \ - ninja \ - numpy \ - psutil \ - pyyaml \ - requests \ - setuptools \ - sympy \ - typing-extensions - -RUN /opt/conda/bin/conda clean -ya - -# Use Intel OpenMP with optimizations. See the documentation for details. -# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html -# Intel OpenMP thread blocking time in ms. -ENV KMP_BLOCKTIME=0 -# Configure CPU thread affinity. -# ENV KMP_AFFINITY="granularity=fine,compact,1,0" -ENV LD_PRELOAD=/opt/conda/lib/libiomp5.so:${LD_PRELOAD} - -# Use Jemalloc for efficient memory management. 
-ENV LD_PRELOAD=/opt/conda/lib/libjemalloc.so:${LD_PRELOAD} -ENV MALLOC_CONF="background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000" - -RUN cat version.txt - -RUN make triton - -# Install PyTorch without AVX2 -# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/docker-compose.yaml#L124 -# print(torch.__config__.show().split("\n"), sep="\n") -RUN --mount=type=cache,target=/opt/ccache \ - python setup.py clean && \ - BLAS_INFO=mkl \ - BUILD_TYPE=Release \ - CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow" \ - LAPACK_INFO=mkl \ - PERF_WITH_AVX=1 \ - PERF_WITH_AVX2=0 \ - PERF_WITH_AVX512=0 \ - TORCH_DISABLE_GPU_ASSERTS=ON \ - TORCH_VERSION=${PYTORCH_VERSION} \ - USE_CUDA=ON \ - USE_CUDNN=ON \ - USE_EXCEPTION_PTR=1 \ - USE_GFLAGS=OFF \ - USE_GLOG=OFF \ - USE_MKL=ON \ - USE_MKLDNN=ON \ - USE_MPI=OFF \ - USE_NCCL=1 \ - USE_NNPACK=ON \ - USE_OPENMP=ON \ - USE_ROCM=OFF \ - BUILD_TEST=0 \ - CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \ - TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" 
TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ - CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ - python setup.py install && \ - cd .. && \ - rm -rf pytorch - - -# Make sure we built everything properly. Build will fail if CUDA isn't available. -# RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)" - -RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so - -# ============================================================================== -# Build Flash Attention CUDA kernels - -FROM pytorch-build as flash-att-builder -WORKDIR /usr/src -COPY server/Makefile-flash-att Makefile - -# Build specific version of flash attention -RUN MAX_JOBS=5 make build-flash-attention - -# ============================================================================== -# Build Flash Attention v2 CUDA kernels - -FROM pytorch-build as flash-att-v2-builder -WORKDIR /usr/src -COPY server/Makefile-flash-att-v2 Makefile - -# Build specific version of flash attention v2 -RUN MAX_JOBS=10 make build-flash-attention-v2 - -# ============================================================================== -# Build Transformers exllama kernels - -FROM pytorch-build as exllama-kernels-builder -WORKDIR /usr/src -COPY server/exllama_kernels/ . - -# Build specific version of transformers -RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build - -# ============================================================================== -# Build Transformers CUDA kernels - -FROM pytorch-build as custom-kernels-builder -WORKDIR /usr/src -COPY server/custom_kernels/ . 
- -# Build specific version of transformers -RUN MAX_JOBS=5 python setup.py build - -# ============================================================================== -# Build vllm CUDA kernels - -FROM pytorch-build as vllm-builder -WORKDIR /usr/src -COPY server/Makefile-vllm Makefile - -# Build specific version of vllm -RUN MAX_JOBS=5 make build-vllm - -# ============================================================================== - -# Text Generation Inference base image -# nvidia/cuda:11.8.0-base-ubuntu20.04 -FROM nvidia/cuda:11.8.0-base-ubuntu20.04 as base - -# Conda env -ENV PATH=/opt/conda/bin:$PATH \ - CONDA_PREFIX=/opt/conda - -# Text Generation Inference base env -ENV HUGGINGFACE_HUB_CACHE=/data \ - HF_HUB_ENABLE_HF_TRANSFER=1 \ - PORT=80 \ - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.8/targets/x86_64-linux/lib:/opt/conda/lib/python3.9/site-packages/torch/lib - -WORKDIR /usr/src - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - libssl-dev \ - ca-certificates \ - make \ - && rm -rf /var/lib/apt/lists/* - -# Copy conda with PyTorch installed -COPY --from=pytorch-build /opt/conda /opt/conda - -# Copy build artifacts from flash attention builder -COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages -COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages - -# Copy build artifacts from flash attention v2 builder -COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages - -# Copy build artifacts from custom kernels builder -COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 
/opt/conda/lib/python3.9/site-packages - -# Copy build artifacts from exllama kernels builder -COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages - -# Copy builds artifacts from vllm builder -COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages - -# Install flash-attention dependencies -RUN pip install einops --no-cache-dir - -# Install server -COPY proto proto -COPY server server -COPY server/Makefile server/Makefile - -RUN cd server && \ - make gen-server && \ - pip install -r requirements.txt - -RUN cd server && \ - pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \ - pip install optimum auto-gptq - -RUN /opt/conda/bin/conda clean -ya - -# Fix the error -# /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32 -RUN cp /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so - - - - -RUN ldd /opt/conda/lib/python3.9/site-packages/exllama_kernels.cpython-39-x86_64-linux-gnu.so - -RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so - -RUN python3 -c "import torch; import text_generation_server.utils.gptq.exllama" - -# RUN ls /opt/conda/lib/python3.9/site-packages/bitsandbytes/ -# RUN find / -name libcudart.so 2>/dev/null -# RUN find / -name "*libc10.so*" -# RUN find / -name libtorch.so - - -# Make sure our special dependencies were compiled and copied correctly -# RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)" -# RUN python -c "import torch; torch.cuda.is_available()" -RUN python -c "import torch; import flash_attn_2_cuda" -RUN python -c "import torch; import flash_attn_cuda" -RUN python -c "import torch; import vllm_cache_ops" -RUN python -c "import torch; import vllm_attention_ops" -RUN python -c "import torch; import 
custom_kernels" -RUN python -c "import torch; import text_generation_server.utils.gptq.exllama" - -# Install benchmarker -COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark -# Install router -COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router -# Install launcher -COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher - -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - build-essential \ - g++ \ - && rm -rf /var/lib/apt/lists/* - -# ============================================================================== - -# AWS Sagemaker compatbile image -FROM base as sagemaker -COPY sagemaker-entrypoint.sh entrypoint.sh -RUN chmod +x entrypoint.sh -ENTRYPOINT ["./entrypoint.sh"] - -# ============================================================================== - -# Final image -FROM base -ENTRYPOINT ["text-generation-launcher"] -CMD ["--json-output"]