adjust
This commit is contained in:
parent
507327db49
commit
8f4f17166e
@@ -1,8 +1,11 @@
+import traceback
+
 from flask import jsonify, request
 
 from . import openai_bp
 from ..helpers.client import format_sillytavern_err
+from ..helpers.http import validate_json
-from ..openai_request_handler import OpenAIRequestHandler
+from ..openai_request_handler import OpenAIRequestHandler, build_openai_response
 
 
 @openai_bp.route('/chat/completions', methods=['POST'])
@@ -16,4 +19,6 @@ def openai_chat_completions():
         return OpenAIRequestHandler(request).handle_request()
     except Exception as e:
         print(f'EXCEPTION on {request.url}!!!', f'{e.__class__.__name__}: {e}')
+        print(traceback.format_exc())
+        print(request.data)
         return build_openai_response('', format_sillytavern_err('Server encountered exception', 'error')), 200
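For context, the except branch above answers HTTP 200 with a well-formed completion carrying the error text, so the client can render it in-chat instead of failing outright. A minimal sketch of that path; both helper bodies below are assumptions for illustration, not the repo's real implementations:

# Minimal sketch of the error path above; format_sillytavern_err and
# build_openai_response exist in this repo, but their bodies here are
# assumptions for illustration only.
import json


def format_sillytavern_err(msg: str, level: str) -> str:
    # Assumption: tags the message so SillyTavern displays it as an error.
    return f'[{level.upper()}]: {msg}'


def build_openai_response(prompt: str, content: str) -> dict:
    # Assumption: wraps text in an OpenAI-style chat.completion payload.
    return {
        'object': 'chat.completion',
        'choices': [{
            'index': 0,
            'message': {'role': 'assistant', 'content': content},
            'finish_reason': 'stop',
        }],
    }


body = build_openai_response('', format_sillytavern_err('Server encountered exception', 'error'))
print(json.dumps(body, indent=2))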
@@ -1,6 +1,9 @@
+import traceback
+
 from flask import jsonify, request
 
 from . import bp
 from ..helpers.client import format_sillytavern_err
+from ..helpers.http import validate_json
 from ..ooba_request_handler import OobaRequestHandler
 
@@ -15,4 +18,6 @@ def generate():
         return OobaRequestHandler(request).handle_request()
     except Exception as e:
         print(f'EXCEPTION on {request.url}!!!', f'{e.__class__.__name__}: {e}')
+        print(traceback.format_exc())
+        print(request.data)
         return format_sillytavern_err('Server encountered exception', 'error'), 200
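Both handlers emit their diagnostics with bare print. A hedged alternative sketch routing the same information through the standard logging module (a swapped-in technique, not what this commit does):

# Sketch: the same diagnostics routed through logging instead of print.
import logging
import traceback

logger = logging.getLogger(__name__)


def log_request_exception(request, exc: Exception) -> None:
    # Record the failing URL, exception type/message, traceback, and raw body.
    logger.error('EXCEPTION on %s: %s: %s', request.url, exc.__class__.__name__, exc)
    logger.error(traceback.format_exc())
    logger.error('request body: %r', request.data)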
@@ -1,375 +0,0 @@
-# syntax = docker/dockerfile:experimental
-
-# DOCKER_BUILDKIT=1 DOCKER_CLI_EXPERIMENTAL=enabled
-
-# Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.71 AS chef
-WORKDIR /usr/src
-
-ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
-
-FROM chef AS planner
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY proto proto
-COPY benchmark benchmark
-COPY router router
-COPY launcher launcher
-RUN cargo chef prepare --recipe-path recipe.json
-
-FROM chef AS builder
-
-ARG GIT_SHA
-ARG DOCKER_LABEL
-
-RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
-    curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
-    unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
-    unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
-    rm -f $PROTOC_ZIP
-
-COPY --from=planner /usr/src/recipe.json recipe.json
-RUN cargo chef cook --release --recipe-path recipe.json
-
-COPY Cargo.toml Cargo.toml
-COPY rust-toolchain.toml rust-toolchain.toml
-COPY proto proto
-COPY benchmark benchmark
-COPY router router
-COPY launcher launcher
-RUN cargo build --release
-
-# ==============================================================================
-# Build PyTorch
-
-# Python builder
-# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 AS dev-base
-# FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 as pytorch-build
-# ubuntu:20.04
-
-ARG PYTORCH_VERSION=2.0.1
-ARG PYTHON_VERSION=3.9
-# Keep in sync with `server/pyproject.toml`
-ARG CUDA_VERSION=11.8
-
-# Automatically set by buildx
-ARG TARGETPLATFORM
-
-ENV PATH /opt/conda/bin:$PATH
-
-RUN apt-get update
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends gnupg2
-
-# Add new repo to install gcc 11 on Ubuntu 20.04
-RUN echo "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main" >> /etc/apt/sources.list && \
-    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 60C317803A41BA51845E371A1E9377A2BA9EF27F && \
-    apt-get update
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    gcc-11 \
-    g++-11 \
-    build-essential \
-    ca-certificates \
-    ccache \
-    cmake \
-    curl \
-    git \
-    ninja-build \
-    libjpeg-dev \
-    libpng-dev && \
-    rm -rf /var/lib/apt/lists/*
-RUN /usr/sbin/update-ccache-symlinks
-RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
-ENV PATH /opt/conda/bin:$PATH
-
-# Set gcc path to new gcc version
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
-
-# Install conda
-FROM dev-base AS pytorch-build
-# Automatically set by buildx
-ARG TARGETPLATFORM
-# translating Docker's TARGETPLATFORM into miniconda arches
-RUN case ${TARGETPLATFORM} in \
-        "linux/arm64") MINICONDA_ARCH=aarch64 ;; \
-        *) MINICONDA_ARCH=x86_64 ;; \
-    esac && \
-    curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
-
-# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431
-RUN chmod +x ~/miniconda.sh && \
-    bash ~/miniconda.sh -b -p /opt/conda && \
-    rm ~/miniconda.sh
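The case statement above maps Docker's TARGETPLATFORM onto Miniconda's architecture names. The same mapping as a purely illustrative Python sketch, to make the fallback behavior explicit:

# Sketch of the shell case statement: anything that isn't linux/arm64
# falls through to the x86_64 installer.
def miniconda_arch(target_platform: str) -> str:
    return {'linux/arm64': 'aarch64'}.get(target_platform, 'x86_64')

assert miniconda_arch('linux/arm64') == 'aarch64'
assert miniconda_arch('linux/amd64') == 'x86_64'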
-
-RUN git clone --recursive https://github.com/pytorch/pytorch && \
-    cd pytorch && \
-    git checkout v${PYTORCH_VERSION}
-
-WORKDIR /pytorch
-
-RUN git submodule update --init --recursive
-
-# Write the PyTorch version into the version.txt file because it isn't always the same as the tag we checked out
-RUN echo $PYTORCH_VERSION > version.txt
-
-# Install things for building PyTorch
-RUN /opt/conda/bin/conda install -y mkl mkl-include cudnn libgcc && \
-    /opt/conda/bin/conda install -c conda-forge libstdcxx-ng=12 && \
-    /opt/conda/bin/conda install -c pytorch magma-cuda118 && \
-    /opt/conda/bin/conda install -c "nvidia/label/cuda-11.8.0" cuda==11.8
-
-RUN find / -name libcudart_static*
-
-RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
-    /opt/conda/bin/python -mpip install -r requirements.txt
-
-# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/reqs/train-conda-build.requirements.txt
-# TODO: remove what we don't need
-RUN /opt/conda/bin/conda install -y \
-    jemalloc \
-    ccache \
-    cmake \
-    expecttest \
-    filelock \
-    fsspec \
-    git \
-    hypothesis \
-    jinja2 \
-    libpng \
-    networkx \
-    ninja \
-    numpy \
-    psutil \
-    pyyaml \
-    requests \
-    setuptools \
-    sympy \
-    typing-extensions
-
-RUN /opt/conda/bin/conda clean -ya
-
-# Use Intel OpenMP with optimizations. See the documentation for details.
-# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
-# Intel OpenMP thread blocking time in ms.
-ENV KMP_BLOCKTIME=0
-# Configure CPU thread affinity.
-# ENV KMP_AFFINITY="granularity=fine,compact,1,0"
-ENV LD_PRELOAD=/opt/conda/lib/libiomp5.so:${LD_PRELOAD}
-
-# Use Jemalloc for efficient memory management.
-ENV LD_PRELOAD=/opt/conda/lib/libjemalloc.so:${LD_PRELOAD}
-ENV MALLOC_CONF="background_thread:true,metadata_thp:auto,dirty_decay_ms:30000,muzzy_decay_ms:30000"
-
-RUN cat version.txt
-
-RUN make triton
-
-# Install PyTorch without AVX2
-# https://github.com/cresset-template/cresset/blob/37c7b5df7236d3b9d96c4908efe5af8bc90066e3/docker-compose.yaml#L124
-# print(torch.__config__.show().split("\n"), sep="\n")
-RUN --mount=type=cache,target=/opt/ccache \
-    python setup.py clean && \
-    BLAS_INFO=mkl \
-    BUILD_TYPE=Release \
-    CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wunused-local-typedefs -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Werror=cast-function-type -Wno-stringop-overflow" \
-    LAPACK_INFO=mkl \
-    PERF_WITH_AVX=1 \
-    PERF_WITH_AVX2=0 \
-    PERF_WITH_AVX512=0 \
-    TORCH_DISABLE_GPU_ASSERTS=ON \
-    TORCH_VERSION=${PYTORCH_VERSION} \
-    USE_CUDA=ON \
-    USE_CUDNN=ON \
-    USE_EXCEPTION_PTR=1 \
-    USE_GFLAGS=OFF \
-    USE_GLOG=OFF \
-    USE_MKL=ON \
-    USE_MKLDNN=ON \
-    USE_MPI=OFF \
-    USE_NCCL=1 \
-    USE_NNPACK=ON \
-    USE_OPENMP=ON \
-    USE_ROCM=OFF \
-    BUILD_TEST=0 \
-    CMAKE_ARGS='-DDISABLE_AVX2:BOOL=TRUE -DCXX_AVX2_FOUND:BOOL=FALSE -DC_AVX2_FOUND:BOOL=FALSE -DDISABLE_AVX512F:BOOL=TRUE' \
-    TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
-    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
-    python setup.py install && \
-    cd .. && \
-    rm -rf pytorch
-
-# Make sure we built everything properly. Build will fail if CUDA isn't available.
-# RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
-
-RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
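The commented-out check above suggests how the build was meant to be verified. A sketch expanding it, assuming it runs inside the pytorch-build stage:

# Sketch of a post-build sanity check: CUDA support must be compiled in,
# and torch.__config__.show() exposes the build flags (BLAS, CXX flags,
# CPU capability) so the disabled-AVX2 build can be inspected.
import torch

assert torch.version.cuda is not None, 'torch was built without CUDA support'
print(torch.version.cuda)
print(torch.__config__.show())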
-
-# ==============================================================================
-# Build Flash Attention CUDA kernels
-
-FROM pytorch-build AS flash-att-builder
-WORKDIR /usr/src
-COPY server/Makefile-flash-att Makefile
-
-# Build specific version of flash attention
-RUN MAX_JOBS=5 make build-flash-attention
-
-# ==============================================================================
-# Build Flash Attention v2 CUDA kernels
-
-FROM pytorch-build AS flash-att-v2-builder
-WORKDIR /usr/src
-COPY server/Makefile-flash-att-v2 Makefile
-
-# Build specific version of flash attention v2
-RUN MAX_JOBS=10 make build-flash-attention-v2
-
-# ==============================================================================
-# Build Transformers exllama kernels
-
-FROM pytorch-build AS exllama-kernels-builder
-WORKDIR /usr/src
-COPY server/exllama_kernels/ .
-
-# Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
-
-# ==============================================================================
-# Build Transformers CUDA kernels
-
-FROM pytorch-build AS custom-kernels-builder
-WORKDIR /usr/src
-COPY server/custom_kernels/ .
-
-# Build specific version of transformers
-RUN MAX_JOBS=5 python setup.py build
-
-# ==============================================================================
-# Build vllm CUDA kernels
-
-FROM pytorch-build AS vllm-builder
-WORKDIR /usr/src
-COPY server/Makefile-vllm Makefile
-
-# Build specific version of vllm
-RUN MAX_JOBS=5 make build-vllm
-
-# ==============================================================================
-
-# Text Generation Inference base image
-# nvidia/cuda:11.8.0-base-ubuntu20.04
-FROM nvidia/cuda:11.8.0-base-ubuntu20.04 AS base
-
-# Conda env
-ENV PATH=/opt/conda/bin:$PATH \
-    CONDA_PREFIX=/opt/conda
-
-# Text Generation Inference base env
-ENV HUGGINGFACE_HUB_CACHE=/data \
-    HF_HUB_ENABLE_HF_TRANSFER=1 \
-    PORT=80 \
-    LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.8/targets/x86_64-linux/lib:/opt/conda/lib/python3.9/site-packages/torch/lib
-
-WORKDIR /usr/src
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    libssl-dev \
-    ca-certificates \
-    make \
-    && rm -rf /var/lib/apt/lists/*
-
-# Copy conda with PyTorch installed
-COPY --from=pytorch-build /opt/conda /opt/conda
-
-# Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Copy build artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
-
-# Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
-
-# Install server
-COPY proto proto
-COPY server server
-COPY server/Makefile server/Makefile
-
-RUN cd server && \
-    make gen-server && \
-    pip install -r requirements.txt
-
-RUN cd server && \
-    pip install ".[bnb, accelerate, quantize]" --no-cache-dir && \
-    pip install optimum auto-gptq
-
-RUN /opt/conda/bin/conda clean -ya
-
-# Fix the error
-# /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
-RUN cp /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so
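The cp above works around the undefined-symbol error quoted in the comment. A hedged sketch of how one might confirm the copied library actually resolves that symbol (the path is taken from the Dockerfile; this check is not part of the original build):

# Sketch: load the swapped-in library and look up the symbol whose absence
# caused the original bitsandbytes error.
import ctypes

LIB = '/opt/conda/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so'
lib = ctypes.CDLL(LIB)
assert hasattr(lib, 'cadam32bit_grad_fp32'), 'symbol still missing'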
-
-RUN ldd /opt/conda/lib/python3.9/site-packages/exllama_kernels.cpython-39-x86_64-linux-gnu.so
-
-RUN nm -D /opt/conda/lib/python3.9/site-packages/torch/lib/libtorch.so
-
-RUN python3 -c "import torch; import text_generation_server.utils.gptq.exllama"
-
-# RUN ls /opt/conda/lib/python3.9/site-packages/bitsandbytes/
-# RUN find / -name libcudart.so 2>/dev/null
-# RUN find / -name "*libc10.so*"
-# RUN find / -name libtorch.so
-
-# Make sure our special dependencies were compiled and copied correctly
-# RUN python -c "import torch; exit(1 if not torch.version.cuda else 0)"
-# RUN python -c "import torch; torch.cuda.is_available()"
-RUN python -c "import torch; import flash_attn_2_cuda"
-RUN python -c "import torch; import flash_attn_cuda"
-RUN python -c "import torch; import vllm_cache_ops"
-RUN python -c "import torch; import vllm_attention_ops"
-RUN python -c "import torch; import custom_kernels"
-RUN python -c "import torch; import text_generation_server.utils.gptq.exllama"
-
-# Install benchmarker
-COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
-# Install router
-COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
-# Install launcher
-COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
-
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-    g++ \
-    && rm -rf /var/lib/apt/lists/*
-
-# ==============================================================================
-
-# AWS Sagemaker compatible image
-FROM base AS sagemaker
-COPY sagemaker-entrypoint.sh entrypoint.sh
-RUN chmod +x entrypoint.sh
-ENTRYPOINT ["./entrypoint.sh"]
-
-# ==============================================================================
-
-# Final image
-FROM base
-ENTRYPOINT ["text-generation-launcher"]
-CMD ["--json-output"]
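The deleted file's chain of single-purpose RUN python -c checks could be collapsed into one smoke test. A sketch using the module names from those checks:

# Sketch: one script covering the compiled-kernel imports that the deleted
# Dockerfile verified line by line.
import importlib

import torch  # noqa: F401 - load torch first so its shared libraries resolve

KERNEL_MODULES = [
    'flash_attn_2_cuda',
    'flash_attn_cuda',
    'vllm_cache_ops',
    'vllm_attention_ops',
    'custom_kernels',
    'text_generation_server.utils.gptq.exllama',
]

for name in KERNEL_MODULES:
    importlib.import_module(name)  # raises ImportError on a broken build
    print(f'ok: {name}')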