More fixes trtllm (#2342)

* (backend) use parking_lot crate for RwLock fairness

* (docker) let's put rust in the TRTLLM folder when building

* (docker) build ompi with SLURM support

* (launcher) default new server::run parameters to false for now

* (chore) fmt ... why?
Author: Funtowicz Morgan
Committed: 2024-08-14 12:02:05 +02:00 (via GitHub)
Parent: f3b5c69441
Commit: 3f385991b0
5 changed files with 15 additions and 11 deletions

Cargo.lock (generated)

@@ -4028,6 +4028,7 @@ dependencies = [
"cxx",
"cxx-build",
"log",
"parking_lot",
"pkg-config",
"text-generation-router",
"thiserror",

backends/trtllm/Cargo.toml

@@ -8,17 +8,18 @@ homepage.workspace = true
[dependencies]
async-trait = "0.1"
async-stream = "0.3"
+clap = { version = "4.5", features = ["derive"] }
cxx = "1.0"
+log = { version = "0.4", features = [] }
text-generation-router = { path = "../../router" }
tokenizers = { version = "0.19", features = ["hf-hub"] }
tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
tokio-stream = "0.1.15"
-clap = { version = "4.5", features = ["derive"] }
thiserror = "1.0.62"
tracing = "0.1"
tracing-opentelemetry = "0.24"
tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
-log = { version = "0.4", features = [] }
+parking_lot = "0.12"

[build-dependencies]
cmake = "0.1"

Dockerfile_trtllm

@@ -3,7 +3,7 @@ ARG OMPI_VERSION="4.1.6"
# Build dependencies resolver stage
FROM lukemathwalker/cargo-chef:latest AS chef
-WORKDIR /usr/src/text-generation-inference
+WORKDIR /usr/src/text-generation-inference/backends/trtllm
FROM chef AS planner
COPY . .
@@ -42,7 +42,7 @@ RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILE
mkdir /usr/src/mpi && \
tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
cd /usr/src/mpi && \
-./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --without-slurm && \
+./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda && \
make -j all && \
make install && \
rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"
@@ -66,7 +66,7 @@ ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef
# Cache dependencies
-COPY --from=planner /usr/src/text-generation-inference/recipe.json .
+COPY --from=planner /usr/src/text-generation-inference/backends/trtllm/recipe.json .
RUN cargo chef cook --release --recipe-path recipe.json
# Build actual TGI
@@ -79,7 +79,8 @@ COPY . .
COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
-CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release --bin text-generation-backends-trtllm
+cd backends/trtllm && \
+CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
WORKDIR /usr/local/tgi/bin

backends/trtllm/src/backend.rs

@@ -12,12 +12,13 @@ use cxx::UniquePtr;
use log::{error, warn};
use tokenizers::Tokenizer;
use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
-use tokio::sync::RwLock;
use tokio::time::{sleep, Instant};
use tokio_stream::wrappers::UnboundedReceiverStream;
use tokio_stream::{Stream, StreamExt};
use tracing::{instrument, span, Level};
+// use tokio::sync::RwLock;
+use parking_lot::RwLock;
use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
use text_generation_router::validation::ValidationError::UnsupportedModality;
use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError};
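
Context for the RwLock swap above: tokio's own "parking_lot" feature (already enabled in the Cargo.toml hunk) only switches tokio's internal primitives, whereas importing parking_lot::RwLock directly gives a synchronous, task-fair lock whose guards are taken without .await. Below is a minimal, self-contained usage sketch, assuming parking_lot = "0.12" as added above; it is an illustration only, not code from this backend.

// Illustration only, not part of this diff. parking_lot::RwLock is a
// synchronous, task-fair lock (neither readers nor writers can starve the
// other side); its guards are acquired without `.await` and never poison.
use parking_lot::RwLock;
use std::sync::Arc;

fn main() {
    let shared = Arc::new(RwLock::new(Vec::<u32>::new()));

    {
        // Write guard: returned directly; inside async code it should only
        // protect short critical sections that do not cross an await point.
        let mut guard = shared.write();
        guard.push(42);
    } // guard dropped here, releasing the write lock

    // Read guard: any number of readers may hold this concurrently.
    assert_eq!(shared.read().len(), 1);
}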

backends/trtllm/src/main.rs

@@ -1,12 +1,10 @@
+use clap::Parser;
use std::collections::HashMap;
use std::path::PathBuf;
-
-use clap::Parser;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
-
use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
use text_generation_backends_trtllm::TensorRtLlmBackend;
use text_generation_router::server;
+use tokenizers::{FromPretrainedParameters, Tokenizer};
/// App Configuration
#[derive(Parser, Debug)]
@@ -160,6 +158,8 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
messages_api_enabled,
true,
max_client_batch_size,
+false,
+false,
)
.await?;
Ok(())
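
The two trailing "false" arguments above default the parameters newly added to server::run; their names are not visible in this hunk. Purely as an illustration of keeping such positional booleans readable, here is a hedged, self-contained sketch with invented names (nothing below reflects the real server::run signature):

// Hypothetical, standalone illustration; the real `server::run` signature and
// the names of its two new parameters are not shown in the hunk above.
fn run(
    messages_api_enabled: bool,
    max_client_batch_size: usize,
    feature_a_enabled: bool,
    feature_b_enabled: bool,
) {
    println!(
        "messages_api={messages_api_enabled} max_batch={max_client_batch_size} \
         a={feature_a_enabled} b={feature_b_enabled}"
    );
}

fn main() {
    // Bind the defaults to named locals instead of passing bare `false, false`,
    // so the call site documents which features are being kept off for now.
    let feature_a_enabled = false; // hypothetical new flag, off by default
    let feature_b_enabled = false; // hypothetical new flag, off by default
    run(true, 4, feature_a_enabled, feature_b_enabled);
}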