More fixes trtllm (#2342)
* (backend) use parking_lot crate for RwLock fairness
* (docker) let's put rust in the TRTLLM folder when building
* (docker) build ompi with SLURM support
* (launcher) default new server::run parameters to false for now
* (chore) fmt ... why?
parent f3b5c69441
commit 3f385991b0
@@ -4028,6 +4028,7 @@ dependencies = [
  "cxx",
  "cxx-build",
  "log",
+ "parking_lot",
  "pkg-config",
  "text-generation-router",
  "thiserror",
@@ -8,17 +8,18 @@ homepage.workspace = true
 [dependencies]
 async-trait = "0.1"
 async-stream = "0.3"
+clap = { version = "4.5", features = ["derive"] }
 cxx = "1.0"
+log = { version = "0.4", features = [] }
 text-generation-router = { path = "../../router" }
 tokenizers = { version = "0.19", features = ["hf-hub"] }
 tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.15"
-clap = { version = "4.5", features = ["derive"] }
 thiserror = "1.0.62"
 tracing = "0.1"
 tracing-opentelemetry = "0.24"
 tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
-log = { version = "0.4", features = [] }
+parking_lot = "0.12"
 
 [build-dependencies]
 cmake = "0.1"
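Worth noting: the "parking_lot" feature on tokio (already enabled above) only makes tokio use parking_lot's primitives internally; it does not re-export the crate. The new direct parking_lot = "0.12" dependency is what makes the backend's own import compile. A minimal standalone sketch of what the direct dependency enables (illustrative only, not code from this commit):

use parking_lot::RwLock;

fn main() {
    // Synchronous and non-poisoning: read()/write() return guards directly,
    // with no .await and no Result to unwrap.
    let lock = RwLock::new(0u32);
    *lock.write() += 1;
    assert_eq!(*lock.read(), 1);
}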
@@ -3,7 +3,7 @@ ARG OMPI_VERSION="4.1.6"
 
 # Build dependencies resolver stage
 FROM lukemathwalker/cargo-chef:latest AS chef
-WORKDIR /usr/src/text-generation-inference
+WORKDIR /usr/src/text-generation-inference/backends/trtllm
 
 FROM chef AS planner
 COPY . .
@@ -42,7 +42,7 @@ RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILENAME"
     mkdir /usr/src/mpi && \
     tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \
     cd /usr/src/mpi && \
-    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --without-slurm && \
+    ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda && \
     make -j all && \
     make install && \
    rm -rf "/opt/src/$OMPI_TARBALL_FILENAME"

Dropping --without-slurm lets Open MPI's configure auto-detect SLURM and build its SLURM launch/allocation components; that is the "build ompi with SLURM support" item from the commit message.
@@ -66,7 +66,7 @@ ENV PATH="/root/.cargo/bin:$PATH"
 RUN cargo install cargo-chef
 
 # Cache dependencies
-COPY --from=planner /usr/src/text-generation-inference/recipe.json .
+COPY --from=planner /usr/src/text-generation-inference/backends/trtllm/recipe.json .
 RUN cargo chef cook --release --recipe-path recipe.json
 
 # Build actual TGI
@@ -79,7 +79,8 @@ COPY . .
 COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt
 COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$TGI_INSTALL_PREFIX/lib" && \
-    CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release --bin text-generation-backends-trtllm
+    cd backends/trtllm && \
+    CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
 
 FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
 WORKDIR /usr/local/tgi/bin
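CMAKE_INSTALL_PREFIX here is ordinary environment input to cargo build; presumably the crate's build script (note the cmake = "0.1" build-dependency above) is what forwards it to CMake. A sketch of that plumbing, with the CMake source directory and fallback prefix invented for illustration, since the actual build.rs is not part of this diff:

// build.rs -- hedged sketch; assumes the backend forwards CMAKE_INSTALL_PREFIX
// to CMake through the `cmake` crate.
use std::env;

fn main() {
    // Re-run the build script when the caller changes the install prefix.
    println!("cargo:rerun-if-env-changed=CMAKE_INSTALL_PREFIX");

    let prefix = env::var("CMAKE_INSTALL_PREFIX")
        .unwrap_or_else(|_| "/usr/local/tgi".to_string()); // assumed fallback

    // Configure, build, and install the native library under `prefix`.
    let dst = cmake::Config::new("csrc") // hypothetical CMake source dir
        .define("CMAKE_INSTALL_PREFIX", &prefix)
        .build();

    println!("cargo:rustc-link-search=native={}/lib", dst.display());
}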
@@ -12,12 +12,13 @@ use cxx::UniquePtr;
 use log::{error, warn};
 use tokenizers::Tokenizer;
 use tokio::sync::mpsc::{unbounded_channel, UnboundedSender};
-use tokio::sync::RwLock;
 use tokio::time::{sleep, Instant};
 use tokio_stream::wrappers::UnboundedReceiverStream;
 use tokio_stream::{Stream, StreamExt};
 use tracing::{instrument, span, Level};
 
+// use tokio::sync::RwLock;
+use parking_lot::RwLock;
 use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
 use text_generation_router::validation::ValidationError::UnsupportedModality;
 use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError};
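This is the fairness change from the commit message: parking_lot's RwLock uses a task-fair locking policy that avoids reader and writer starvation, and unlike tokio::sync::RwLock it is acquired synchronously. The practical consequence at call sites, sketched below with an invented Vec<u32> state standing in for the backend's real state, is that .read().await / .write().await become plain .read() / .write(), and the guard must live in a tight scope so it is never held across an .await point:

use std::sync::Arc;

// Before: tokio's async RwLock, acquired with .await.
async fn len_async(state: Arc<tokio::sync::RwLock<Vec<u32>>>) -> usize {
    state.read().await.len()
}

// After: parking_lot's sync RwLock; the guard is returned immediately and
// dropped at the end of the expression, well before any .await.
fn len_sync(state: &Arc<parking_lot::RwLock<Vec<u32>>>) -> usize {
    state.read().len()
}

fn main() {
    let rt = tokio::runtime::Builder::new_current_thread()
        .build()
        .unwrap();
    let a = Arc::new(tokio::sync::RwLock::new(vec![1, 2, 3]));
    let b = Arc::new(parking_lot::RwLock::new(vec![1, 2, 3]));
    assert_eq!(rt.block_on(len_async(a)), len_sync(&b));
}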
@@ -1,12 +1,10 @@
+use clap::Parser;
 use std::collections::HashMap;
 use std::path::PathBuf;
-
-use clap::Parser;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
-
 use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
 use text_generation_backends_trtllm::TensorRtLlmBackend;
 use text_generation_router::server;
+use tokenizers::{FromPretrainedParameters, Tokenizer};
 
 /// App Configuration
 #[derive(Parser, Debug)]
@@ -160,6 +158,8 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         messages_api_enabled,
         true,
         max_client_batch_size,
+        false,
+        false,
     )
     .await?;
     Ok(())
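These two false values are the "(launcher) default new server::run parameters to false for now" item. The real parameter names belong to text_generation_router::server::run and are not visible in this hunk; the sketch below only illustrates the call-site pattern of opting out of newly added trailing flags, with every name invented:

// Illustration only -- not the router's actual signature; all names invented.
fn run(
    messages_api_enabled: bool,
    max_client_batch_size: usize,
    new_flag_a: bool, // stand-ins for the two new parameters
    new_flag_b: bool,
) {
    // The TRT-LLM entry point opts out of both new behaviors for now.
    assert!(!new_flag_a && !new_flag_b);
    let _ = (messages_api_enabled, max_client_batch_size);
}

fn main() {
    run(true, 4, false, false);
}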