Cleanup Vertex + Chat (#2553)
* Cleanup Vertex + Chat
* logprobs defaults to false.
* Parameters are optional.
* Fix docs.
* Changing back this logprobs default.
* Fixup doc.
* Let's debug that.
* Not unstable.
* Updating Cargo ?
* Wat?
* Dummy change.
* Trying some other install.
* Trying something.
* Revert everything.
* Update Cargo lock.
* Fixing the pre-commit after rebase.
This commit is contained in:
parent 75c8c54ac9
commit c032280b17
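Note: the two bullets "logprobs defaults to false" and "Parameters are optional" are the behavioral core of this change. Below is a minimal, self-contained sketch (not part of the diff, requires the serde derive and serde_json crates) using stand-in structs that merely mirror the shape of the new VertexChat/VertexParameters types; it shows how `#[serde(default)]` makes the whole parameters object optional and how an absent logprobs only becomes false once `unwrap_or(false)` is applied.

use serde::Deserialize;

// Stand-ins (assumed shapes) for the crate-private VertexChat / VertexParameters.
#[derive(Debug, Default, Deserialize)]
struct Parameters {
    #[serde(default)]
    max_tokens: Option<u32>,
    #[serde(default)]
    logprobs: Option<bool>,
}

#[derive(Debug, Deserialize)]
struct Chat {
    messages: Vec<serde_json::Value>,
    // `#[serde(default)]` is what makes the whole parameters object optional.
    #[serde(default)]
    parameters: Parameters,
}

fn main() {
    // No "parameters" key at all: deserialization still succeeds.
    let chat: Chat = serde_json::from_str(
        r#"{"messages": [{"role": "user", "content": "What's Deep Learning?"}]}"#,
    )
    .expect("parameters are optional");
    assert!(chat.parameters.max_tokens.is_none());
    // logprobs was not provided, so the effective value is false.
    assert!(!chat.parameters.logprobs.unwrap_or(false));
    println!("{chat:?}");
}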
@@ -3,9 +3,8 @@ target
router/tokenizer.json
*__pycache__*

backends/v2/src/client/pb
backends/v3/src/client/pb
backends/client/src/v2/pb
backends/client/src/v3/pb

# ROCm auto-generated files
*.hip
@@ -9,7 +9,10 @@ mod kserve;
pub mod logging;

pub mod usage_stats;
mod vertex;

use crate::infer::{Infer, InferError};
use crate::server::prepare_chat_input;
use serde::{Deserialize, Serialize};
use tracing::warn;
use utoipa::ToSchema;
@@ -54,32 +57,6 @@ impl std::str::FromStr for Attention {
    }
}

#[derive(Clone, Deserialize, ToSchema)]
pub(crate) struct GenerateVertexInstance {
    #[schema(example = "What is Deep Learning?")]
    pub inputs: String,
    #[schema(nullable = true, default = "null", example = "null")]
    pub parameters: Option<GenerateParameters>,
}

#[derive(Clone, Deserialize, ToSchema)]
#[serde(untagged)]
enum VertexInstance {
    Generate(GenerateVertexInstance),
    Chat(ChatRequest),
}

#[derive(Deserialize, ToSchema)]
pub(crate) struct VertexRequest {
    #[serde(rename = "instances")]
    pub instances: Vec<VertexInstance>,
}

#[derive(Clone, Deserialize, ToSchema, Serialize)]
pub(crate) struct VertexResponse {
    pub predictions: Vec<String>,
}

/// Hub type
#[derive(Clone, Debug, Deserialize)]
pub struct HubModelInfo {
@@ -174,6 +151,7 @@ impl HubProcessorConfig {
}

#[derive(Clone, Debug, Deserialize, ToSchema, Serialize)]
#[cfg_attr(test, derive(PartialEq))]
#[serde(tag = "type", content = "value")]
pub(crate) enum GrammarType {
    /// A string that represents a [JSON Schema](https://json-schema.org/).
@@ -230,6 +208,7 @@ pub struct Info {
}

#[derive(Clone, Debug, Deserialize, ToSchema, Default)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct GenerateParameters {
    /// Generate best_of sequences and return the one if the highest token logprobs.
    #[serde(default)]
@@ -774,6 +753,7 @@ impl ChatCompletionChunk {
}

#[derive(Clone, Deserialize, ToSchema, Serialize)]
#[cfg_attr(test, derive(Debug, PartialEq, Default))]
pub(crate) struct ChatRequest {
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
@@ -890,7 +870,82 @@ pub(crate) struct ChatRequest {
    pub stream_options: Option<StreamOptions>,
}

impl ChatRequest {
    fn try_into_generate(self, infer: &Infer) -> Result<(GenerateRequest, bool), InferError> {
        let ChatRequest {
            model,
            max_tokens,
            messages,
            seed,
            stop,
            stream,
            tools,
            tool_choice,
            tool_prompt,
            temperature,
            response_format,
            guideline,
            presence_penalty,
            frequency_penalty,
            top_p,
            top_logprobs,
            ..
        } = self;

        let repetition_penalty = presence_penalty.map(|x| x + 2.0);
        let max_new_tokens = max_tokens.or(Some(100));
        let tool_prompt = tool_prompt
            .filter(|s| !s.is_empty())
            .unwrap_or_else(default_tool_prompt);
        let stop = stop.unwrap_or_default();
        // enable greedy only when temperature is 0
        let (do_sample, temperature) = match temperature {
            Some(temperature) if temperature == 0.0 => (false, None),
            other => (true, other),
        };
        let (inputs, grammar, using_tools) = prepare_chat_input(
            infer,
            response_format,
            tools,
            tool_choice,
            &tool_prompt,
            guideline,
            messages,
        )?;

        Ok((
            GenerateRequest {
                inputs: inputs.to_string(),
                add_special_tokens: false,
                parameters: GenerateParameters {
                    best_of: None,
                    temperature,
                    repetition_penalty,
                    frequency_penalty,
                    top_k: None,
                    top_p,
                    typical_p: None,
                    do_sample,
                    max_new_tokens,
                    return_full_text: None,
                    stop,
                    truncate: None,
                    watermark: false,
                    details: true,
                    decoder_input_details: !stream,
                    seed,
                    top_n_tokens: top_logprobs,
                    grammar,
                    adapter_id: model.filter(|m| *m != "tgi").map(String::from),
                },
            },
            using_tools,
        ))
    }
}
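Aside (not part of the diff): the temperature handling inside try_into_generate is easy to miss in the middle of the large struct literal, so here is the same rule as a tiny standalone sketch; the function name sampling_mode is made up for illustration.

// Mirrors the `match temperature` arm above: a temperature of exactly 0 turns
// sampling off (greedy decoding) and drops the temperature; any other value,
// including None, keeps sampling on.
fn sampling_mode(temperature: Option<f32>) -> (bool, Option<f32>) {
    match temperature {
        Some(t) if t == 0.0 => (false, None),
        other => (true, other),
    }
}

fn main() {
    assert_eq!(sampling_mode(Some(0.0)), (false, None));
    assert_eq!(sampling_mode(Some(0.7)), (true, Some(0.7)));
    assert_eq!(sampling_mode(None), (true, None));
}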
#[derive(Clone, Deserialize, ToSchema, Serialize)]
#[cfg_attr(test, derive(Debug, PartialEq))]
struct StreamOptions {
    /// If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
    #[schema(example = "true")]

@@ -984,6 +1039,7 @@ pub(crate) struct FunctionDefinition {
}

#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct Tool {
    // The type of the tool. Currently, only 'function' is supported.
    #[schema(example = "function")]
@@ -8,7 +8,8 @@ use crate::kserve::{
    kserve_model_metadata, kserve_model_metadata_ready,
};
use crate::validation::ValidationError;
use crate::{default_tool_prompt, ChatTokenizeResponse, VertexInstance};
use crate::vertex::vertex_compatibility;
use crate::ChatTokenizeResponse;
use crate::{
    usage_stats, BestOfSequence, Details, ErrorResponse, FinishReason, FunctionName,
    GenerateParameters, GenerateRequest, GenerateResponse, GrammarType, HubModelInfo,
@@ -20,8 +21,7 @@ use crate::{
    ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete,
    ChatCompletionDelta, ChatCompletionLogprob, ChatCompletionLogprobs, ChatCompletionTopLogprob,
    ChatRequest, Chunk, CompatGenerateRequest, Completion, CompletionComplete, CompletionFinal,
    CompletionRequest, CompletionType, DeltaToolCall, Function, Prompt, Tool, VertexRequest,
    VertexResponse,
    CompletionRequest, CompletionType, DeltaToolCall, Function, Prompt, Tool,
};
use crate::{FunctionDefinition, HubPreprocessorConfig, ToolCall, ToolChoice, ToolType};
use crate::{ModelInfo, ModelsInfo};
@@ -149,63 +149,11 @@ async fn openai_get_model_info(info: Extension<Info>) -> Json<ModelsInfo> {
)]
async fn get_chat_tokenize(
    Extension(infer): Extension<Infer>,
    Json(req): Json<ChatRequest>,
    Json(chat): Json<ChatRequest>,
) -> Result<(HeaderMap, Json<ChatTokenizeResponse>), (StatusCode, Json<ErrorResponse>)> {
    metrics::counter!("tgi_request_count").increment(1);

    let ChatRequest {
        model,
        max_tokens,
        messages,
        seed,
        stop,
        stream,
        tools,
        tool_choice,
        tool_prompt,
        temperature,
        response_format,
        guideline,
        ..
    } = req;

    let tool_prompt = tool_prompt.unwrap_or_default();
    let (inputs, _grammar, _using_tools) = prepare_chat_input(
        &infer,
        response_format,
        tools,
        tool_choice,
        &tool_prompt,
        guideline,
        messages,
    )?;

    let generate_request = GenerateRequest {
        inputs,
        add_special_tokens: false,
        parameters: GenerateParameters {
            best_of: None,
            temperature,
            repetition_penalty: None,
            frequency_penalty: None,
            top_k: None,
            top_p: None,
            typical_p: None,
            do_sample: true,
            max_new_tokens: max_tokens,
            return_full_text: None,
            stop: stop.unwrap_or_default(),
            truncate: None,
            watermark: false,
            details: false,
            decoder_input_details: !stream,
            seed,
            top_n_tokens: None,
            grammar: _grammar,
            adapter_id: model.as_ref().filter(|m| *m != "tgi").map(String::from),
        },
    };

    let generate_request: GenerateRequest = chat.try_into_generate(&infer)?.0;
    let input = generate_request.inputs.clone();
    let encoding = infer.tokenize(generate_request).await?;
    if let Some(encoding) = encoding {
@@ -1162,77 +1110,20 @@ async fn chat_completions(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Extension(info): Extension<Info>,
    Json(req): Json<ChatRequest>,
    Json(chat): Json<ChatRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::counter!("tgi_request_count").increment(1);
    let ChatRequest {
        model,
        logprobs,
        max_tokens,
        messages,
        presence_penalty,
        seed,
        stop,
        stream,
        stream_options,
        tools,
        tool_choice,
        tool_prompt,
        temperature,
        response_format,
        guideline,
        logprobs,
        ..
    } = req;
    } = chat.clone();
    let (generate_request, using_tools): (GenerateRequest, bool) =
        chat.try_into_generate(&infer)?;

    let repetition_penalty = presence_penalty.map(|x| x + 2.0);
    let max_new_tokens = max_tokens.or(Some(100));
    let logprobs = logprobs.unwrap_or(false);
    let tool_prompt = tool_prompt
        .filter(|s| !s.is_empty())
        .unwrap_or_else(default_tool_prompt);
    let stop = stop.unwrap_or_default();
    // enable greedy only when temperature is 0
    let (do_sample, temperature) = match temperature {
        Some(temperature) if temperature == 0.0 => (false, None),
        other => (true, other),
    };
    let (inputs, grammar, using_tools) = prepare_chat_input(
        &infer,
        response_format,
        tools,
        tool_choice,
        &tool_prompt,
        guideline,
        messages,
    )?;

    // build the request passing some parameters
    let generate_request = GenerateRequest {
        inputs: inputs.to_string(),
        add_special_tokens: false,
        parameters: GenerateParameters {
            best_of: None,
            temperature,
            repetition_penalty,
            frequency_penalty: req.frequency_penalty,
            top_k: None,
            top_p: req.top_p,
            typical_p: None,
            do_sample,
            max_new_tokens,
            return_full_text: None,
            stop,
            truncate: None,
            watermark: false,
            details: true,
            decoder_input_details: !stream,
            seed,
            top_n_tokens: req.top_logprobs,
            grammar,
            adapter_id: model.filter(|m| *m != "tgi").map(String::from),
        },
    };
    let logprobs = logprobs.unwrap_or_default();

    // static values that will be returned in all cases
    let model_id = info.model_id.clone();
@@ -1385,186 +1276,6 @@ async fn chat_completions(
    }
}

/// Generate tokens from Vertex request
#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/vertex",
    request_body = VertexRequest,
    responses(
        (status = 200, description = "Generated Text", body = VertexResponse),
        (status = 424, description = "Generation Error", body = ErrorResponse,
            example = json ! ({"error": "Request failed during generation"})),
        (status = 429, description = "Model is overloaded", body = ErrorResponse,
            example = json ! ({"error": "Model is overloaded"})),
        (status = 422, description = "Input validation error", body = ErrorResponse,
            example = json ! ({"error": "Input validation error"})),
        (status = 500, description = "Incomplete generation", body = ErrorResponse,
            example = json ! ({"error": "Incomplete generation"})),
    )
)]
#[instrument(
    skip_all,
    fields(
        total_time,
        validation_time,
        queue_time,
        inference_time,
        time_per_token,
        seed,
    )
)]
async fn vertex_compatibility(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Json(req): Json<VertexRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::counter!("tgi_request_count").increment(1);

    // check that theres at least one instance
    if req.instances.is_empty() {
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Input validation error".to_string(),
                error_type: "Input validation error".to_string(),
            }),
        ));
    }

    // Prepare futures for all instances
    let mut futures = Vec::with_capacity(req.instances.len());

    for instance in req.instances.iter() {
        let generate_request = match instance {
            VertexInstance::Generate(instance) => GenerateRequest {
                inputs: instance.inputs.clone(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    do_sample: true,
                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
                    details: true,
                    decoder_input_details: true,
                    ..Default::default()
                },
            },
            VertexInstance::Chat(instance) => {
                let ChatRequest {
                    model,
                    max_tokens,
                    messages,
                    seed,
                    stop,
                    stream,
                    tools,
                    tool_choice,
                    tool_prompt,
                    temperature,
                    response_format,
                    guideline,
                    presence_penalty,
                    frequency_penalty,
                    top_p,
                    top_logprobs,
                    ..
                } = instance.clone();

                let repetition_penalty = presence_penalty.map(|x| x + 2.0);
                let max_new_tokens = max_tokens.or(Some(100));
                let tool_prompt = tool_prompt
                    .filter(|s| !s.is_empty())
                    .unwrap_or_else(default_tool_prompt);
                let stop = stop.unwrap_or_default();
                // enable greedy only when temperature is 0
                let (do_sample, temperature) = match temperature {
                    Some(temperature) if temperature == 0.0 => (false, None),
                    other => (true, other),
                };
                let (inputs, grammar, _using_tools) = match prepare_chat_input(
                    &infer,
                    response_format,
                    tools,
                    tool_choice,
                    &tool_prompt,
                    guideline,
                    messages,
                ) {
                    Ok(result) => result,
                    Err(e) => {
                        return Err((
                            StatusCode::BAD_REQUEST,
                            Json(ErrorResponse {
                                error: format!("Failed to prepare chat input: {}", e),
                                error_type: "Input preparation error".to_string(),
                            }),
                        ));
                    }
                };

                GenerateRequest {
                    inputs: inputs.to_string(),
                    add_special_tokens: false,
                    parameters: GenerateParameters {
                        best_of: None,
                        temperature,
                        repetition_penalty,
                        frequency_penalty,
                        top_k: None,
                        top_p,
                        typical_p: None,
                        do_sample,
                        max_new_tokens,
                        return_full_text: None,
                        stop,
                        truncate: None,
                        watermark: false,
                        details: true,
                        decoder_input_details: !stream,
                        seed,
                        top_n_tokens: top_logprobs,
                        grammar,
                        adapter_id: model.filter(|m| *m != "tgi").map(String::from),
                    },
                }
            }
        };

        let infer_clone = infer.clone();
        let compute_type_clone = compute_type.clone();
        let span_clone = span.clone();

        futures.push(async move {
            generate_internal(
                Extension(infer_clone),
                compute_type_clone,
                Json(generate_request),
                span_clone,
            )
            .await
            .map(|(_, Json(generation))| generation.generated_text)
            .map_err(|_| {
                (
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "Incomplete generation".into(),
                        error_type: "Incomplete generation".into(),
                    }),
                )
            })
        });
    }

    // execute all futures in parallel, collect results, returning early if any error occurs
    let results = futures::future::join_all(futures).await;
    let predictions: Result<Vec<_>, _> = results.into_iter().collect();
    let predictions = predictions?;

    let response = VertexResponse { predictions };
    Ok((HeaderMap::new(), Json(response)).into_response())
}

/// Tokenize inputs
#[utoipa::path(
    post,
@@ -2637,7 +2348,7 @@ pub enum WebServerError {

type PreparedInput = (String, Option<GrammarType>, bool);

fn prepare_chat_input(
pub(crate) fn prepare_chat_input(
    infer: &Infer,
    response_format: Option<GrammarType>,
    tools: Option<Vec<Tool>>,
@@ -0,0 +1,360 @@
use crate::infer::Infer;
use crate::server::{generate_internal, ComputeType};
use crate::{
    ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest, GrammarType, Message,
    StreamOptions, Tool, ToolChoice,
};
use axum::extract::Extension;
use axum::http::{HeaderMap, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::Json;
use serde::{Deserialize, Serialize};
use tracing::instrument;
use utoipa::ToSchema;

#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct GenerateVertexInstance {
    #[schema(example = "What is Deep Learning?")]
    pub inputs: String,
    #[schema(nullable = true, default = "null", example = "null")]
    pub parameters: Option<GenerateParameters>,
}

#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexChat {
    messages: Vec<Message>,
    // Messages is ignored there.
    #[serde(default)]
    parameters: VertexParameters,
}

#[derive(Clone, Deserialize, ToSchema, Serialize, Default)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexParameters {
    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
    /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
    pub model: Option<String>,

    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
    /// decreasing the model's likelihood to repeat the same line verbatim.
    #[serde(default)]
    #[schema(example = "1.0")]
    pub frequency_penalty: Option<f32>,

    /// UNUSED
    /// Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens
    /// (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,
    /// the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,
    /// but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should
    /// result in a ban or exclusive selection of the relevant token.
    #[serde(default)]
    pub logit_bias: Option<Vec<f32>>,

    /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each
    /// output token returned in the content of message.
    #[serde(default)]
    #[schema(example = "false")]
    pub logprobs: Option<bool>,

    /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with
    /// an associated log probability. logprobs must be set to true if this parameter is used.
    #[serde(default)]
    #[schema(example = "5")]
    pub top_logprobs: Option<u32>,

    /// The maximum number of tokens that can be generated in the chat completion.
    #[serde(default)]
    #[schema(example = "32")]
    pub max_tokens: Option<u32>,

    /// UNUSED
    /// How many chat completion choices to generate for each input message. Note that you will be charged based on the
    /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
    #[serde(default)]
    #[schema(nullable = true, example = "2")]
    pub n: Option<u32>,

    /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,
    /// increasing the model's likelihood to talk about new topics
    #[serde(default)]
    #[schema(nullable = true, example = 0.1)]
    pub presence_penalty: Option<f32>,

    /// Up to 4 sequences where the API will stop generating further tokens.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stop: Option<Vec<String>>,

    #[serde(default = "bool::default")]
    pub stream: bool,

    #[schema(nullable = true, example = 42)]
    pub seed: Option<u64>,

    /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while
    /// lower values like 0.2 will make it more focused and deterministic.
    ///
    /// We generally recommend altering this or `top_p` but not both.
    #[serde(default)]
    #[schema(nullable = true, example = 1.0)]
    pub temperature: Option<f32>,

    /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
    /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
    #[serde(default)]
    #[schema(nullable = true, example = 0.95)]
    pub top_p: Option<f32>,

    /// A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of
    /// functions the model may generate JSON inputs for.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub tools: Option<Vec<Tool>>,

    /// A prompt to be appended before the tools
    #[serde(default)]
    #[schema(
        nullable = true,
        example = "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables."
    )]
    pub tool_prompt: Option<String>,

    /// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub tool_choice: ToolChoice,

    /// Response format constraints for the generation.
    ///
    /// NOTE: A request can use `response_format` OR `tools` but not both.
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub response_format: Option<GrammarType>,

    /// A guideline to be used in the chat_template
    #[serde(default)]
    #[schema(nullable = true, default = "null", example = "null")]
    pub guideline: Option<String>,

    /// Options for streaming response. Only set this when you set stream: true.
    #[serde(default)]
    #[schema(nullable = true, example = "null")]
    pub stream_options: Option<StreamOptions>,
}

impl From<VertexChat> for ChatRequest {
    fn from(val: VertexChat) -> Self {
        Self {
            messages: val.messages,
            frequency_penalty: val.parameters.frequency_penalty,
            guideline: val.parameters.guideline,
            logit_bias: val.parameters.logit_bias,
            logprobs: val.parameters.logprobs,
            max_tokens: val.parameters.max_tokens,
            model: val.parameters.model,
            n: val.parameters.n,
            presence_penalty: val.parameters.presence_penalty,
            response_format: val.parameters.response_format,
            seed: val.parameters.seed,
            stop: val.parameters.stop,
            stream_options: val.parameters.stream_options,
            stream: val.parameters.stream,
            temperature: val.parameters.temperature,
            tool_choice: val.parameters.tool_choice,
            tool_prompt: val.parameters.tool_prompt,
            tools: val.parameters.tools,
            top_logprobs: val.parameters.top_logprobs,
            top_p: val.parameters.top_p,
        }
    }
}

#[derive(Clone, Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
#[serde(untagged)]
pub(crate) enum VertexInstance {
    Generate(GenerateVertexInstance),
    Chat(VertexChat),
}

#[derive(Deserialize, ToSchema)]
#[cfg_attr(test, derive(Debug, PartialEq))]
pub(crate) struct VertexRequest {
    #[serde(rename = "instances")]
    pub instances: Vec<VertexInstance>,
}

#[derive(Clone, Deserialize, ToSchema, Serialize)]
pub(crate) struct VertexResponse {
    pub predictions: Vec<String>,
}

/// Generate tokens from Vertex request
#[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/vertex",
    request_body = VertexRequest,
    responses(
        (status = 200, description = "Generated Text", body = VertexResponse),
        (status = 424, description = "Generation Error", body = ErrorResponse,
            example = json ! ({"error": "Request failed during generation"})),
        (status = 429, description = "Model is overloaded", body = ErrorResponse,
            example = json ! ({"error": "Model is overloaded"})),
        (status = 422, description = "Input validation error", body = ErrorResponse,
            example = json ! ({"error": "Input validation error"})),
        (status = 500, description = "Incomplete generation", body = ErrorResponse,
            example = json ! ({"error": "Incomplete generation"})),
    )
)]
#[instrument(
    skip_all,
    fields(
        total_time,
        validation_time,
        queue_time,
        inference_time,
        time_per_token,
        seed,
    )
)]
pub(crate) async fn vertex_compatibility(
    Extension(infer): Extension<Infer>,
    Extension(compute_type): Extension<ComputeType>,
    Json(req): Json<VertexRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    let span = tracing::Span::current();
    metrics::counter!("tgi_request_count").increment(1);

    // check that theres at least one instance
    if req.instances.is_empty() {
        return Err((
            StatusCode::UNPROCESSABLE_ENTITY,
            Json(ErrorResponse {
                error: "Input validation error".to_string(),
                error_type: "Input validation error".to_string(),
            }),
        ));
    }

    // Prepare futures for all instances
    let mut futures = Vec::with_capacity(req.instances.len());

    for instance in req.instances.into_iter() {
        let generate_request = match instance {
            VertexInstance::Generate(instance) => GenerateRequest {
                inputs: instance.inputs.clone(),
                add_special_tokens: true,
                parameters: GenerateParameters {
                    do_sample: true,
                    max_new_tokens: instance.parameters.as_ref().and_then(|p| p.max_new_tokens),
                    seed: instance.parameters.as_ref().and_then(|p| p.seed),
                    details: true,
                    decoder_input_details: true,
                    ..Default::default()
                },
            },
            VertexInstance::Chat(instance) => {
                let chat_request: ChatRequest = instance.into();
                let (generate_request, _using_tools): (GenerateRequest, bool) =
                    chat_request.try_into_generate(&infer)?;
                generate_request
            }
        };

        let infer_clone = infer.clone();
        let compute_type_clone = compute_type.clone();
        let span_clone = span.clone();

        futures.push(async move {
            generate_internal(
                Extension(infer_clone),
                compute_type_clone,
                Json(generate_request),
                span_clone,
            )
            .await
            .map(|(_, Json(generation))| generation.generated_text)
            .map_err(|_| {
                (
                    StatusCode::INTERNAL_SERVER_ERROR,
                    Json(ErrorResponse {
                        error: "Incomplete generation".into(),
                        error_type: "Incomplete generation".into(),
                    }),
                )
            })
        });
    }

    // execute all futures in parallel, collect results, returning early if any error occurs
    let results = futures::future::join_all(futures).await;
    let predictions: Result<Vec<_>, _> = results.into_iter().collect();
    let predictions = predictions?;

    let response = VertexResponse { predictions };
    Ok((HeaderMap::new(), Json(response)).into_response())
}
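Aside (not part of the diff): the fan-out at the end of the handler pairs futures::future::join_all with a collect into Result, so the first failing instance turns the whole response into an error. A standalone sketch of that pattern, with a made-up predict future standing in for generate_internal (requires the futures and tokio crates):

use futures::future::join_all;

// Stand-in for generate_internal: instance 0 fails, everything else succeeds.
async fn predict(i: u32) -> Result<String, String> {
    if i == 0 {
        Err("Incomplete generation".to_string())
    } else {
        Ok(format!("prediction {i}"))
    }
}

#[tokio::main]
async fn main() {
    // All futures run concurrently; results come back in request order.
    let results = join_all((1..=3).map(predict)).await;
    // Collecting into Result<Vec<_>, _> keeps the first Err and discards the rest.
    let predictions: Result<Vec<String>, String> = results.into_iter().collect();
    assert_eq!(
        predictions,
        Ok(vec![
            "prediction 1".to_string(),
            "prediction 2".to_string(),
            "prediction 3".to_string(),
        ])
    );

    // With a failing instance, the collected result is the error.
    let results = join_all((0..=1).map(predict)).await;
    let predictions: Result<Vec<String>, String> = results.into_iter().collect();
    assert_eq!(predictions, Err("Incomplete generation".to_string()));
}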
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Message, MessageContent};

    #[test]
    fn vertex_deserialization() {
        let string = serde_json::json!({
            "messages": [{"role": "user", "content": "What's Deep Learning?"}],
            "parameters": {
                "max_tokens": 128,
                "top_p": 0.95,
                "temperature": 0.7
            }
        });

        let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize");

        let string = serde_json::json!({
            "messages": [{"role": "user", "content": "What's Deep Learning?"}],
        });

        let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize");

        let string = serde_json::json!({
            "instances": [
                {
                    "messages": [{"role": "user", "content": "What's Deep Learning?"}],
                    "parameters": {
                        "max_tokens": 128,
                        "top_p": 0.95,
                        "temperature": 0.7
                    }
                }
            ]
        });
        let request: VertexRequest = serde_json::from_value(string).expect("Can deserialize");
        assert_eq!(
            request,
            VertexRequest {
                instances: vec![VertexInstance::Chat(VertexChat {
                    messages: vec![Message {
                        role: "user".to_string(),
                        content: MessageContent::SingleText("What's Deep Learning?".to_string()),
                        name: None,
                    },],
                    parameters: VertexParameters {
                        max_tokens: Some(128),
                        top_p: Some(0.95),
                        temperature: Some(0.7),
                        ..Default::default()
                    }
                })]
            }
        );
    }
}