update to metrics 0.23.0 so it can work with metrics-exporter-prometheus 0.15.1 (#2190)

update to metrics 0.23.0 so it can work with metrics-exporter-prometheus 0.15.1

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
parent 16d9e505fd
commit 58effe78b5
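The change below is mechanical: metrics 0.23 drops the value-taking macros (increment_counter!, increment_gauge!, and the two-argument gauge!/histogram! forms) in favour of macros that return a handle, so every call site becomes counter!(..).increment(1), gauge!(..).set(..) or .increment(..), and histogram!(..).record(..). A minimal sketch of the before/after pattern, using metric names from this diff inside a hypothetical record_request helper (not a function from the repository):

    use std::time::Duration;

    fn record_request(total_time: Duration) {
        // metrics 0.21 style (removed by this commit):
        //   metrics::increment_counter!("tgi_request_count");
        //   metrics::increment_gauge!("tgi_queue_size", 1.0);
        //   metrics::histogram!("tgi_request_duration", total_time.as_secs_f64());
        //
        // metrics 0.23 style: the macro returns a Counter/Gauge/Histogram handle
        // and the value is applied through a method on it.
        metrics::counter!("tgi_request_count").increment(1);
        metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
        metrics::gauge!("tgi_queue_size").increment(1.0);
        metrics::gauge!("tgi_batch_current_size").set(0.0);
        metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
    }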
@@ -1935,17 +1935,6 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
-[[package]]
-name = "metrics"
-version = "0.21.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fde3af1a009ed76a778cb84fdef9e7dbbdf5775ae3e4cc1f434a6a307f6f76c5"
-dependencies = [
-"ahash",
-"metrics-macros",
-"portable-atomic",
-]
-
 [[package]]
 name = "metrics"
 version = "0.23.0"
@@ -1969,7 +1958,7 @@ dependencies = [
 "hyper-util",
 "indexmap 2.2.6",
 "ipnet",
-"metrics 0.23.0",
+"metrics",
 "metrics-util",
 "quanta",
 "thiserror",
@@ -1977,17 +1966,6 @@ dependencies = [
 "tracing",
 ]
 
-[[package]]
-name = "metrics-macros"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38b4faf00617defe497754acde3024865bc143d44a86799b24e191ecff91354f"
-dependencies = [
-"proc-macro2",
-"quote",
-"syn 2.0.68",
-]
-
 [[package]]
 name = "metrics-util"
 version = "0.17.0"
@@ -1997,7 +1975,7 @@ dependencies = [
 "crossbeam-epoch",
 "crossbeam-utils",
 "hashbrown 0.14.5",
-"metrics 0.23.0",
+"metrics",
 "num_cpus",
 "quanta",
 "sketches-ddsketch",
@@ -3834,7 +3812,7 @@ dependencies = [
 "init-tracing-opentelemetry",
 "itertools 0.10.5",
 "jsonschema",
-"metrics 0.21.1",
+"metrics",
 "metrics-exporter-prometheus",
 "minijinja",
 "minijinja-contrib",
@@ -24,7 +24,7 @@ futures = "0.3.28"
 hf-hub = { workspace = true }
 itertools = "0.10"
 jsonschema = { version = "0.17.1", features = ["draft202012"] }
-metrics = "0.21.1"
+metrics = "0.23.0"
 metrics-exporter-prometheus = { version = "0.15.1", features = [] }
 nohash-hasher = "0.2.0"
 opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }
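The dependency bump and the lockfile change go together: metrics-exporter-prometheus 0.15 implements the recorder interface of metrics 0.23, so the router's own metrics dependency has to move to the same version. A standalone sketch (not repository code) of the two crates working together after the bump:

    use metrics_exporter_prometheus::PrometheusBuilder;

    fn main() {
        // Install the Prometheus recorder globally, without the HTTP listener.
        let handle = PrometheusBuilder::new()
            .install_recorder()
            .expect("failed to install Prometheus recorder");

        // One of the call sites rewritten in this diff, in its 0.23 form.
        metrics::counter!("tgi_request_count").increment(1);

        // The rendered scrape text should now contain a tgi_request_count sample.
        println!("{}", handle.render());
    }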
@@ -91,14 +91,14 @@ impl Infer {
 .limit_concurrent_requests
 .try_acquire_owned()
 .map_err(|err| {
-metrics::increment_counter!("tgi_request_failure", "err" => "overloaded");
+metrics::counter!("tgi_request_failure", "err" => "overloaded").increment(1);
 tracing::error!("{err}");
 err
 })?;
 
 // Validate request
 let valid_request = self.validation.validate(request).await.map_err(|err| {
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 tracing::error!("{err}");
 err
 })?;
@@ -140,7 +140,7 @@ impl Infer {
 .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
 .apply(messages, grammar_with_prompt)
 .map_err(|e| {
-metrics::increment_counter!("tgi_request_failure", "err" => "template");
+metrics::counter!("tgi_request_failure", "err" => "template").increment(1);
 tracing::error!("{e}");
 e
 })
@@ -214,7 +214,7 @@ impl Infer {
 })
 } else {
 let err = InferError::IncompleteGeneration;
-metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
+metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
 tracing::error!("{err}");
 Err(err)
 }
@@ -111,7 +111,7 @@ async fn queue_task(
 match cmd {
 QueueCommand::Append(entry, span) => {
 span.in_scope(|| state.append(*entry));
-metrics::increment_gauge!("tgi_queue_size", 1.0);
+metrics::gauge!("tgi_queue_size").increment(1.0);
 }
 QueueCommand::NextBatch {
 min_size,
@@ -124,7 +124,7 @@ async fn queue_task(
 let next_batch =
 state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
 response_sender.send(next_batch).unwrap();
-metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
+metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64);
 }),
 }
 }
@@ -226,7 +226,7 @@ impl State {
 // Filter entries where the response receiver was dropped (== entries where the request
 // was dropped by the client)
 if entry.response_tx.is_closed() {
-metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
 tracing::debug!("Dropping entry");
 continue;
 }
@@ -336,7 +336,7 @@ impl State {
 // Increment batch id
 self.next_batch_id += 1;
 
-metrics::histogram!("tgi_batch_next_size", batch.size as f64);
+metrics::histogram!("tgi_batch_next_size").record(batch.size as f64);
 
 Some((batch_entries, batch, next_batch_span))
 }
@@ -148,8 +148,8 @@ pub(crate) async fn batching_task(
 let batch_size = batch.size;
 let batch_max_tokens = batch.max_tokens;
 let mut batches = vec![batch];
-metrics::gauge!("tgi_batch_current_size", batch_size as f64);
-metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64);
+metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
+metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
 
 let min_size = if waiting_tokens >= max_waiting_tokens {
 // If we didn't onboard any new requests since >= max_waiting_tokens, we try
@@ -170,9 +170,11 @@ pub(crate) async fn batching_task(
 {
 // Tracking metrics
 if min_size.is_some() {
-metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure");
+metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
+.increment(1);
 } else {
-metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded");
+metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
+.increment(1);
 }
 
 entries.iter_mut().for_each(|(_, entry)| {
@@ -219,8 +221,8 @@ pub(crate) async fn batching_task(
 .await;
 waiting_tokens += 1;
 }
-metrics::gauge!("tgi_batch_current_size", 0.0);
-metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
+metrics::gauge!("tgi_batch_current_size").set(0.0);
+metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
 }
 }
 }
@@ -234,7 +236,7 @@ async fn prefill(
 ) -> Option<CachedBatch> {
 let start_time = Instant::now();
 let batch_id = batch.id;
-metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill");
+metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
 
 match client.prefill(batch).await {
 Ok((generations, next_batch, timings)) => {
@@ -248,11 +250,15 @@ async fn prefill(
 // Filter next batch and remove requests that were stopped
 let next_batch = filter_batch(client, next_batch, entries).await;
 
-metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill");
-metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill");
-metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "prefill");
-metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "prefill");
-metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
+metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
+.record(timings.forward.as_secs_f64());
+metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
+.record(timings.decode.as_secs_f64());
+metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
+.record(start_filtering_time.elapsed().as_secs_f64());
+metrics::histogram!("tgi_batch_inference_duration","method" => "prefill")
+.record(start_time.elapsed().as_secs_f64());
+metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
 next_batch
 }
 // If we have an error, we discard the whole batch
@@ -261,7 +267,7 @@ async fn prefill(
 generation_health.store(false, Ordering::SeqCst);
 let _ = client.clear_cache(Some(batch_id)).await;
 send_errors(err, entries);
-metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
+metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
 None
 }
 }
@@ -276,7 +282,7 @@ async fn decode(
 ) -> Option<CachedBatch> {
 let start_time = Instant::now();
 let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
-metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode");
+metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
 
 match client.decode(batches).await {
 Ok((generations, next_batch, timings)) => {
@@ -291,13 +297,18 @@ async fn decode(
 let next_batch = filter_batch(client, next_batch, entries).await;
 
 if let Some(concat_duration) = timings.concat {
-metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode");
+metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
+.record(concat_duration.as_secs_f64());
 }
-metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode");
-metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode");
-metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "decode");
-metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "decode");
-metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
+metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
+.record(timings.forward.as_secs_f64());
+metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
+.record(timings.decode.as_secs_f64());
+metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
+.record(start_filtering_time.elapsed().as_secs_f64());
+metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
+.record(start_time.elapsed().as_secs_f64());
+metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
 next_batch
 }
 // If we have an error, we discard the whole batch
@@ -307,7 +318,7 @@ async fn decode(
 let _ = client.clear_cache(Some(id)).await;
 }
 send_errors(err, entries);
-metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
+metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
 None
 }
 }
@@ -365,7 +376,7 @@ fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u6
 // request and we need to stop generating hence why we unwrap_or(true)
 let stopped = send_responses(generation, entry).map_err(|err| {
 tracing::error!("Entry response channel error.");
-metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
 err
 }).unwrap_or(true);
 if stopped {
@@ -381,7 +392,7 @@ fn send_responses(
 ) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
 // Return directly if the channel is disconnected
 if entry.response_tx.is_closed() {
-metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
 return Ok(true);
 }
 
@@ -407,7 +418,7 @@ fn send_responses(
 // Create last Token
 let tokens_ = generation.tokens.expect("Non empty tokens in generation");
 let n = tokens_.ids.len();
-metrics::histogram!("tgi_request_skipped_tokens", (n - 1) as f64);
+metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
 let mut iterator = tokens_
 .ids
 .into_iter()
@@ -472,7 +483,7 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
 // Create and enter a span to link this function back to the entry
 let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
 let err = InferError::GenerationError(error.to_string());
-metrics::increment_counter!("tgi_request_failure", "err" => "generation");
+metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
 tracing::error!("{err}");
 
 // unwrap_or is valid here as we don't care if the receiver is gone.
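The hunks below apply the same rewrite to a second copy of the queue and batching code. One side effect of the new API worth noting: because counter!, gauge!, and histogram! now return handles (metrics::Counter and friends), a handle can be bound once and reused, although the call sites in this diff keep the one-line form. An illustrative sketch, not repository code:

    fn drop_stale_entries(n_stale: usize) {
        // Hypothetical helper: hoist the handle out of the loop instead of
        // re-invoking the macro on every iteration.
        let dropped: metrics::Counter =
            metrics::counter!("tgi_request_failure", "err" => "dropped");
        for _ in 0..n_stale {
            dropped.increment(1);
        }
    }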
@@ -126,7 +126,7 @@ async fn queue_task(
 match cmd {
 QueueCommand::Append(entry, span) => {
 span.in_scope(|| state.append(*entry));
-metrics::increment_gauge!("tgi_queue_size", 1.0);
+metrics::gauge!("tgi_queue_size").increment(1.0);
 }
 QueueCommand::NextBatch {
 min_size,
@@ -141,7 +141,7 @@ async fn queue_task(
 .instrument(span)
 .await;
 response_sender.send(next_batch).unwrap();
-metrics::gauge!("tgi_queue_size", state.entries.len() as f64);
+metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64);
 }
 }
 }
@@ -248,7 +248,7 @@ impl State {
 // Filter entries where the response receiver was dropped (== entries where the request
 // was dropped by the client)
 if entry.response_tx.is_closed() {
-metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
 tracing::debug!("Dropping entry");
 continue;
 }
@@ -399,7 +399,7 @@ impl State {
 // Increment batch id
 self.next_batch_id += 1;
 
-metrics::histogram!("tgi_batch_next_size", batch.size as f64);
+metrics::histogram!("tgi_batch_next_size").record(batch.size as f64);
 
 Some((batch_entries, batch, next_batch_span))
 }
@@ -154,8 +154,8 @@ pub(crate) async fn batching_task(
 let batch_size = batch.size;
 let batch_max_tokens = batch.max_tokens;
 let mut batches = vec![batch];
-metrics::gauge!("tgi_batch_current_size", batch_size as f64);
-metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64);
+metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
+metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
 
 let min_size = if waiting_tokens >= max_waiting_tokens {
 // If we didn't onboard any new requests since >= max_waiting_tokens, we try
@@ -176,9 +176,11 @@ pub(crate) async fn batching_task(
 {
 // Tracking metrics
 if min_size.is_some() {
-metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure");
+metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
+.increment(1);
 } else {
-metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded");
+metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
+.increment(1);
 }
 
 entries.iter_mut().for_each(|(_, entry)| {
@@ -225,8 +227,8 @@ pub(crate) async fn batching_task(
 .await;
 waiting_tokens += 1;
 }
-metrics::gauge!("tgi_batch_current_size", 0.0);
-metrics::gauge!("tgi_batch_current_max_tokens", 0.0);
+metrics::gauge!("tgi_batch_current_size").set(0.0);
+metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
 }
 }
 }
@@ -240,7 +242,7 @@ async fn prefill(
 ) -> Option<CachedBatch> {
 let start_time = Instant::now();
 let batch_id = batch.id;
-metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill");
+metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
 
 match client.prefill(batch).await {
 Ok((generations, next_batch, timings)) => {
@@ -254,11 +256,15 @@ async fn prefill(
 // Filter next batch and remove requests that were stopped
 let next_batch = filter_batch(client, next_batch, entries).await;
 
-metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill");
-metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill");
-metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "prefill");
-metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "prefill");
-metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill");
+metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
+.record(timings.forward.as_secs_f64());
+metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
+.record(timings.decode.as_secs_f64());
+metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
+.record(start_filtering_time.elapsed().as_secs_f64());
+metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
+.record(start_time.elapsed().as_secs_f64());
+metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
 next_batch
 }
 // If we have an error, we discard the whole batch
@@ -267,7 +273,7 @@ async fn prefill(
 generation_health.store(false, Ordering::SeqCst);
 let _ = client.clear_cache(Some(batch_id)).await;
 send_errors(err, entries);
-metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill");
+metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
 None
 }
 }
@@ -282,7 +288,7 @@ async fn decode(
 ) -> Option<CachedBatch> {
 let start_time = Instant::now();
 let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
-metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode");
+metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
 
 match client.decode(batches).await {
 Ok((generations, next_batch, timings)) => {
@@ -297,13 +303,18 @@ async fn decode(
 let next_batch = filter_batch(client, next_batch, entries).await;
 
 if let Some(concat_duration) = timings.concat {
-metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode");
+metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
+.record(concat_duration.as_secs_f64());
 }
-metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode");
-metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode");
-metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "decode");
-metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "decode");
-metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode");
+metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
+.record(timings.forward.as_secs_f64());
+metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
+.record(timings.decode.as_secs_f64());
+metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
+.record(start_filtering_time.elapsed().as_secs_f64());
+metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
+.record(start_time.elapsed().as_secs_f64());
+metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
 next_batch
 }
 // If we have an error, we discard the whole batch
@@ -313,7 +324,7 @@ async fn decode(
 let _ = client.clear_cache(Some(id)).await;
 }
 send_errors(err, entries);
-metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode");
+metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
 None
 }
 }
@@ -371,7 +382,7 @@ fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u6
 // request and we need to stop generating hence why we unwrap_or(true)
 let stopped = send_responses(generation, entry).map_err(|err| {
 tracing::error!("Entry response channel error.");
-metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
 err
 }).unwrap_or(true);
 if stopped {
@@ -387,7 +398,7 @@ fn send_responses(
 ) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
 // Return directly if the channel is disconnected
 if entry.response_tx.is_closed() {
-metrics::increment_counter!("tgi_request_failure", "err" => "dropped");
+metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
 return Ok(true);
 }
 
@@ -413,7 +424,7 @@ fn send_responses(
 // Create last Token
 let tokens_ = generation.tokens.expect("Non empty tokens in generation");
 let n = tokens_.ids.len();
-metrics::histogram!("tgi_request_skipped_tokens", (n - 1) as f64);
+metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
 let mut iterator = tokens_
 .ids
 .into_iter()
@@ -478,7 +489,7 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
 // Create and enter a span to link this function back to the entry
 let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
 let err = InferError::GenerationError(error.to_string());
-metrics::increment_counter!("tgi_request_failure", "err" => "generation");
+metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
 tracing::error!("{err}");
 
 // unwrap_or is valid here as we don't care if the receiver is gone.
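The HTTP handlers below follow the same pattern. For labeled metrics, the labels stay inside the macro invocation and only the observed value moves into the method call, as in the forward/decode duration histograms above. A hypothetical timing helper showing that form (not a function from the repository):

    use std::time::Instant;

    fn time_forward<F: FnOnce()>(method: &'static str, forward: F) {
        let start = Instant::now();
        forward();
        // Label stays in the macro; the observed value goes to .record().
        metrics::histogram!("tgi_batch_forward_duration", "method" => method)
            .record(start.elapsed().as_secs_f64());
    }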
@@ -185,7 +185,7 @@ pub(crate) async fn generate_internal(
 span: tracing::Span,
 ) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
 let start_time = Instant::now();
-metrics::increment_counter!("tgi_request_count");
+metrics::counter!("tgi_request_count").increment(1);
 
 // Do not long ultra long inputs, like image payloads.
 tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);
@@ -301,25 +301,15 @@ pub(crate) async fn generate_internal(
 );
 
 // Metrics
-metrics::increment_counter!("tgi_request_success");
-metrics::histogram!("tgi_request_duration", total_time.as_secs_f64());
-metrics::histogram!(
-"tgi_request_validation_duration",
-validation_time.as_secs_f64()
-);
-metrics::histogram!("tgi_request_queue_duration", queue_time.as_secs_f64());
-metrics::histogram!(
-"tgi_request_inference_duration",
-inference_time.as_secs_f64()
-);
-metrics::histogram!(
-"tgi_request_mean_time_per_token_duration",
-time_per_token.as_secs_f64()
-);
-metrics::histogram!(
-"tgi_request_generated_tokens",
-response.generated_text.generated_tokens as f64
-);
+metrics::counter!("tgi_request_success").increment(1);
+metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
+metrics::histogram!("tgi_request_validation_duration").record(validation_time.as_secs_f64());
+metrics::histogram!("tgi_request_queue_duration").record(queue_time.as_secs_f64());
+metrics::histogram!("tgi_request_inference_duration").record(inference_time.as_secs_f64());
+metrics::histogram!("tgi_request_mean_time_per_token_duration")
+.record(time_per_token.as_secs_f64());
+metrics::histogram!("tgi_request_generated_tokens")
+.record(response.generated_text.generated_tokens as f64);
 
 // Send response
 let mut output_text = response.generated_text.text;
@@ -399,7 +389,7 @@ async fn generate_stream_internal(
 span: tracing::Span,
 ) -> (HeaderMap, impl Stream<Item = Result<Event, Infallible>>) {
 let start_time = Instant::now();
-metrics::increment_counter!("tgi_request_count");
+metrics::counter!("tgi_request_count").increment(1);
 
 tracing::debug!("Input: {}", req.inputs);
 
@@ -427,12 +417,12 @@ async fn generate_stream_internal(
 let best_of = req.parameters.best_of.unwrap_or(1);
 if best_of != 1 {
 let err = InferError::from(ValidationError::BestOfStream);
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 tracing::error!("{err}");
 yield Ok(Event::from(err));
 } else if req.parameters.decoder_input_details {
 let err = InferError::from(ValidationError::PrefillDetailsStream);
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 tracing::error!("{err}");
 yield Ok(Event::from(err));
 } else {
@@ -500,13 +490,13 @@ async fn generate_stream_internal(
 span.record("seed", format!("{:?}", generated_text.seed));
 
 // Metrics
-metrics::increment_counter!("tgi_request_success");
-metrics::histogram!("tgi_request_duration", total_time.as_secs_f64());
-metrics::histogram!("tgi_request_validation_duration", validation_time.as_secs_f64());
-metrics::histogram!("tgi_request_queue_duration", queue_time.as_secs_f64());
-metrics::histogram!("tgi_request_inference_duration", inference_time.as_secs_f64());
-metrics::histogram!("tgi_request_mean_time_per_token_duration", time_per_token.as_secs_f64());
-metrics::histogram!("tgi_request_generated_tokens", generated_text.generated_tokens as f64);
+metrics::counter!("tgi_request_success").increment(1);
+metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
+metrics::histogram!("tgi_request_validation_duration").record(validation_time.as_secs_f64());
+metrics::histogram!("tgi_request_queue_duration").record(queue_time.as_secs_f64());
+metrics::histogram!("tgi_request_inference_duration").record(inference_time.as_secs_f64());
+metrics::histogram!("tgi_request_mean_time_per_token_duration").record(time_per_token.as_secs_f64());
+metrics::histogram!("tgi_request_generated_tokens").record(generated_text.generated_tokens as f64);
 
 // StreamResponse
 end_reached = true;
@@ -553,7 +543,7 @@ async fn generate_stream_internal(
 // Skip if we already sent an error
 if !end_reached && !error {
 let err = InferError::IncompleteGeneration;
-metrics::increment_counter!("tgi_request_failure", "err" => "incomplete");
+metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
 tracing::error!("{err}");
 yield Ok(Event::from(err));
 }
@@ -604,7 +594,7 @@ async fn completions(
 Json(req): Json<CompletionRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
 let span = tracing::Span::current();
-metrics::increment_counter!("tgi_request_count");
+metrics::counter!("tgi_request_count").increment(1);
 
 let CompletionRequest {
 max_tokens,
@@ -625,7 +615,7 @@ async fn completions(
 
 // if suffix is present throw an error
 if req.suffix.is_some() {
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 return Err((
 StatusCode::UNPROCESSABLE_ENTITY,
 Json(ErrorResponse {
@@ -637,7 +627,7 @@ async fn completions(
 }
 
 if req.prompt.0.len() > info.max_client_batch_size {
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 return Err((
 StatusCode::UNPROCESSABLE_ENTITY,
 Json(ErrorResponse {
@@ -1009,7 +999,7 @@ async fn chat_completions(
 Json(req): Json<ChatRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
 let span = tracing::Span::current();
-metrics::increment_counter!("tgi_request_count");
+metrics::counter!("tgi_request_count").increment(1);
 let ChatRequest {
 logprobs,
 max_tokens,
@@ -1039,7 +1029,7 @@ async fn chat_completions(
 
 // response_format and tools are mutually exclusive
 if response_format.is_some() && tools.as_ref().is_some() {
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 return Err((
 StatusCode::UNPROCESSABLE_ENTITY,
 Json(ErrorResponse {
@@ -1053,7 +1043,7 @@ async fn chat_completions(
 let tool_grammar = match ToolGrammar::apply(tools, tool_choice) {
 Ok(grammar) => grammar,
 Err(err) => {
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 tracing::error!("{err}");
 return Err((
 StatusCode::UNPROCESSABLE_ENTITY,
@@ -1082,7 +1072,7 @@ async fn chat_completions(
 let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) {
 Ok(inputs) => inputs,
 Err(err) => {
-metrics::increment_counter!("tgi_request_failure", "err" => "validation");
+metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
 tracing::error!("{err}");
 return Err((
 StatusCode::UNPROCESSABLE_ENTITY,
@@ -1280,7 +1270,7 @@ async fn vertex_compatibility(
 Json(req): Json<VertexRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
 let span = tracing::Span::current();
-metrics::increment_counter!("tgi_request_count");
+metrics::counter!("tgi_request_count").increment(1);
 
 // check that theres at least one instance
 if req.instances.is_empty() {
@@ -157,7 +157,7 @@ impl Validation {
 ));
 }
 
-metrics::histogram!("tgi_request_input_length", input_length as f64);
+metrics::histogram!("tgi_request_input_length").record(input_length as f64);
 Ok((inputs, input_length, max_new_tokens))
 }
 // Return inputs without validation
@@ -384,7 +384,7 @@ impl Validation {
 ignore_eos_token: false,
 };
 
-metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64);
+metrics::histogram!("tgi_request_max_new_tokens").record(max_new_tokens as f64);
 
 Ok(ValidGenerateRequest {
 inputs,