update to metrics 0.23.0 or could work with metrics-exporter-prometheus 0.15.1 (#2190)

update to metrics 0.23.0 or could work with metrics-exporter-prometheus 0.15.1

Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
This commit is contained in:
Wang, Yi 2024-07-08 22:03:59 +08:00 committed by GitHub
parent 16d9e505fd
commit 58effe78b5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 119 additions and 129 deletions

28
Cargo.lock generated
View File

@ -1935,17 +1935,6 @@ version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "metrics"
version = "0.21.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fde3af1a009ed76a778cb84fdef9e7dbbdf5775ae3e4cc1f434a6a307f6f76c5"
dependencies = [
"ahash",
"metrics-macros",
"portable-atomic",
]
[[package]] [[package]]
name = "metrics" name = "metrics"
version = "0.23.0" version = "0.23.0"
@ -1969,7 +1958,7 @@ dependencies = [
"hyper-util", "hyper-util",
"indexmap 2.2.6", "indexmap 2.2.6",
"ipnet", "ipnet",
"metrics 0.23.0", "metrics",
"metrics-util", "metrics-util",
"quanta", "quanta",
"thiserror", "thiserror",
@ -1977,17 +1966,6 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "metrics-macros"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b4faf00617defe497754acde3024865bc143d44a86799b24e191ecff91354f"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.68",
]
[[package]] [[package]]
name = "metrics-util" name = "metrics-util"
version = "0.17.0" version = "0.17.0"
@ -1997,7 +1975,7 @@ dependencies = [
"crossbeam-epoch", "crossbeam-epoch",
"crossbeam-utils", "crossbeam-utils",
"hashbrown 0.14.5", "hashbrown 0.14.5",
"metrics 0.23.0", "metrics",
"num_cpus", "num_cpus",
"quanta", "quanta",
"sketches-ddsketch", "sketches-ddsketch",
@ -3834,7 +3812,7 @@ dependencies = [
"init-tracing-opentelemetry", "init-tracing-opentelemetry",
"itertools 0.10.5", "itertools 0.10.5",
"jsonschema", "jsonschema",
"metrics 0.21.1", "metrics",
"metrics-exporter-prometheus", "metrics-exporter-prometheus",
"minijinja", "minijinja",
"minijinja-contrib", "minijinja-contrib",

View File

@ -24,7 +24,7 @@ futures = "0.3.28"
hf-hub = { workspace = true } hf-hub = { workspace = true }
itertools = "0.10" itertools = "0.10"
jsonschema = { version = "0.17.1", features = ["draft202012"] } jsonschema = { version = "0.17.1", features = ["draft202012"] }
metrics = "0.21.1" metrics = "0.23.0"
metrics-exporter-prometheus = { version = "0.15.1", features = [] } metrics-exporter-prometheus = { version = "0.15.1", features = [] }
nohash-hasher = "0.2.0" nohash-hasher = "0.2.0"
opentelemetry = { version = "0.20.0", features = ["rt-tokio"] } opentelemetry = { version = "0.20.0", features = ["rt-tokio"] }

View File

@ -91,14 +91,14 @@ impl Infer {
.limit_concurrent_requests .limit_concurrent_requests
.try_acquire_owned() .try_acquire_owned()
.map_err(|err| { .map_err(|err| {
metrics::increment_counter!("tgi_request_failure", "err" => "overloaded"); metrics::counter!("tgi_request_failure", "err" => "overloaded").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
err err
})?; })?;
// Validate request // Validate request
let valid_request = self.validation.validate(request).await.map_err(|err| { let valid_request = self.validation.validate(request).await.map_err(|err| {
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
err err
})?; })?;
@ -140,7 +140,7 @@ impl Infer {
.ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))? .ok_or_else(|| InferError::TemplateError(ErrorKind::TemplateNotFound.into()))?
.apply(messages, grammar_with_prompt) .apply(messages, grammar_with_prompt)
.map_err(|e| { .map_err(|e| {
metrics::increment_counter!("tgi_request_failure", "err" => "template"); metrics::counter!("tgi_request_failure", "err" => "template").increment(1);
tracing::error!("{e}"); tracing::error!("{e}");
e e
}) })
@ -214,7 +214,7 @@ impl Infer {
}) })
} else { } else {
let err = InferError::IncompleteGeneration; let err = InferError::IncompleteGeneration;
metrics::increment_counter!("tgi_request_failure", "err" => "incomplete"); metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
Err(err) Err(err)
} }

View File

@ -111,7 +111,7 @@ async fn queue_task(
match cmd { match cmd {
QueueCommand::Append(entry, span) => { QueueCommand::Append(entry, span) => {
span.in_scope(|| state.append(*entry)); span.in_scope(|| state.append(*entry));
metrics::increment_gauge!("tgi_queue_size", 1.0); metrics::gauge!("tgi_queue_size").increment(1.0);
} }
QueueCommand::NextBatch { QueueCommand::NextBatch {
min_size, min_size,
@ -124,7 +124,7 @@ async fn queue_task(
let next_batch = let next_batch =
state.next_batch(min_size, max_size, prefill_token_budget, token_budget); state.next_batch(min_size, max_size, prefill_token_budget, token_budget);
response_sender.send(next_batch).unwrap(); response_sender.send(next_batch).unwrap();
metrics::gauge!("tgi_queue_size", state.entries.len() as f64); metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64);
}), }),
} }
} }
@ -226,7 +226,7 @@ impl State {
// Filter entries where the response receiver was dropped (== entries where the request // Filter entries where the response receiver was dropped (== entries where the request
// was dropped by the client) // was dropped by the client)
if entry.response_tx.is_closed() { if entry.response_tx.is_closed() {
metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
tracing::debug!("Dropping entry"); tracing::debug!("Dropping entry");
continue; continue;
} }
@ -336,7 +336,7 @@ impl State {
// Increment batch id // Increment batch id
self.next_batch_id += 1; self.next_batch_id += 1;
metrics::histogram!("tgi_batch_next_size", batch.size as f64); metrics::histogram!("tgi_batch_next_size").record(batch.size as f64);
Some((batch_entries, batch, next_batch_span)) Some((batch_entries, batch, next_batch_span))
} }

View File

@ -148,8 +148,8 @@ pub(crate) async fn batching_task(
let batch_size = batch.size; let batch_size = batch.size;
let batch_max_tokens = batch.max_tokens; let batch_max_tokens = batch.max_tokens;
let mut batches = vec![batch]; let mut batches = vec![batch];
metrics::gauge!("tgi_batch_current_size", batch_size as f64); metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64); metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
let min_size = if waiting_tokens >= max_waiting_tokens { let min_size = if waiting_tokens >= max_waiting_tokens {
// If we didn't onboard any new requests since >= max_waiting_tokens, we try // If we didn't onboard any new requests since >= max_waiting_tokens, we try
@ -170,9 +170,11 @@ pub(crate) async fn batching_task(
{ {
// Tracking metrics // Tracking metrics
if min_size.is_some() { if min_size.is_some() {
metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure"); metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
.increment(1);
} else { } else {
metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded"); metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
.increment(1);
} }
entries.iter_mut().for_each(|(_, entry)| { entries.iter_mut().for_each(|(_, entry)| {
@ -219,8 +221,8 @@ pub(crate) async fn batching_task(
.await; .await;
waiting_tokens += 1; waiting_tokens += 1;
} }
metrics::gauge!("tgi_batch_current_size", 0.0); metrics::gauge!("tgi_batch_current_size").set(0.0);
metrics::gauge!("tgi_batch_current_max_tokens", 0.0); metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
} }
} }
} }
@ -234,7 +236,7 @@ async fn prefill(
) -> Option<CachedBatch> { ) -> Option<CachedBatch> {
let start_time = Instant::now(); let start_time = Instant::now();
let batch_id = batch.id; let batch_id = batch.id;
metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill"); metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
match client.prefill(batch).await { match client.prefill(batch).await {
Ok((generations, next_batch, timings)) => { Ok((generations, next_batch, timings)) => {
@ -248,11 +250,15 @@ async fn prefill(
// Filter next batch and remove requests that were stopped // Filter next batch and remove requests that were stopped
let next_batch = filter_batch(client, next_batch, entries).await; let next_batch = filter_batch(client, next_batch, entries).await;
metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill"); metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill"); .record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "prefill"); metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "prefill"); .record(timings.decode.as_secs_f64());
metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill"); metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration","method" => "prefill")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
next_batch next_batch
} }
// If we have an error, we discard the whole batch // If we have an error, we discard the whole batch
@ -261,7 +267,7 @@ async fn prefill(
generation_health.store(false, Ordering::SeqCst); generation_health.store(false, Ordering::SeqCst);
let _ = client.clear_cache(Some(batch_id)).await; let _ = client.clear_cache(Some(batch_id)).await;
send_errors(err, entries); send_errors(err, entries);
metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill"); metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
None None
} }
} }
@ -276,7 +282,7 @@ async fn decode(
) -> Option<CachedBatch> { ) -> Option<CachedBatch> {
let start_time = Instant::now(); let start_time = Instant::now();
let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect(); let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode"); metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
match client.decode(batches).await { match client.decode(batches).await {
Ok((generations, next_batch, timings)) => { Ok((generations, next_batch, timings)) => {
@ -291,13 +297,18 @@ async fn decode(
let next_batch = filter_batch(client, next_batch, entries).await; let next_batch = filter_batch(client, next_batch, entries).await;
if let Some(concat_duration) = timings.concat { if let Some(concat_duration) = timings.concat {
metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode"); metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
.record(concat_duration.as_secs_f64());
} }
metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode"); metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode"); .record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "decode"); metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "decode"); .record(timings.decode.as_secs_f64());
metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode"); metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
next_batch next_batch
} }
// If we have an error, we discard the whole batch // If we have an error, we discard the whole batch
@ -307,7 +318,7 @@ async fn decode(
let _ = client.clear_cache(Some(id)).await; let _ = client.clear_cache(Some(id)).await;
} }
send_errors(err, entries); send_errors(err, entries);
metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode"); metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
None None
} }
} }
@ -365,7 +376,7 @@ fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u6
// request and we need to stop generating hence why we unwrap_or(true) // request and we need to stop generating hence why we unwrap_or(true)
let stopped = send_responses(generation, entry).map_err(|err| { let stopped = send_responses(generation, entry).map_err(|err| {
tracing::error!("Entry response channel error."); tracing::error!("Entry response channel error.");
metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
err err
}).unwrap_or(true); }).unwrap_or(true);
if stopped { if stopped {
@ -381,7 +392,7 @@ fn send_responses(
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> { ) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
// Return directly if the channel is disconnected // Return directly if the channel is disconnected
if entry.response_tx.is_closed() { if entry.response_tx.is_closed() {
metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
return Ok(true); return Ok(true);
} }
@ -407,7 +418,7 @@ fn send_responses(
// Create last Token // Create last Token
let tokens_ = generation.tokens.expect("Non empty tokens in generation"); let tokens_ = generation.tokens.expect("Non empty tokens in generation");
let n = tokens_.ids.len(); let n = tokens_.ids.len();
metrics::histogram!("tgi_request_skipped_tokens", (n - 1) as f64); metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
let mut iterator = tokens_ let mut iterator = tokens_
.ids .ids
.into_iter() .into_iter()
@ -472,7 +483,7 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
// Create and enter a span to link this function back to the entry // Create and enter a span to link this function back to the entry
let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered(); let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
let err = InferError::GenerationError(error.to_string()); let err = InferError::GenerationError(error.to_string());
metrics::increment_counter!("tgi_request_failure", "err" => "generation"); metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
// unwrap_or is valid here as we don't care if the receiver is gone. // unwrap_or is valid here as we don't care if the receiver is gone.

View File

@ -126,7 +126,7 @@ async fn queue_task(
match cmd { match cmd {
QueueCommand::Append(entry, span) => { QueueCommand::Append(entry, span) => {
span.in_scope(|| state.append(*entry)); span.in_scope(|| state.append(*entry));
metrics::increment_gauge!("tgi_queue_size", 1.0); metrics::gauge!("tgi_queue_size").increment(1.0);
} }
QueueCommand::NextBatch { QueueCommand::NextBatch {
min_size, min_size,
@ -141,7 +141,7 @@ async fn queue_task(
.instrument(span) .instrument(span)
.await; .await;
response_sender.send(next_batch).unwrap(); response_sender.send(next_batch).unwrap();
metrics::gauge!("tgi_queue_size", state.entries.len() as f64); metrics::gauge!("tgi_queue_size").set(state.entries.len() as f64);
} }
} }
} }
@ -248,7 +248,7 @@ impl State {
// Filter entries where the response receiver was dropped (== entries where the request // Filter entries where the response receiver was dropped (== entries where the request
// was dropped by the client) // was dropped by the client)
if entry.response_tx.is_closed() { if entry.response_tx.is_closed() {
metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
tracing::debug!("Dropping entry"); tracing::debug!("Dropping entry");
continue; continue;
} }
@ -399,7 +399,7 @@ impl State {
// Increment batch id // Increment batch id
self.next_batch_id += 1; self.next_batch_id += 1;
metrics::histogram!("tgi_batch_next_size", batch.size as f64); metrics::histogram!("tgi_batch_next_size").record(batch.size as f64);
Some((batch_entries, batch, next_batch_span)) Some((batch_entries, batch, next_batch_span))
} }

View File

@ -154,8 +154,8 @@ pub(crate) async fn batching_task(
let batch_size = batch.size; let batch_size = batch.size;
let batch_max_tokens = batch.max_tokens; let batch_max_tokens = batch.max_tokens;
let mut batches = vec![batch]; let mut batches = vec![batch];
metrics::gauge!("tgi_batch_current_size", batch_size as f64); metrics::gauge!("tgi_batch_current_size").set(batch_size as f64);
metrics::gauge!("tgi_batch_current_max_tokens", batch_max_tokens as f64); metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64);
let min_size = if waiting_tokens >= max_waiting_tokens { let min_size = if waiting_tokens >= max_waiting_tokens {
// If we didn't onboard any new requests since >= max_waiting_tokens, we try // If we didn't onboard any new requests since >= max_waiting_tokens, we try
@ -176,9 +176,11 @@ pub(crate) async fn batching_task(
{ {
// Tracking metrics // Tracking metrics
if min_size.is_some() { if min_size.is_some() {
metrics::increment_counter!("tgi_batch_concat", "reason" => "backpressure"); metrics::counter!("tgi_batch_concat", "reason" => "backpressure")
.increment(1);
} else { } else {
metrics::increment_counter!("tgi_batch_concat", "reason" => "wait_exceeded"); metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded")
.increment(1);
} }
entries.iter_mut().for_each(|(_, entry)| { entries.iter_mut().for_each(|(_, entry)| {
@ -225,8 +227,8 @@ pub(crate) async fn batching_task(
.await; .await;
waiting_tokens += 1; waiting_tokens += 1;
} }
metrics::gauge!("tgi_batch_current_size", 0.0); metrics::gauge!("tgi_batch_current_size").set(0.0);
metrics::gauge!("tgi_batch_current_max_tokens", 0.0); metrics::gauge!("tgi_batch_current_max_tokens").set(0.0);
} }
} }
} }
@ -240,7 +242,7 @@ async fn prefill(
) -> Option<CachedBatch> { ) -> Option<CachedBatch> {
let start_time = Instant::now(); let start_time = Instant::now();
let batch_id = batch.id; let batch_id = batch.id;
metrics::increment_counter!("tgi_batch_inference_count", "method" => "prefill"); metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1);
match client.prefill(batch).await { match client.prefill(batch).await {
Ok((generations, next_batch, timings)) => { Ok((generations, next_batch, timings)) => {
@ -254,11 +256,15 @@ async fn prefill(
// Filter next batch and remove requests that were stopped // Filter next batch and remove requests that were stopped
let next_batch = filter_batch(client, next_batch, entries).await; let next_batch = filter_batch(client, next_batch, entries).await;
metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "prefill"); metrics::histogram!("tgi_batch_forward_duration","method" => "prefill")
metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "prefill"); .record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "prefill"); metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill")
metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "prefill"); .record(timings.decode.as_secs_f64());
metrics::increment_counter!("tgi_batch_inference_success", "method" => "prefill"); metrics::histogram!("tgi_batch_filter_duration", "method" => "prefill")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "prefill")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "prefill").increment(1);
next_batch next_batch
} }
// If we have an error, we discard the whole batch // If we have an error, we discard the whole batch
@ -267,7 +273,7 @@ async fn prefill(
generation_health.store(false, Ordering::SeqCst); generation_health.store(false, Ordering::SeqCst);
let _ = client.clear_cache(Some(batch_id)).await; let _ = client.clear_cache(Some(batch_id)).await;
send_errors(err, entries); send_errors(err, entries);
metrics::increment_counter!("tgi_batch_inference_failure", "method" => "prefill"); metrics::counter!("tgi_batch_inference_failure", "method" => "prefill").increment(1);
None None
} }
} }
@ -282,7 +288,7 @@ async fn decode(
) -> Option<CachedBatch> { ) -> Option<CachedBatch> {
let start_time = Instant::now(); let start_time = Instant::now();
let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect(); let batch_ids: Vec<u64> = batches.iter().map(|b| b.id).collect();
metrics::increment_counter!("tgi_batch_inference_count", "method" => "decode"); metrics::counter!("tgi_batch_inference_count", "method" => "decode").increment(1);
match client.decode(batches).await { match client.decode(batches).await {
Ok((generations, next_batch, timings)) => { Ok((generations, next_batch, timings)) => {
@ -297,13 +303,18 @@ async fn decode(
let next_batch = filter_batch(client, next_batch, entries).await; let next_batch = filter_batch(client, next_batch, entries).await;
if let Some(concat_duration) = timings.concat { if let Some(concat_duration) = timings.concat {
metrics::histogram!("tgi_batch_concat_duration", concat_duration.as_secs_f64(), "method" => "decode"); metrics::histogram!("tgi_batch_concat_duration", "method" => "decode")
.record(concat_duration.as_secs_f64());
} }
metrics::histogram!("tgi_batch_forward_duration", timings.forward.as_secs_f64(), "method" => "decode"); metrics::histogram!("tgi_batch_forward_duration", "method" => "decode")
metrics::histogram!("tgi_batch_decode_duration", timings.decode.as_secs_f64(), "method" => "decode"); .record(timings.forward.as_secs_f64());
metrics::histogram!("tgi_batch_filter_duration", start_filtering_time.elapsed().as_secs_f64(), "method" => "decode"); metrics::histogram!("tgi_batch_decode_duration", "method" => "decode")
metrics::histogram!("tgi_batch_inference_duration", start_time.elapsed().as_secs_f64(), "method" => "decode"); .record(timings.decode.as_secs_f64());
metrics::increment_counter!("tgi_batch_inference_success", "method" => "decode"); metrics::histogram!("tgi_batch_filter_duration", "method" => "decode")
.record(start_filtering_time.elapsed().as_secs_f64());
metrics::histogram!("tgi_batch_inference_duration", "method" => "decode")
.record(start_time.elapsed().as_secs_f64());
metrics::counter!("tgi_batch_inference_success", "method" => "decode").increment(1);
next_batch next_batch
} }
// If we have an error, we discard the whole batch // If we have an error, we discard the whole batch
@ -313,7 +324,7 @@ async fn decode(
let _ = client.clear_cache(Some(id)).await; let _ = client.clear_cache(Some(id)).await;
} }
send_errors(err, entries); send_errors(err, entries);
metrics::increment_counter!("tgi_batch_inference_failure", "method" => "decode"); metrics::counter!("tgi_batch_inference_failure", "method" => "decode").increment(1);
None None
} }
} }
@ -371,7 +382,7 @@ fn filter_send_generations(generations: Vec<Generation>, entries: &mut IntMap<u6
// request and we need to stop generating hence why we unwrap_or(true) // request and we need to stop generating hence why we unwrap_or(true)
let stopped = send_responses(generation, entry).map_err(|err| { let stopped = send_responses(generation, entry).map_err(|err| {
tracing::error!("Entry response channel error."); tracing::error!("Entry response channel error.");
metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
err err
}).unwrap_or(true); }).unwrap_or(true);
if stopped { if stopped {
@ -387,7 +398,7 @@ fn send_responses(
) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> { ) -> Result<bool, Box<SendError<Result<InferStreamResponse, InferError>>>> {
// Return directly if the channel is disconnected // Return directly if the channel is disconnected
if entry.response_tx.is_closed() { if entry.response_tx.is_closed() {
metrics::increment_counter!("tgi_request_failure", "err" => "dropped"); metrics::counter!("tgi_request_failure", "err" => "dropped").increment(1);
return Ok(true); return Ok(true);
} }
@ -413,7 +424,7 @@ fn send_responses(
// Create last Token // Create last Token
let tokens_ = generation.tokens.expect("Non empty tokens in generation"); let tokens_ = generation.tokens.expect("Non empty tokens in generation");
let n = tokens_.ids.len(); let n = tokens_.ids.len();
metrics::histogram!("tgi_request_skipped_tokens", (n - 1) as f64); metrics::histogram!("tgi_request_skipped_tokens").record((n - 1) as f64);
let mut iterator = tokens_ let mut iterator = tokens_
.ids .ids
.into_iter() .into_iter()
@ -478,7 +489,7 @@ fn send_errors(error: ClientError, entries: &mut IntMap<u64, Entry>) {
// Create and enter a span to link this function back to the entry // Create and enter a span to link this function back to the entry
let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered(); let _send_error_span = info_span!(parent: entry.temp_span.as_ref().expect("batch_span is None. This is a bug."), "send_error").entered();
let err = InferError::GenerationError(error.to_string()); let err = InferError::GenerationError(error.to_string());
metrics::increment_counter!("tgi_request_failure", "err" => "generation"); metrics::counter!("tgi_request_failure", "err" => "generation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
// unwrap_or is valid here as we don't care if the receiver is gone. // unwrap_or is valid here as we don't care if the receiver is gone.

View File

@ -185,7 +185,7 @@ pub(crate) async fn generate_internal(
span: tracing::Span, span: tracing::Span,
) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> { ) -> Result<(HeaderMap, Json<GenerateResponse>), (StatusCode, Json<ErrorResponse>)> {
let start_time = Instant::now(); let start_time = Instant::now();
metrics::increment_counter!("tgi_request_count"); metrics::counter!("tgi_request_count").increment(1);
// Do not log ultra long inputs, like image payloads. // Do not log ultra long inputs, like image payloads.
tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]); tracing::debug!("Input: {}", &req.inputs[..1000.min(req.inputs.len())]);
@ -301,25 +301,15 @@ pub(crate) async fn generate_internal(
); );
// Metrics // Metrics
metrics::increment_counter!("tgi_request_success"); metrics::counter!("tgi_request_success").increment(1);
metrics::histogram!("tgi_request_duration", total_time.as_secs_f64()); metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
metrics::histogram!( metrics::histogram!("tgi_request_validation_duration").record(validation_time.as_secs_f64());
"tgi_request_validation_duration", metrics::histogram!("tgi_request_queue_duration").record(queue_time.as_secs_f64());
validation_time.as_secs_f64() metrics::histogram!("tgi_request_inference_duration").record(inference_time.as_secs_f64());
); metrics::histogram!("tgi_request_mean_time_per_token_duration")
metrics::histogram!("tgi_request_queue_duration", queue_time.as_secs_f64()); .record(time_per_token.as_secs_f64());
metrics::histogram!( metrics::histogram!("tgi_request_generated_tokens")
"tgi_request_inference_duration", .record(response.generated_text.generated_tokens as f64);
inference_time.as_secs_f64()
);
metrics::histogram!(
"tgi_request_mean_time_per_token_duration",
time_per_token.as_secs_f64()
);
metrics::histogram!(
"tgi_request_generated_tokens",
response.generated_text.generated_tokens as f64
);
// Send response // Send response
let mut output_text = response.generated_text.text; let mut output_text = response.generated_text.text;
@ -399,7 +389,7 @@ async fn generate_stream_internal(
span: tracing::Span, span: tracing::Span,
) -> (HeaderMap, impl Stream<Item = Result<Event, Infallible>>) { ) -> (HeaderMap, impl Stream<Item = Result<Event, Infallible>>) {
let start_time = Instant::now(); let start_time = Instant::now();
metrics::increment_counter!("tgi_request_count"); metrics::counter!("tgi_request_count").increment(1);
tracing::debug!("Input: {}", req.inputs); tracing::debug!("Input: {}", req.inputs);
@ -427,12 +417,12 @@ async fn generate_stream_internal(
let best_of = req.parameters.best_of.unwrap_or(1); let best_of = req.parameters.best_of.unwrap_or(1);
if best_of != 1 { if best_of != 1 {
let err = InferError::from(ValidationError::BestOfStream); let err = InferError::from(ValidationError::BestOfStream);
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
yield Ok(Event::from(err)); yield Ok(Event::from(err));
} else if req.parameters.decoder_input_details { } else if req.parameters.decoder_input_details {
let err = InferError::from(ValidationError::PrefillDetailsStream); let err = InferError::from(ValidationError::PrefillDetailsStream);
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
yield Ok(Event::from(err)); yield Ok(Event::from(err));
} else { } else {
@ -500,13 +490,13 @@ async fn generate_stream_internal(
span.record("seed", format!("{:?}", generated_text.seed)); span.record("seed", format!("{:?}", generated_text.seed));
// Metrics // Metrics
metrics::increment_counter!("tgi_request_success"); metrics::counter!("tgi_request_success").increment(1);
metrics::histogram!("tgi_request_duration", total_time.as_secs_f64()); metrics::histogram!("tgi_request_duration").record(total_time.as_secs_f64());
metrics::histogram!("tgi_request_validation_duration", validation_time.as_secs_f64()); metrics::histogram!("tgi_request_validation_duration").record(validation_time.as_secs_f64());
metrics::histogram!("tgi_request_queue_duration", queue_time.as_secs_f64()); metrics::histogram!("tgi_request_queue_duration").record(queue_time.as_secs_f64());
metrics::histogram!("tgi_request_inference_duration", inference_time.as_secs_f64()); metrics::histogram!("tgi_request_inference_duration").record(inference_time.as_secs_f64());
metrics::histogram!("tgi_request_mean_time_per_token_duration", time_per_token.as_secs_f64()); metrics::histogram!("tgi_request_mean_time_per_token_duration").record(time_per_token.as_secs_f64());
metrics::histogram!("tgi_request_generated_tokens", generated_text.generated_tokens as f64); metrics::histogram!("tgi_request_generated_tokens").record(generated_text.generated_tokens as f64);
// StreamResponse // StreamResponse
end_reached = true; end_reached = true;
@ -553,7 +543,7 @@ async fn generate_stream_internal(
// Skip if we already sent an error // Skip if we already sent an error
if !end_reached && !error { if !end_reached && !error {
let err = InferError::IncompleteGeneration; let err = InferError::IncompleteGeneration;
metrics::increment_counter!("tgi_request_failure", "err" => "incomplete"); metrics::counter!("tgi_request_failure", "err" => "incomplete").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
yield Ok(Event::from(err)); yield Ok(Event::from(err));
} }
@ -604,7 +594,7 @@ async fn completions(
Json(req): Json<CompletionRequest>, Json(req): Json<CompletionRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> { ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
let span = tracing::Span::current(); let span = tracing::Span::current();
metrics::increment_counter!("tgi_request_count"); metrics::counter!("tgi_request_count").increment(1);
let CompletionRequest { let CompletionRequest {
max_tokens, max_tokens,
@ -625,7 +615,7 @@ async fn completions(
// if suffix is present throw an error // if suffix is present throw an error
if req.suffix.is_some() { if req.suffix.is_some() {
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
return Err(( return Err((
StatusCode::UNPROCESSABLE_ENTITY, StatusCode::UNPROCESSABLE_ENTITY,
Json(ErrorResponse { Json(ErrorResponse {
@ -637,7 +627,7 @@ async fn completions(
} }
if req.prompt.0.len() > info.max_client_batch_size { if req.prompt.0.len() > info.max_client_batch_size {
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
return Err(( return Err((
StatusCode::UNPROCESSABLE_ENTITY, StatusCode::UNPROCESSABLE_ENTITY,
Json(ErrorResponse { Json(ErrorResponse {
@ -1009,7 +999,7 @@ async fn chat_completions(
Json(req): Json<ChatRequest>, Json(req): Json<ChatRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> { ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
let span = tracing::Span::current(); let span = tracing::Span::current();
metrics::increment_counter!("tgi_request_count"); metrics::counter!("tgi_request_count").increment(1);
let ChatRequest { let ChatRequest {
logprobs, logprobs,
max_tokens, max_tokens,
@ -1039,7 +1029,7 @@ async fn chat_completions(
// response_format and tools are mutually exclusive // response_format and tools are mutually exclusive
if response_format.is_some() && tools.as_ref().is_some() { if response_format.is_some() && tools.as_ref().is_some() {
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
return Err(( return Err((
StatusCode::UNPROCESSABLE_ENTITY, StatusCode::UNPROCESSABLE_ENTITY,
Json(ErrorResponse { Json(ErrorResponse {
@ -1053,7 +1043,7 @@ async fn chat_completions(
let tool_grammar = match ToolGrammar::apply(tools, tool_choice) { let tool_grammar = match ToolGrammar::apply(tools, tool_choice) {
Ok(grammar) => grammar, Ok(grammar) => grammar,
Err(err) => { Err(err) => {
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
return Err(( return Err((
StatusCode::UNPROCESSABLE_ENTITY, StatusCode::UNPROCESSABLE_ENTITY,
@ -1082,7 +1072,7 @@ async fn chat_completions(
let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) { let inputs = match infer.apply_chat_template(messages, tools_grammar_prompt) {
Ok(inputs) => inputs, Ok(inputs) => inputs,
Err(err) => { Err(err) => {
metrics::increment_counter!("tgi_request_failure", "err" => "validation"); metrics::counter!("tgi_request_failure", "err" => "validation").increment(1);
tracing::error!("{err}"); tracing::error!("{err}");
return Err(( return Err((
StatusCode::UNPROCESSABLE_ENTITY, StatusCode::UNPROCESSABLE_ENTITY,
@ -1280,7 +1270,7 @@ async fn vertex_compatibility(
Json(req): Json<VertexRequest>, Json(req): Json<VertexRequest>,
) -> Result<Response, (StatusCode, Json<ErrorResponse>)> { ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
let span = tracing::Span::current(); let span = tracing::Span::current();
metrics::increment_counter!("tgi_request_count"); metrics::counter!("tgi_request_count").increment(1);
// check that theres at least one instance // check that theres at least one instance
if req.instances.is_empty() { if req.instances.is_empty() {

View File

@ -157,7 +157,7 @@ impl Validation {
)); ));
} }
metrics::histogram!("tgi_request_input_length", input_length as f64); metrics::histogram!("tgi_request_input_length").record(input_length as f64);
Ok((inputs, input_length, max_new_tokens)) Ok((inputs, input_length, max_new_tokens))
} }
// Return inputs without validation // Return inputs without validation
@ -384,7 +384,7 @@ impl Validation {
ignore_eos_token: false, ignore_eos_token: false,
}; };
metrics::histogram!("tgi_request_max_new_tokens", max_new_tokens as f64); metrics::histogram!("tgi_request_max_new_tokens").record(max_new_tokens as f64);
Ok(ValidGenerateRequest { Ok(ValidGenerateRequest {
inputs, inputs,