feat(benchmarker): add summary tables (#368)
parent 218c9adaa5
commit 951930fbff
Cargo.lock
@@ -249,6 +249,12 @@ version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
 
+[[package]]
+name = "bytecount"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -1706,6 +1712,17 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
+[[package]]
+name = "papergrid"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fdfe703c51ddc52887ad78fc69cd2ea78d895ffcd6e955c9d03566db8ab5bb1"
+dependencies = [
+ "bytecount",
+ "fnv",
+ "unicode-width",
+]
+
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2490,6 +2507,30 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "tabled"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da1a2e56bbf7bfdd08aaa7592157a742205459eff774b73bc01809ae2d99dc2a"
+dependencies = [
+ "papergrid",
+ "tabled_derive",
+ "unicode-width",
+]
+
+[[package]]
+name = "tabled_derive"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "tar"
 version = "0.4.38"
@@ -2525,6 +2566,7 @@ dependencies = [
  "ratatui",
  "serde",
  "serde_json",
+ "tabled",
  "text-generation-client",
 "thiserror",
 "tokenizers",
benchmark/Cargo.toml
@@ -20,6 +20,7 @@ crossterm = "0.26"
 float-ord = "0.3.2"
 serde = {version = "1.0.142", features = ["derive"]}
 serde_json = "1.0"
+tabled = "0.12.0"
 text-generation-client = { path = "../router/client" }
 thiserror = "1.0.38"
 tokenizers = "0.13.3"
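The new `tabled` dependency (with its `papergrid` and `tabled_derive` transitive crates above) provides the `Builder`/`Style` API used by the new `table.rs` module further down. A minimal sketch of that pattern, with made-up header and row values:

    use tabled::{builder::Builder, settings::Style};

    fn main() {
        let mut builder = Builder::default();
        // Column names, then one push_record call per row.
        builder.set_header(["Parameter", "Value"]);
        builder.push_record(["Sequence Length", "10"]);
        builder.push_record(["Decode Length", "8"]);

        let mut table = builder.build();
        // Render with pipes and a header separator, like a markdown table.
        table.with(Style::markdown());
        println!("{table}");
    }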
benchmark/src/app.rs
@@ -15,6 +15,7 @@ use tui::{symbols, Frame};
 /// TUI powered App
 pub(crate) struct App {
     pub(crate) running: bool,
+    pub(crate) data: Data,
     completed_runs: Vec<usize>,
     completed_batch: usize,
     current_batch: usize,
@@ -22,12 +23,10 @@ pub(crate) struct App {
     touched_tab: bool,
     zoom: bool,
     is_error: bool,
-    data: Data,
     tokenizer_name: String,
     sequence_length: u32,
     decode_length: u32,
     n_run: usize,
-    batch_size: Vec<u32>,
     receiver: mpsc::Receiver<Result<Message, ClientError>>,
 }
 
@@ -40,7 +39,6 @@ impl App {
         n_run: usize,
         batch_size: Vec<u32>,
     ) -> Self {
-        let data = Data::new(n_run, batch_size.len());
         let current_tab = 0;
 
         let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
@@ -48,8 +46,11 @@ impl App {
         let current_batch = 0;
         let is_error = false;
 
+        let data = Data::new(n_run, batch_size);
+
         Self {
             running: true,
+            data,
             completed_runs,
             completed_batch,
             current_batch,
@@ -57,12 +58,10 @@ impl App {
             touched_tab: false,
             zoom: false,
             is_error,
-            data,
             tokenizer_name,
             sequence_length,
             decode_length,
             n_run,
-            batch_size,
             receiver,
         }
     }
@@ -79,7 +78,7 @@ impl App {
                 code: KeyCode::Tab, ..
             } => {
                 self.touched_tab = true;
-                self.current_tab = (self.current_tab + 1) % self.batch_size.len();
+                self.current_tab = (self.current_tab + 1) % self.data.batch_size.len();
             }
             // Decrease and wrap tab
             KeyEvent {
@@ -90,7 +89,7 @@ impl App {
                 if self.current_tab > 0 {
                     self.current_tab -= 1;
                 } else {
-                    self.current_tab = self.batch_size.len() - 1;
+                    self.current_tab = self.data.batch_size.len() - 1;
                 }
             }
             // Zoom on throughput/latency fig
@@ -137,7 +136,7 @@ impl App {
                 self.data.end_batch(self.current_batch);
                 self.completed_batch += 1;
 
-                if self.current_batch < self.batch_size.len() - 1 {
+                if self.current_batch < self.data.batch_size.len() - 1 {
                     // Only go to next tab if the user never touched the tab keys
                     if !self.touched_tab {
                         self.current_tab += 1;
@@ -156,7 +155,7 @@ impl App {
     /// Render frame
     pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
         let batch_progress =
-            (self.completed_batch as f64 / self.batch_size.len() as f64).clamp(0.0, 1.0);
+            (self.completed_batch as f64 / self.data.batch_size.len() as f64).clamp(0.0, 1.0);
         let run_progress =
             (self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
 
@@ -241,6 +240,7 @@ impl App {
 
         // Batch tabs
         let titles = self
+            .data
            .batch_size
             .iter()
             .map(|b| {
@@ -269,7 +269,7 @@ impl App {
         };
         let batch_gauge = progress_gauge(
             "Total Progress",
-            format!("{} / {}", self.completed_batch, self.batch_size.len()),
+            format!("{} / {}", self.completed_batch, self.data.batch_size.len()),
             batch_progress,
             color,
         );
@@ -347,7 +347,7 @@ impl App {
         // Prefill latency/throughput chart
         let prefill_latency_throughput_chart = latency_throughput_chart(
             &self.data.prefill_batch_latency_throughput,
-            &self.batch_size,
+            &self.data.batch_size,
             self.zoom,
             "Prefill",
         );
@@ -356,7 +356,7 @@ impl App {
         // Decode latency/throughput chart
         let decode_latency_throughput_chart = latency_throughput_chart(
             &self.data.decode_batch_latency_throughput,
-            &self.batch_size,
+            &self.data.batch_size,
             self.zoom,
             "Decode",
         );
@@ -365,31 +365,35 @@ impl App {
 }
 
 /// App internal data struct
-struct Data {
-    prefill_latencies: Vec<Vec<f64>>,
-    prefill_throughputs: Vec<Vec<f64>>,
-    decode_latencies: Vec<Vec<f64>>,
-    decode_token_latencies: Vec<Vec<f64>>,
-    decode_throughputs: Vec<Vec<f64>>,
-    prefill_batch_latency_throughput: Vec<(f64, f64)>,
-    decode_batch_latency_throughput: Vec<(f64, f64)>,
+pub(crate) struct Data {
+    pub(crate) batch_size: Vec<u32>,
+    pub(crate) prefill_latencies: Vec<Vec<f64>>,
+    pub(crate) prefill_throughputs: Vec<Vec<f64>>,
+    pub(crate) decode_latencies: Vec<Vec<f64>>,
+    pub(crate) decode_token_latencies: Vec<Vec<f64>>,
+    pub(crate) decode_throughputs: Vec<Vec<f64>>,
+    pub(crate) prefill_batch_latency_throughput: Vec<(f64, f64)>,
+    pub(crate) decode_batch_latency_throughput: Vec<(f64, f64)>,
 }
 
 impl Data {
-    fn new(n_run: usize, n_batch: usize) -> Self {
-        let prefill_latencies: Vec<Vec<f64>> =
-            (0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
+    fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
+        let prefill_latencies: Vec<Vec<f64>> = (0..batch_size.len())
+            .map(|_| Vec::with_capacity(n_run))
+            .collect();
         let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
 
         let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
         let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
         let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
 
-        let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
+        let prefill_batch_latency_throughput: Vec<(f64, f64)> =
+            Vec::with_capacity(batch_size.len());
         let decode_batch_latency_throughput: Vec<(f64, f64)> =
             prefill_batch_latency_throughput.clone();
 
         Self {
+            batch_size,
             prefill_latencies,
             prefill_throughputs,
             decode_latencies,
@@ -401,14 +405,14 @@ impl Data {
     }
 
     fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
-        let latency = prefill.latency.as_millis() as f64;
+        let latency = prefill.latency.as_micros() as f64 / 1000.0;
         self.prefill_latencies[batch_idx].push(latency);
         self.prefill_throughputs[batch_idx].push(prefill.throughput);
     }
 
     fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
-        let latency = decode.latency.as_millis() as f64;
-        let token_latency = decode.token_latency.as_millis() as f64;
+        let latency = decode.latency.as_micros() as f64 / 1000.0;
+        let token_latency = decode.token_latency.as_micros() as f64 / 1000.0;
         self.decode_latencies[batch_idx].push(latency);
         self.decode_token_latencies[batch_idx].push(token_latency);
         self.decode_throughputs[batch_idx].push(decode.throughput);
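The switch from `as_millis()` to `as_micros() / 1000.0` in `push_prefill` and `push_decode` matters because `as_millis()` truncates to whole milliseconds, so fast steps would collapse to 0 ms or 1 ms in the summary statistics. A small illustration using only the standard library:

    use std::time::Duration;

    fn main() {
        let latency = Duration::from_micros(2_450);
        // Old conversion: truncates to whole milliseconds.
        assert_eq!(latency.as_millis() as f64, 2.0);
        // New conversion: keeps sub-millisecond precision.
        assert_eq!(latency.as_micros() as f64 / 1000.0, 2.45);
    }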
benchmark/src/generation.rs
@@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
     decode_length: u32,
     n_runs: usize,
     warmups: usize,
+    parameters: NextTokenChooserParameters,
     client: ShardedClient,
     run_sender: mpsc::Sender<Result<Message, ClientError>>,
     mut shutdown_receiver: broadcast::Receiver<()>,
@@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
     // End task if a message is received on shutdown_receiver
     // _shutdown_guard_sender will be dropped once the task is finished
     tokio::select! {
-        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, client, run_sender.clone()) => {
+        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, parameters, client, run_sender.clone()) => {
            if let Err(err) = res {
                run_sender.send(Err(err)).await.unwrap_or(());
            }
@@ -65,6 +66,7 @@ async fn generate_runs(
     decode_length: u32,
     n_runs: usize,
     warmups: usize,
+    parameters: NextTokenChooserParameters,
     mut client: ShardedClient,
     run_sender: mpsc::Sender<Result<Message, ClientError>>,
 ) -> Result<(), ClientError> {
@@ -79,6 +81,7 @@ async fn generate_runs(
                 sequence_length,
                 b,
                 decode_length,
+                parameters.clone(),
                 &mut client,
             )
             .await?;
@@ -93,6 +96,7 @@ async fn generate_runs(
                 sequence_length,
                 b,
                 decode_length,
+                parameters.clone(),
                 &mut client,
             )
             .await?;
@@ -125,6 +129,7 @@ async fn prefill(
     sequence_length: u32,
     batch_size: u32,
     decode_length: u32,
+    parameters: NextTokenChooserParameters,
     client: &mut ShardedClient,
 ) -> Result<(Prefill, CachedBatch), ClientError> {
     // Create requests
@@ -133,16 +138,7 @@ async fn prefill(
             id: id.into(),
             inputs: sequence.clone(),
             truncate: sequence_length,
-            parameters: Some(NextTokenChooserParameters {
-                temperature: 1.0,
-                top_k: 0,
-                top_p: 1.0,
-                typical_p: 1.0,
-                do_sample: false,
-                seed: 0,
-                repetition_penalty: 1.0,
-                watermark: false,
-            }),
+            parameters: Some(parameters.clone()),
             stopping_parameters: Some(StoppingCriteriaParameters {
                 max_new_tokens: decode_length,
                 stop_sequences: vec![],
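With this change, generation no longer hard-codes greedy sampling inside `prefill`; a single `NextTokenChooserParameters` value built in `lib.rs` (below) is cloned into every request. The `unwrap_or` defaults there reproduce the previously hard-coded values, so a run without any sampling flags should benchmark the same configuration as before. A sketch of that Option-to-default mapping, using a hypothetical stand-in struct since `NextTokenChooserParameters` is generated from the client's gRPC proto:

    // Stand-in for the proto-generated NextTokenChooserParameters (illustrative only).
    #[derive(Debug, PartialEq)]
    struct SamplingParams {
        temperature: f32,
        top_k: u32,
        do_sample: bool,
    }

    fn build_params(temperature: Option<f32>, top_k: Option<u32>, do_sample: bool) -> SamplingParams {
        SamplingParams {
            // Unset CLI options fall back to the values prefill() used to hard-code.
            temperature: temperature.unwrap_or(1.0),
            top_k: top_k.unwrap_or(0),
            do_sample,
        }
    }

    fn main() {
        // No flags set: identical to the old hard-coded greedy configuration.
        let params = build_params(None, None, false);
        assert_eq!(
            params,
            SamplingParams { temperature: 1.0, top_k: 0, do_sample: false }
        );
    }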
benchmark/src/lib.rs
@@ -1,13 +1,14 @@
 mod app;
 mod event;
 mod generation;
+mod table;
 mod utils;
 
 use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::ShardedClient;
+use text_generation_client::{NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
@@ -23,8 +24,26 @@ pub async fn run(
     decode_length: u32,
     n_runs: usize,
     warmups: usize,
+    temperature: Option<f32>,
+    top_k: Option<u32>,
+    top_p: Option<f32>,
+    typical_p: Option<f32>,
+    repetition_penalty: Option<f32>,
+    watermark: bool,
+    do_sample: bool,
     client: ShardedClient,
 ) -> Result<(), crossterm::ErrorKind> {
+    let parameters = NextTokenChooserParameters {
+        temperature: temperature.unwrap_or(1.0),
+        top_k: top_k.unwrap_or(0),
+        top_p: top_p.unwrap_or(1.0),
+        typical_p: typical_p.unwrap_or(1.0),
+        do_sample,
+        seed: 0,
+        repetition_penalty: repetition_penalty.unwrap_or(1.0),
+        watermark,
+    };
+
     // Initialize terminal properties
     crossterm::terminal::enable_raw_mode()?;
     io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
@@ -53,6 +72,7 @@ pub async fn run(
         decode_length,
         n_runs,
         warmups,
+        parameters,
         client,
         run_sender,
         shutdown_sender.subscribe(),
@@ -73,7 +93,7 @@ pub async fn run(
     // Create App
     let mut app = App::new(
         run_receiver,
-        tokenizer_name,
+        tokenizer_name.clone(),
         sequence_length,
         decode_length,
         n_runs,
@@ -106,5 +126,27 @@ pub async fn run(
     crossterm::terminal::disable_raw_mode()?;
     io::stdout().execute(crossterm::cursor::Show)?;
 
+    let parameters_table = table::parameters_table(
+        tokenizer_name,
+        sequence_length,
+        decode_length,
+        n_runs,
+        warmups,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        repetition_penalty,
+        watermark,
+        do_sample,
+    );
+    println!("\n{parameters_table}\n");
+
+    let latency_table = table::latency_table(&app.data);
+    println!("\n{latency_table}\n");
+
+    let throughput_table = table::throughput_table(&app.data);
+    println!("\n{throughput_table}\n");
+
     Ok(())
 }
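On exit, `run` now restores the terminal and prints three markdown tables: the run parameters, latency percentiles per step and batch size, and throughput per step and batch size. The numbers below are invented; only the shape of the output is meaningful (`Merge::vertical()` blanks the repeated step label on consecutive rows):

    | Step    | Batch Size | Average  | Lowest   | Highest  | p50      | p90      | p99      |
    |---------|------------|----------|----------|----------|----------|----------|----------|
    | Prefill | 1          | 12.34 ms | 11.02 ms | 14.18 ms | 12.30 ms | 13.55 ms | 14.10 ms |
    |         | 2          | 13.01 ms | 11.87 ms | 15.02 ms | 12.95 ms | 14.21 ms | 14.87 ms |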
benchmark/src/main.rs
@@ -28,11 +28,27 @@ struct Args {
     runs: usize,
     #[clap(default_value = "1", short, long, env)]
     warmups: usize,
+    #[clap(long, env)]
+    temperature: Option<f32>,
+    #[clap(long, env)]
+    top_k: Option<u32>,
+    #[clap(long, env)]
+    top_p: Option<f32>,
+    #[clap(long, env)]
+    typical_p: Option<f32>,
+    #[clap(long, env)]
+    repetition_penalty: Option<f32>,
+    #[clap(long, env)]
+    watermark: bool,
+    #[clap(long, env)]
+    do_sample: bool,
     #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
     master_shard_uds_path: String,
 }
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
+    init_logging();
+
     // Get args
     let args = Args::parse();
     // Pattern match configuration
@@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         decode_length,
         runs,
         warmups,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        repetition_penalty,
+        watermark,
+        do_sample,
         master_shard_uds_path,
     } = args;
 
     let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
 
-    init_logging();
-
     // Tokenizer instance
     // This will only be used to validate payloads
     tracing::info!("Loading tokenizer");
@@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             decode_length,
             runs,
             warmups,
+            temperature,
+            top_k,
+            top_p,
+            typical_p,
+            repetition_penalty,
+            watermark,
+            do_sample,
             sharded_client,
         )
         .await
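Every new sampling option is declared `#[clap(long, env)]`, so each can be passed as a kebab-case flag (`--top-k`, `--do-sample`) or through the matching upper-case environment variable (`TOP_K`, `DO_SAMPLE`). A self-contained sketch of the same pattern, assuming a clap version with the `derive` and `env` features enabled:

    use clap::Parser;

    #[derive(Parser, Debug)]
    struct Args {
        // Settable as `--temperature 0.9` or via the TEMPERATURE env var.
        #[clap(long, env)]
        temperature: Option<f32>,
        // Bool flags default to false and are switched on by presence: `--do-sample`.
        #[clap(long, env)]
        do_sample: bool,
    }

    fn main() {
        let args = Args::parse();
        println!("{args:?}");
    }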
benchmark/src/table.rs (new file; the original commit spells the throughput helper "add_throuhgputs", corrected here)
@@ -0,0 +1,170 @@
+use crate::app::Data;
+use tabled::settings::Merge;
+use tabled::{builder::Builder, settings::Style, Table};
+
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn parameters_table(
+    tokenizer_name: String,
+    sequence_length: u32,
+    decode_length: u32,
+    n_runs: usize,
+    warmups: usize,
+    temperature: Option<f32>,
+    top_k: Option<u32>,
+    top_p: Option<f32>,
+    typical_p: Option<f32>,
+    repetition_penalty: Option<f32>,
+    watermark: bool,
+    do_sample: bool,
+) -> Table {
+    let mut builder = Builder::default();
+
+    builder.set_header(["Parameter", "Value"]);
+
+    builder.push_record(["Model", &tokenizer_name]);
+    builder.push_record(["Sequence Length", &sequence_length.to_string()]);
+    builder.push_record(["Decode Length", &decode_length.to_string()]);
+    builder.push_record(["N Runs", &n_runs.to_string()]);
+    builder.push_record(["Warmups", &warmups.to_string()]);
+    builder.push_record(["Temperature", &format!("{temperature:?}")]);
+    builder.push_record(["Top K", &format!("{top_k:?}")]);
+    builder.push_record(["Top P", &format!("{top_p:?}")]);
+    builder.push_record(["Typical P", &format!("{typical_p:?}")]);
+    builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
+    builder.push_record(["Watermark", &watermark.to_string()]);
+    builder.push_record(["Do Sample", &do_sample.to_string()]);
+
+    let mut table = builder.build();
+    table.with(Style::markdown());
+    table
+}
+
+pub(crate) fn latency_table(data: &Data) -> Table {
+    let mut builder = Builder::default();
+
+    builder.set_header([
+        "Step",
+        "Batch Size",
+        "Average",
+        "Lowest",
+        "Highest",
+        "p50",
+        "p90",
+        "p99",
+    ]);
+
+    add_latencies(
+        &mut builder,
+        "Prefill",
+        &data.batch_size,
+        &data.prefill_latencies,
+    );
+    add_latencies(
+        &mut builder,
+        "Decode (token)",
+        &data.batch_size,
+        &data.decode_token_latencies,
+    );
+    add_latencies(
+        &mut builder,
+        "Decode (total)",
+        &data.batch_size,
+        &data.decode_latencies,
+    );
+
+    let mut table = builder.build();
+    table.with(Style::markdown()).with(Merge::vertical());
+    table
+}
+
+pub(crate) fn throughput_table(data: &Data) -> Table {
+    let mut builder = Builder::default();
+
+    builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);
+
+    add_throughputs(
+        &mut builder,
+        "Prefill",
+        &data.batch_size,
+        &data.prefill_throughputs,
+    );
+    add_throughputs(
+        &mut builder,
+        "Decode",
+        &data.batch_size,
+        &data.decode_throughputs,
+    );
+
+    let mut table = builder.build();
+    table.with(Style::markdown()).with(Merge::vertical());
+    table
+}
+
+fn add_latencies(
+    builder: &mut Builder,
+    step: &'static str,
+    batch_size: &[u32],
+    batch_latencies: &[Vec<f64>],
+) {
+    for (i, b) in batch_size.iter().enumerate() {
+        let latencies = &batch_latencies[i];
+        let (avg, min, max) = avg_min_max(latencies);
+
+        let row = [
+            step,
+            &b.to_string(),
+            &format_value(avg, "ms"),
+            &format_value(min, "ms"),
+            &format_value(max, "ms"),
+            &format_value(px(latencies, 50), "ms"),
+            &format_value(px(latencies, 90), "ms"),
+            &format_value(px(latencies, 99), "ms"),
+        ];
+
+        builder.push_record(row);
+    }
+}
+
+fn add_throughputs(
+    builder: &mut Builder,
+    step: &'static str,
+    batch_size: &[u32],
+    batch_throughputs: &[Vec<f64>],
+) {
+    for (i, b) in batch_size.iter().enumerate() {
+        let throughputs = &batch_throughputs[i];
+        let (avg, min, max) = avg_min_max(throughputs);
+
+        let row = [
+            step,
+            &b.to_string(),
+            &format_value(avg, "tokens/secs"),
+            &format_value(min, "tokens/secs"),
+            &format_value(max, "tokens/secs"),
+        ];
+
+        builder.push_record(row);
+    }
+}
+
+fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
+    let average = data.iter().sum::<f64>() / data.len() as f64;
+    let min = data
+        .iter()
+        .min_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    let max = data
+        .iter()
+        .max_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    (average, *min, *max)
+}
+
+fn px(data: &Vec<f64>, p: u32) -> f64 {
+    let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
+    *data.get(i).unwrap_or(&std::f64::NAN)
+}
+
+fn format_value(value: f64, unit: &'static str) -> String {
+    format!("{:.2} {unit}", value)
+}
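One caveat worth noting for readers of this new module: `px` indexes directly at `p / 100 * len`, which yields a true percentile only if the slice is sorted, while the latency vectors are appended in run-completion order. A hedged sketch of a sort-first variant (an alternative, not what the commit ships):

    /// Percentile computed on a sorted copy of `data` (illustrative only).
    fn percentile_sorted(data: &[f64], p: u32) -> f64 {
        let mut sorted = data.to_vec();
        sorted.sort_by(|a, b| a.total_cmp(b));
        let i = (f64::from(p) / 100.0 * sorted.len() as f64) as usize;
        // Clamp so p = 100 cannot index one past the end.
        *sorted
            .get(i.min(sorted.len().saturating_sub(1)))
            .unwrap_or(&f64::NAN)
    }

    fn main() {
        let latencies = [3.0, 1.0, 2.0, 4.0];
        assert_eq!(percentile_sorted(&latencies, 50), 3.0); // index 2 of [1, 2, 3, 4]
    }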