feat(benchmarker): add summary tables (#368)
This commit is contained in:
parent
218c9adaa5
commit
951930fbff
|
@ -249,6 +249,12 @@ version = "3.12.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
|
checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bytecount"
|
||||||
|
version = "0.6.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "byteorder"
|
name = "byteorder"
|
||||||
version = "1.4.3"
|
version = "1.4.3"
|
||||||
|
@ -1706,6 +1712,17 @@ version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "papergrid"
|
||||||
|
version = "0.9.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1fdfe703c51ddc52887ad78fc69cd2ea78d895ffcd6e955c9d03566db8ab5bb1"
|
||||||
|
dependencies = [
|
||||||
|
"bytecount",
|
||||||
|
"fnv",
|
||||||
|
"unicode-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking_lot"
|
name = "parking_lot"
|
||||||
version = "0.12.1"
|
version = "0.12.1"
|
||||||
|
@ -2490,6 +2507,30 @@ dependencies = [
|
||||||
"winapi",
|
"winapi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tabled"
|
||||||
|
version = "0.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da1a2e56bbf7bfdd08aaa7592157a742205459eff774b73bc01809ae2d99dc2a"
|
||||||
|
dependencies = [
|
||||||
|
"papergrid",
|
||||||
|
"tabled_derive",
|
||||||
|
"unicode-width",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tabled_derive"
|
||||||
|
version = "0.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro-error",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 1.0.109",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tar"
|
name = "tar"
|
||||||
version = "0.4.38"
|
version = "0.4.38"
|
||||||
|
@ -2525,6 +2566,7 @@ dependencies = [
|
||||||
"ratatui",
|
"ratatui",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"tabled",
|
||||||
"text-generation-client",
|
"text-generation-client",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokenizers",
|
"tokenizers",
|
||||||
|
|
|
@ -20,6 +20,7 @@ crossterm = "0.26"
|
||||||
float-ord = "0.3.2"
|
float-ord = "0.3.2"
|
||||||
serde = {version = "1.0.142", features = ["derive"]}
|
serde = {version = "1.0.142", features = ["derive"]}
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
|
tabled = "0.12.0"
|
||||||
text-generation-client = { path = "../router/client" }
|
text-generation-client = { path = "../router/client" }
|
||||||
thiserror = "1.0.38"
|
thiserror = "1.0.38"
|
||||||
tokenizers = "0.13.3"
|
tokenizers = "0.13.3"
|
||||||
|
|
|
@ -15,6 +15,7 @@ use tui::{symbols, Frame};
|
||||||
/// TUI powered App
|
/// TUI powered App
|
||||||
pub(crate) struct App {
|
pub(crate) struct App {
|
||||||
pub(crate) running: bool,
|
pub(crate) running: bool,
|
||||||
|
pub(crate) data: Data,
|
||||||
completed_runs: Vec<usize>,
|
completed_runs: Vec<usize>,
|
||||||
completed_batch: usize,
|
completed_batch: usize,
|
||||||
current_batch: usize,
|
current_batch: usize,
|
||||||
|
@ -22,12 +23,10 @@ pub(crate) struct App {
|
||||||
touched_tab: bool,
|
touched_tab: bool,
|
||||||
zoom: bool,
|
zoom: bool,
|
||||||
is_error: bool,
|
is_error: bool,
|
||||||
data: Data,
|
|
||||||
tokenizer_name: String,
|
tokenizer_name: String,
|
||||||
sequence_length: u32,
|
sequence_length: u32,
|
||||||
decode_length: u32,
|
decode_length: u32,
|
||||||
n_run: usize,
|
n_run: usize,
|
||||||
batch_size: Vec<u32>,
|
|
||||||
receiver: mpsc::Receiver<Result<Message, ClientError>>,
|
receiver: mpsc::Receiver<Result<Message, ClientError>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -40,7 +39,6 @@ impl App {
|
||||||
n_run: usize,
|
n_run: usize,
|
||||||
batch_size: Vec<u32>,
|
batch_size: Vec<u32>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let data = Data::new(n_run, batch_size.len());
|
|
||||||
let current_tab = 0;
|
let current_tab = 0;
|
||||||
|
|
||||||
let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
|
let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
|
||||||
|
@ -48,8 +46,11 @@ impl App {
|
||||||
let current_batch = 0;
|
let current_batch = 0;
|
||||||
let is_error = false;
|
let is_error = false;
|
||||||
|
|
||||||
|
let data = Data::new(n_run, batch_size);
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
running: true,
|
running: true,
|
||||||
|
data,
|
||||||
completed_runs,
|
completed_runs,
|
||||||
completed_batch,
|
completed_batch,
|
||||||
current_batch,
|
current_batch,
|
||||||
|
@ -57,12 +58,10 @@ impl App {
|
||||||
touched_tab: false,
|
touched_tab: false,
|
||||||
zoom: false,
|
zoom: false,
|
||||||
is_error,
|
is_error,
|
||||||
data,
|
|
||||||
tokenizer_name,
|
tokenizer_name,
|
||||||
sequence_length,
|
sequence_length,
|
||||||
decode_length,
|
decode_length,
|
||||||
n_run,
|
n_run,
|
||||||
batch_size,
|
|
||||||
receiver,
|
receiver,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -79,7 +78,7 @@ impl App {
|
||||||
code: KeyCode::Tab, ..
|
code: KeyCode::Tab, ..
|
||||||
} => {
|
} => {
|
||||||
self.touched_tab = true;
|
self.touched_tab = true;
|
||||||
self.current_tab = (self.current_tab + 1) % self.batch_size.len();
|
self.current_tab = (self.current_tab + 1) % self.data.batch_size.len();
|
||||||
}
|
}
|
||||||
// Decrease and wrap tab
|
// Decrease and wrap tab
|
||||||
KeyEvent {
|
KeyEvent {
|
||||||
|
@ -90,7 +89,7 @@ impl App {
|
||||||
if self.current_tab > 0 {
|
if self.current_tab > 0 {
|
||||||
self.current_tab -= 1;
|
self.current_tab -= 1;
|
||||||
} else {
|
} else {
|
||||||
self.current_tab = self.batch_size.len() - 1;
|
self.current_tab = self.data.batch_size.len() - 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Zoom on throughput/latency fig
|
// Zoom on throughput/latency fig
|
||||||
|
@ -137,7 +136,7 @@ impl App {
|
||||||
self.data.end_batch(self.current_batch);
|
self.data.end_batch(self.current_batch);
|
||||||
self.completed_batch += 1;
|
self.completed_batch += 1;
|
||||||
|
|
||||||
if self.current_batch < self.batch_size.len() - 1 {
|
if self.current_batch < self.data.batch_size.len() - 1 {
|
||||||
// Only go to next tab if the user never touched the tab keys
|
// Only go to next tab if the user never touched the tab keys
|
||||||
if !self.touched_tab {
|
if !self.touched_tab {
|
||||||
self.current_tab += 1;
|
self.current_tab += 1;
|
||||||
|
@ -156,7 +155,7 @@ impl App {
|
||||||
/// Render frame
|
/// Render frame
|
||||||
pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
|
pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
|
||||||
let batch_progress =
|
let batch_progress =
|
||||||
(self.completed_batch as f64 / self.batch_size.len() as f64).clamp(0.0, 1.0);
|
(self.completed_batch as f64 / self.data.batch_size.len() as f64).clamp(0.0, 1.0);
|
||||||
let run_progress =
|
let run_progress =
|
||||||
(self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
|
(self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
|
||||||
|
|
||||||
|
@ -241,6 +240,7 @@ impl App {
|
||||||
|
|
||||||
// Batch tabs
|
// Batch tabs
|
||||||
let titles = self
|
let titles = self
|
||||||
|
.data
|
||||||
.batch_size
|
.batch_size
|
||||||
.iter()
|
.iter()
|
||||||
.map(|b| {
|
.map(|b| {
|
||||||
|
@ -269,7 +269,7 @@ impl App {
|
||||||
};
|
};
|
||||||
let batch_gauge = progress_gauge(
|
let batch_gauge = progress_gauge(
|
||||||
"Total Progress",
|
"Total Progress",
|
||||||
format!("{} / {}", self.completed_batch, self.batch_size.len()),
|
format!("{} / {}", self.completed_batch, self.data.batch_size.len()),
|
||||||
batch_progress,
|
batch_progress,
|
||||||
color,
|
color,
|
||||||
);
|
);
|
||||||
|
@ -347,7 +347,7 @@ impl App {
|
||||||
// Prefill latency/throughput chart
|
// Prefill latency/throughput chart
|
||||||
let prefill_latency_throughput_chart = latency_throughput_chart(
|
let prefill_latency_throughput_chart = latency_throughput_chart(
|
||||||
&self.data.prefill_batch_latency_throughput,
|
&self.data.prefill_batch_latency_throughput,
|
||||||
&self.batch_size,
|
&self.data.batch_size,
|
||||||
self.zoom,
|
self.zoom,
|
||||||
"Prefill",
|
"Prefill",
|
||||||
);
|
);
|
||||||
|
@ -356,7 +356,7 @@ impl App {
|
||||||
// Decode latency/throughput chart
|
// Decode latency/throughput chart
|
||||||
let decode_latency_throughput_chart = latency_throughput_chart(
|
let decode_latency_throughput_chart = latency_throughput_chart(
|
||||||
&self.data.decode_batch_latency_throughput,
|
&self.data.decode_batch_latency_throughput,
|
||||||
&self.batch_size,
|
&self.data.batch_size,
|
||||||
self.zoom,
|
self.zoom,
|
||||||
"Decode",
|
"Decode",
|
||||||
);
|
);
|
||||||
|
@ -365,31 +365,35 @@ impl App {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// App internal data struct
///
/// Holds the raw measurements collected while the benchmark runs. Every
/// per-batch vector is indexed in the same order as `batch_size`.
pub(crate) struct Data {
    /// Batch sizes being benchmarked, one entry per batch.
    pub(crate) batch_size: Vec<u32>,
    /// Prefill latencies in milliseconds, one inner vector per batch.
    pub(crate) prefill_latencies: Vec<Vec<f64>>,
    /// Prefill throughputs, one inner vector per batch.
    pub(crate) prefill_throughputs: Vec<Vec<f64>>,
    /// Total decode latencies in milliseconds, one inner vector per batch.
    pub(crate) decode_latencies: Vec<Vec<f64>>,
    /// Per-token decode latencies in milliseconds, one inner vector per batch.
    pub(crate) decode_token_latencies: Vec<Vec<f64>>,
    /// Decode throughputs, one inner vector per batch.
    pub(crate) decode_throughputs: Vec<Vec<f64>>,
    /// Points fed to the prefill latency/throughput chart.
    /// NOTE(review): presumably one `(latency, throughput)` pair per completed
    /// batch -- confirm against `end_batch`.
    pub(crate) prefill_batch_latency_throughput: Vec<(f64, f64)>,
    /// Points fed to the decode latency/throughput chart (same layout as the
    /// prefill series -- confirm against `end_batch`).
    pub(crate) decode_batch_latency_throughput: Vec<(f64, f64)>,
}
|
||||||
|
|
||||||
impl Data {
|
impl Data {
|
||||||
fn new(n_run: usize, n_batch: usize) -> Self {
|
fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
|
||||||
let prefill_latencies: Vec<Vec<f64>> =
|
let prefill_latencies: Vec<Vec<f64>> = (0..batch_size.len())
|
||||||
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
|
.map(|_| Vec::with_capacity(n_run))
|
||||||
|
.collect();
|
||||||
let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
|
let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
|
||||||
|
|
||||||
let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
|
let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
|
||||||
let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
|
let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
|
||||||
let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
|
let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
|
||||||
|
|
||||||
let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
|
let prefill_batch_latency_throughput: Vec<(f64, f64)> =
|
||||||
|
Vec::with_capacity(batch_size.len());
|
||||||
let decode_batch_latency_throughput: Vec<(f64, f64)> =
|
let decode_batch_latency_throughput: Vec<(f64, f64)> =
|
||||||
prefill_batch_latency_throughput.clone();
|
prefill_batch_latency_throughput.clone();
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
|
batch_size,
|
||||||
prefill_latencies,
|
prefill_latencies,
|
||||||
prefill_throughputs,
|
prefill_throughputs,
|
||||||
decode_latencies,
|
decode_latencies,
|
||||||
|
@ -401,14 +405,14 @@ impl Data {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Record one prefill measurement for the batch at `batch_idx`.
///
/// The latency is stored in milliseconds as a float. It is derived from
/// microseconds rather than `as_millis()` so that sub-millisecond precision
/// is not truncated away.
///
/// Panics if `batch_idx` is out of range for the per-batch vectors.
fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
    let latency_ms = prefill.latency.as_micros() as f64 / 1000.0;
    self.prefill_latencies[batch_idx].push(latency_ms);
    self.prefill_throughputs[batch_idx].push(prefill.throughput);
}
|
||||||
|
|
||||||
fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
|
fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
|
||||||
let latency = decode.latency.as_millis() as f64;
|
let latency = decode.latency.as_micros() as f64 / 1000.0;
|
||||||
let token_latency = decode.token_latency.as_millis() as f64;
|
let token_latency = decode.token_latency.as_micros() as f64 / 1000.0;
|
||||||
self.decode_latencies[batch_idx].push(latency);
|
self.decode_latencies[batch_idx].push(latency);
|
||||||
self.decode_token_latencies[batch_idx].push(token_latency);
|
self.decode_token_latencies[batch_idx].push(token_latency);
|
||||||
self.decode_throughputs[batch_idx].push(decode.throughput);
|
self.decode_throughputs[batch_idx].push(decode.throughput);
|
||||||
|
|
|
@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
|
||||||
decode_length: u32,
|
decode_length: u32,
|
||||||
n_runs: usize,
|
n_runs: usize,
|
||||||
warmups: usize,
|
warmups: usize,
|
||||||
|
parameters: NextTokenChooserParameters,
|
||||||
client: ShardedClient,
|
client: ShardedClient,
|
||||||
run_sender: mpsc::Sender<Result<Message, ClientError>>,
|
run_sender: mpsc::Sender<Result<Message, ClientError>>,
|
||||||
mut shutdown_receiver: broadcast::Receiver<()>,
|
mut shutdown_receiver: broadcast::Receiver<()>,
|
||||||
|
@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
|
||||||
// End task if a message is received on shutdown_receiver
|
// End task if a message is received on shutdown_receiver
|
||||||
// _shutdown_guard_sender will be dropped once the task is finished
|
// _shutdown_guard_sender will be dropped once the task is finished
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, client, run_sender.clone()) => {
|
res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, parameters, client, run_sender.clone()) => {
|
||||||
if let Err(err) = res {
|
if let Err(err) = res {
|
||||||
run_sender.send(Err(err)).await.unwrap_or(());
|
run_sender.send(Err(err)).await.unwrap_or(());
|
||||||
}
|
}
|
||||||
|
@ -65,6 +66,7 @@ async fn generate_runs(
|
||||||
decode_length: u32,
|
decode_length: u32,
|
||||||
n_runs: usize,
|
n_runs: usize,
|
||||||
warmups: usize,
|
warmups: usize,
|
||||||
|
parameters: NextTokenChooserParameters,
|
||||||
mut client: ShardedClient,
|
mut client: ShardedClient,
|
||||||
run_sender: mpsc::Sender<Result<Message, ClientError>>,
|
run_sender: mpsc::Sender<Result<Message, ClientError>>,
|
||||||
) -> Result<(), ClientError> {
|
) -> Result<(), ClientError> {
|
||||||
|
@ -79,6 +81,7 @@ async fn generate_runs(
|
||||||
sequence_length,
|
sequence_length,
|
||||||
b,
|
b,
|
||||||
decode_length,
|
decode_length,
|
||||||
|
parameters.clone(),
|
||||||
&mut client,
|
&mut client,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -93,6 +96,7 @@ async fn generate_runs(
|
||||||
sequence_length,
|
sequence_length,
|
||||||
b,
|
b,
|
||||||
decode_length,
|
decode_length,
|
||||||
|
parameters.clone(),
|
||||||
&mut client,
|
&mut client,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
@ -125,6 +129,7 @@ async fn prefill(
|
||||||
sequence_length: u32,
|
sequence_length: u32,
|
||||||
batch_size: u32,
|
batch_size: u32,
|
||||||
decode_length: u32,
|
decode_length: u32,
|
||||||
|
parameters: NextTokenChooserParameters,
|
||||||
client: &mut ShardedClient,
|
client: &mut ShardedClient,
|
||||||
) -> Result<(Prefill, CachedBatch), ClientError> {
|
) -> Result<(Prefill, CachedBatch), ClientError> {
|
||||||
// Create requests
|
// Create requests
|
||||||
|
@ -133,16 +138,7 @@ async fn prefill(
|
||||||
id: id.into(),
|
id: id.into(),
|
||||||
inputs: sequence.clone(),
|
inputs: sequence.clone(),
|
||||||
truncate: sequence_length,
|
truncate: sequence_length,
|
||||||
parameters: Some(NextTokenChooserParameters {
|
parameters: Some(parameters.clone()),
|
||||||
temperature: 1.0,
|
|
||||||
top_k: 0,
|
|
||||||
top_p: 1.0,
|
|
||||||
typical_p: 1.0,
|
|
||||||
do_sample: false,
|
|
||||||
seed: 0,
|
|
||||||
repetition_penalty: 1.0,
|
|
||||||
watermark: false,
|
|
||||||
}),
|
|
||||||
stopping_parameters: Some(StoppingCriteriaParameters {
|
stopping_parameters: Some(StoppingCriteriaParameters {
|
||||||
max_new_tokens: decode_length,
|
max_new_tokens: decode_length,
|
||||||
stop_sequences: vec![],
|
stop_sequences: vec![],
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
mod app;
|
mod app;
|
||||||
mod event;
|
mod event;
|
||||||
mod generation;
|
mod generation;
|
||||||
|
mod table;
|
||||||
mod utils;
|
mod utils;
|
||||||
|
|
||||||
use crate::app::App;
|
use crate::app::App;
|
||||||
use crate::event::Event;
|
use crate::event::Event;
|
||||||
use crossterm::ExecutableCommand;
|
use crossterm::ExecutableCommand;
|
||||||
use std::io;
|
use std::io;
|
||||||
use text_generation_client::ShardedClient;
|
use text_generation_client::{NextTokenChooserParameters, ShardedClient};
|
||||||
use tokenizers::Tokenizer;
|
use tokenizers::Tokenizer;
|
||||||
use tokio::sync::{broadcast, mpsc};
|
use tokio::sync::{broadcast, mpsc};
|
||||||
use tui::backend::CrosstermBackend;
|
use tui::backend::CrosstermBackend;
|
||||||
|
@ -23,8 +24,26 @@ pub async fn run(
|
||||||
decode_length: u32,
|
decode_length: u32,
|
||||||
n_runs: usize,
|
n_runs: usize,
|
||||||
warmups: usize,
|
warmups: usize,
|
||||||
|
temperature: Option<f32>,
|
||||||
|
top_k: Option<u32>,
|
||||||
|
top_p: Option<f32>,
|
||||||
|
typical_p: Option<f32>,
|
||||||
|
repetition_penalty: Option<f32>,
|
||||||
|
watermark: bool,
|
||||||
|
do_sample: bool,
|
||||||
client: ShardedClient,
|
client: ShardedClient,
|
||||||
) -> Result<(), crossterm::ErrorKind> {
|
) -> Result<(), crossterm::ErrorKind> {
|
||||||
|
let parameters = NextTokenChooserParameters {
|
||||||
|
temperature: temperature.unwrap_or(1.0),
|
||||||
|
top_k: top_k.unwrap_or(0),
|
||||||
|
top_p: top_p.unwrap_or(1.0),
|
||||||
|
typical_p: typical_p.unwrap_or(1.0),
|
||||||
|
do_sample,
|
||||||
|
seed: 0,
|
||||||
|
repetition_penalty: repetition_penalty.unwrap_or(1.0),
|
||||||
|
watermark,
|
||||||
|
};
|
||||||
|
|
||||||
// Initialize terminal properties
|
// Initialize terminal properties
|
||||||
crossterm::terminal::enable_raw_mode()?;
|
crossterm::terminal::enable_raw_mode()?;
|
||||||
io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
|
io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
|
||||||
|
@ -53,6 +72,7 @@ pub async fn run(
|
||||||
decode_length,
|
decode_length,
|
||||||
n_runs,
|
n_runs,
|
||||||
warmups,
|
warmups,
|
||||||
|
parameters,
|
||||||
client,
|
client,
|
||||||
run_sender,
|
run_sender,
|
||||||
shutdown_sender.subscribe(),
|
shutdown_sender.subscribe(),
|
||||||
|
@ -73,7 +93,7 @@ pub async fn run(
|
||||||
// Create App
|
// Create App
|
||||||
let mut app = App::new(
|
let mut app = App::new(
|
||||||
run_receiver,
|
run_receiver,
|
||||||
tokenizer_name,
|
tokenizer_name.clone(),
|
||||||
sequence_length,
|
sequence_length,
|
||||||
decode_length,
|
decode_length,
|
||||||
n_runs,
|
n_runs,
|
||||||
|
@ -106,5 +126,27 @@ pub async fn run(
|
||||||
crossterm::terminal::disable_raw_mode()?;
|
crossterm::terminal::disable_raw_mode()?;
|
||||||
io::stdout().execute(crossterm::cursor::Show)?;
|
io::stdout().execute(crossterm::cursor::Show)?;
|
||||||
|
|
||||||
|
let parameters_table = table::parameters_table(
|
||||||
|
tokenizer_name,
|
||||||
|
sequence_length,
|
||||||
|
decode_length,
|
||||||
|
n_runs,
|
||||||
|
warmups,
|
||||||
|
temperature,
|
||||||
|
top_k,
|
||||||
|
top_p,
|
||||||
|
typical_p,
|
||||||
|
repetition_penalty,
|
||||||
|
watermark,
|
||||||
|
do_sample,
|
||||||
|
);
|
||||||
|
println!("\n{parameters_table}\n");
|
||||||
|
|
||||||
|
let latency_table = table::latency_table(&app.data);
|
||||||
|
println!("\n{latency_table}\n");
|
||||||
|
|
||||||
|
let throughput_table = table::throughput_table(&app.data);
|
||||||
|
println!("\n{throughput_table}\n");
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,11 +28,27 @@ struct Args {
|
||||||
runs: usize,
|
runs: usize,
|
||||||
#[clap(default_value = "1", short, long, env)]
|
#[clap(default_value = "1", short, long, env)]
|
||||||
warmups: usize,
|
warmups: usize,
|
||||||
|
#[clap(long, env)]
|
||||||
|
temperature: Option<f32>,
|
||||||
|
#[clap(long, env)]
|
||||||
|
top_k: Option<u32>,
|
||||||
|
#[clap(long, env)]
|
||||||
|
top_p: Option<f32>,
|
||||||
|
#[clap(long, env)]
|
||||||
|
typical_p: Option<f32>,
|
||||||
|
#[clap(long, env)]
|
||||||
|
repetition_penalty: Option<f32>,
|
||||||
|
#[clap(long, env)]
|
||||||
|
watermark: bool,
|
||||||
|
#[clap(long, env)]
|
||||||
|
do_sample: bool,
|
||||||
#[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
|
#[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
|
||||||
master_shard_uds_path: String,
|
master_shard_uds_path: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
init_logging();
|
||||||
|
|
||||||
// Get args
|
// Get args
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
// Pattern match configuration
|
// Pattern match configuration
|
||||||
|
@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
decode_length,
|
decode_length,
|
||||||
runs,
|
runs,
|
||||||
warmups,
|
warmups,
|
||||||
|
temperature,
|
||||||
|
top_k,
|
||||||
|
top_p,
|
||||||
|
typical_p,
|
||||||
|
repetition_penalty,
|
||||||
|
watermark,
|
||||||
|
do_sample,
|
||||||
master_shard_uds_path,
|
master_shard_uds_path,
|
||||||
} = args;
|
} = args;
|
||||||
|
|
||||||
let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
|
let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
|
||||||
|
|
||||||
init_logging();
|
|
||||||
|
|
||||||
// Tokenizer instance
|
// Tokenizer instance
|
||||||
// This will only be used to validate payloads
|
// This will only be used to validate payloads
|
||||||
tracing::info!("Loading tokenizer");
|
tracing::info!("Loading tokenizer");
|
||||||
|
@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
decode_length,
|
decode_length,
|
||||||
runs,
|
runs,
|
||||||
warmups,
|
warmups,
|
||||||
|
temperature,
|
||||||
|
top_k,
|
||||||
|
top_p,
|
||||||
|
typical_p,
|
||||||
|
repetition_penalty,
|
||||||
|
watermark,
|
||||||
|
do_sample,
|
||||||
sharded_client,
|
sharded_client,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|
|
@ -0,0 +1,170 @@
|
||||||
|
use crate::app::Data;
|
||||||
|
use tabled::settings::Merge;
|
||||||
|
use tabled::{builder::Builder, settings::Style, Table};
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub(crate) fn parameters_table(
|
||||||
|
tokenizer_name: String,
|
||||||
|
sequence_length: u32,
|
||||||
|
decode_length: u32,
|
||||||
|
n_runs: usize,
|
||||||
|
warmups: usize,
|
||||||
|
temperature: Option<f32>,
|
||||||
|
top_k: Option<u32>,
|
||||||
|
top_p: Option<f32>,
|
||||||
|
typical_p: Option<f32>,
|
||||||
|
repetition_penalty: Option<f32>,
|
||||||
|
watermark: bool,
|
||||||
|
do_sample: bool,
|
||||||
|
) -> Table {
|
||||||
|
let mut builder = Builder::default();
|
||||||
|
|
||||||
|
builder.set_header(["Parameter", "Value"]);
|
||||||
|
|
||||||
|
builder.push_record(["Model", &tokenizer_name]);
|
||||||
|
builder.push_record(["Sequence Length", &sequence_length.to_string()]);
|
||||||
|
builder.push_record(["Decode Length", &decode_length.to_string()]);
|
||||||
|
builder.push_record(["N Runs", &n_runs.to_string()]);
|
||||||
|
builder.push_record(["Warmups", &warmups.to_string()]);
|
||||||
|
builder.push_record(["Temperature", &format!("{temperature:?}")]);
|
||||||
|
builder.push_record(["Top K", &format!("{top_k:?}")]);
|
||||||
|
builder.push_record(["Top P", &format!("{top_p:?}")]);
|
||||||
|
builder.push_record(["Typical P", &format!("{typical_p:?}")]);
|
||||||
|
builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
|
||||||
|
builder.push_record(["Watermark", &watermark.to_string()]);
|
||||||
|
builder.push_record(["Do Sample", &do_sample.to_string()]);
|
||||||
|
|
||||||
|
let mut table = builder.build();
|
||||||
|
table.with(Style::markdown());
|
||||||
|
table
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn latency_table(data: &Data) -> Table {
|
||||||
|
let mut builder = Builder::default();
|
||||||
|
|
||||||
|
builder.set_header([
|
||||||
|
"Step",
|
||||||
|
"Batch Size",
|
||||||
|
"Average",
|
||||||
|
"Lowest",
|
||||||
|
"Highest",
|
||||||
|
"p50",
|
||||||
|
"p90",
|
||||||
|
"p99",
|
||||||
|
]);
|
||||||
|
|
||||||
|
add_latencies(
|
||||||
|
&mut builder,
|
||||||
|
"Prefill",
|
||||||
|
&data.batch_size,
|
||||||
|
&data.prefill_latencies,
|
||||||
|
);
|
||||||
|
add_latencies(
|
||||||
|
&mut builder,
|
||||||
|
"Decode (token)",
|
||||||
|
&data.batch_size,
|
||||||
|
&data.decode_token_latencies,
|
||||||
|
);
|
||||||
|
add_latencies(
|
||||||
|
&mut builder,
|
||||||
|
"Decode (total)",
|
||||||
|
&data.batch_size,
|
||||||
|
&data.decode_latencies,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut table = builder.build();
|
||||||
|
table.with(Style::markdown()).with(Merge::vertical());
|
||||||
|
table
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn throughput_table(data: &Data) -> Table {
|
||||||
|
let mut builder = Builder::default();
|
||||||
|
|
||||||
|
builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);
|
||||||
|
|
||||||
|
add_throuhgputs(
|
||||||
|
&mut builder,
|
||||||
|
"Prefill",
|
||||||
|
&data.batch_size,
|
||||||
|
&data.prefill_throughputs,
|
||||||
|
);
|
||||||
|
add_throuhgputs(
|
||||||
|
&mut builder,
|
||||||
|
"Decode",
|
||||||
|
&data.batch_size,
|
||||||
|
&data.decode_throughputs,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut table = builder.build();
|
||||||
|
table.with(Style::markdown()).with(Merge::vertical());
|
||||||
|
table
|
||||||
|
}
|
||||||
|
|
||||||
|
fn add_latencies(
|
||||||
|
builder: &mut Builder,
|
||||||
|
step: &'static str,
|
||||||
|
batch_size: &[u32],
|
||||||
|
batch_latencies: &[Vec<f64>],
|
||||||
|
) {
|
||||||
|
for (i, b) in batch_size.iter().enumerate() {
|
||||||
|
let latencies = &batch_latencies[i];
|
||||||
|
let (avg, min, max) = avg_min_max(latencies);
|
||||||
|
|
||||||
|
let row = [
|
||||||
|
step,
|
||||||
|
&b.to_string(),
|
||||||
|
&format_value(avg, "ms"),
|
||||||
|
&format_value(min, "ms"),
|
||||||
|
&format_value(max, "ms"),
|
||||||
|
&format_value(px(latencies, 50), "ms"),
|
||||||
|
&format_value(px(latencies, 90), "ms"),
|
||||||
|
&format_value(px(latencies, 99), "ms"),
|
||||||
|
];
|
||||||
|
|
||||||
|
builder.push_record(row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn add_throuhgputs(
|
||||||
|
builder: &mut Builder,
|
||||||
|
step: &'static str,
|
||||||
|
batch_size: &[u32],
|
||||||
|
batch_throughputs: &[Vec<f64>],
|
||||||
|
) {
|
||||||
|
for (i, b) in batch_size.iter().enumerate() {
|
||||||
|
let throughputs = &batch_throughputs[i];
|
||||||
|
let (avg, min, max) = avg_min_max(throughputs);
|
||||||
|
|
||||||
|
let row = [
|
||||||
|
step,
|
||||||
|
&b.to_string(),
|
||||||
|
&format_value(avg, "tokens/secs"),
|
||||||
|
&format_value(min, "tokens/secs"),
|
||||||
|
&format_value(max, "tokens/secs"),
|
||||||
|
];
|
||||||
|
|
||||||
|
builder.push_record(row);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return `(average, minimum, maximum)` of `data`.
///
/// Ordering uses `f64::total_cmp`. For an empty slice all three values are
/// `NaN` (the average is `0.0 / 0.0`, and there is no minimum or maximum).
fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
    let average = data.iter().sum::<f64>() / data.len() as f64;
    let min = data
        .iter()
        .copied()
        .min_by(|a, b| a.total_cmp(b))
        .unwrap_or(f64::NAN);
    let max = data
        .iter()
        .copied()
        .max_by(|a, b| a.total_cmp(b))
        .unwrap_or(f64::NAN);
    (average, min, max)
}
|
||||||
|
|
||||||
|
/// Return the `p`-th percentile of `data`, or `NaN` when `data` is empty.
///
/// The samples are copied and sorted before the lookup: percentiles are only
/// meaningful on ordered data, and callers pass measurements in arrival
/// order. The rank is `floor(p / 100 * len)`, clamped to the last element so
/// that `p >= 100` yields the maximum.
fn px(data: &[f64], p: u32) -> f64 {
    if data.is_empty() {
        return f64::NAN;
    }

    let mut sorted = data.to_vec();
    sorted.sort_unstable_by(|a, b| a.total_cmp(b));

    let rank = (f64::from(p) / 100.0 * sorted.len() as f64) as usize;
    sorted[rank.min(sorted.len() - 1)]
}
|
||||||
|
|
||||||
|
/// Render `value` with two decimal places followed by a space and `unit`.
fn format_value(value: f64, unit: &'static str) -> String {
    let mut rendered = format!("{value:.2}");
    rendered.push(' ');
    rendered.push_str(unit);
    rendered
}
|
Loading…
Reference in New Issue