feat(benchmarker): add summary tables (#368)

This commit is contained in:
OlivierDehaene 2023-05-25 13:38:36 +02:00 committed by GitHub
parent 218c9adaa5
commit 951930fbff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 325 additions and 42 deletions

42
Cargo.lock generated
View File

@ -249,6 +249,12 @@ version = "3.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
[[package]]
name = "bytecount"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]]
name = "byteorder"
version = "1.4.3"
@ -1706,6 +1712,17 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "papergrid"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fdfe703c51ddc52887ad78fc69cd2ea78d895ffcd6e955c9d03566db8ab5bb1"
dependencies = [
"bytecount",
"fnv",
"unicode-width",
]
[[package]]
name = "parking_lot"
version = "0.12.1"
@ -2490,6 +2507,30 @@ dependencies = [
"winapi",
]
[[package]]
name = "tabled"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da1a2e56bbf7bfdd08aaa7592157a742205459eff774b73bc01809ae2d99dc2a"
dependencies = [
"papergrid",
"tabled_derive",
"unicode-width",
]
[[package]]
name = "tabled_derive"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "tar"
version = "0.4.38"
@ -2525,6 +2566,7 @@ dependencies = [
"ratatui",
"serde",
"serde_json",
"tabled",
"text-generation-client",
"thiserror",
"tokenizers",

View File

@ -20,6 +20,7 @@ crossterm = "0.26"
float-ord = "0.3.2"
serde = {version = "1.0.142", features = ["derive"]}
serde_json = "1.0"
tabled = "0.12.0"
text-generation-client = { path = "../router/client" }
thiserror = "1.0.38"
tokenizers = "0.13.3"

View File

@ -15,6 +15,7 @@ use tui::{symbols, Frame};
/// TUI powered App
pub(crate) struct App {
pub(crate) running: bool,
pub(crate) data: Data,
completed_runs: Vec<usize>,
completed_batch: usize,
current_batch: usize,
@ -22,12 +23,10 @@ pub(crate) struct App {
touched_tab: bool,
zoom: bool,
is_error: bool,
data: Data,
tokenizer_name: String,
sequence_length: u32,
decode_length: u32,
n_run: usize,
batch_size: Vec<u32>,
receiver: mpsc::Receiver<Result<Message, ClientError>>,
}
@ -40,7 +39,6 @@ impl App {
n_run: usize,
batch_size: Vec<u32>,
) -> Self {
let data = Data::new(n_run, batch_size.len());
let current_tab = 0;
let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
@ -48,8 +46,11 @@ impl App {
let current_batch = 0;
let is_error = false;
let data = Data::new(n_run, batch_size);
Self {
running: true,
data,
completed_runs,
completed_batch,
current_batch,
@ -57,12 +58,10 @@ impl App {
touched_tab: false,
zoom: false,
is_error,
data,
tokenizer_name,
sequence_length,
decode_length,
n_run,
batch_size,
receiver,
}
}
@ -79,7 +78,7 @@ impl App {
code: KeyCode::Tab, ..
} => {
self.touched_tab = true;
self.current_tab = (self.current_tab + 1) % self.batch_size.len();
self.current_tab = (self.current_tab + 1) % self.data.batch_size.len();
}
// Decrease and wrap tab
KeyEvent {
@ -90,7 +89,7 @@ impl App {
if self.current_tab > 0 {
self.current_tab -= 1;
} else {
self.current_tab = self.batch_size.len() - 1;
self.current_tab = self.data.batch_size.len() - 1;
}
}
// Zoom on throughput/latency fig
@ -137,7 +136,7 @@ impl App {
self.data.end_batch(self.current_batch);
self.completed_batch += 1;
if self.current_batch < self.batch_size.len() - 1 {
if self.current_batch < self.data.batch_size.len() - 1 {
// Only go to next tab if the user never touched the tab keys
if !self.touched_tab {
self.current_tab += 1;
@ -156,7 +155,7 @@ impl App {
/// Render frame
pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
let batch_progress =
(self.completed_batch as f64 / self.batch_size.len() as f64).clamp(0.0, 1.0);
(self.completed_batch as f64 / self.data.batch_size.len() as f64).clamp(0.0, 1.0);
let run_progress =
(self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
@ -241,6 +240,7 @@ impl App {
// Batch tabs
let titles = self
.data
.batch_size
.iter()
.map(|b| {
@ -269,7 +269,7 @@ impl App {
};
let batch_gauge = progress_gauge(
"Total Progress",
format!("{} / {}", self.completed_batch, self.batch_size.len()),
format!("{} / {}", self.completed_batch, self.data.batch_size.len()),
batch_progress,
color,
);
@ -347,7 +347,7 @@ impl App {
// Prefill latency/throughput chart
let prefill_latency_throughput_chart = latency_throughput_chart(
&self.data.prefill_batch_latency_throughput,
&self.batch_size,
&self.data.batch_size,
self.zoom,
"Prefill",
);
@ -356,7 +356,7 @@ impl App {
// Decode latency/throughput chart
let decode_latency_throughput_chart = latency_throughput_chart(
&self.data.decode_batch_latency_throughput,
&self.batch_size,
&self.data.batch_size,
self.zoom,
"Decode",
);
@ -365,31 +365,35 @@ impl App {
}
/// App internal data struct
struct Data {
prefill_latencies: Vec<Vec<f64>>,
prefill_throughputs: Vec<Vec<f64>>,
decode_latencies: Vec<Vec<f64>>,
decode_token_latencies: Vec<Vec<f64>>,
decode_throughputs: Vec<Vec<f64>>,
prefill_batch_latency_throughput: Vec<(f64, f64)>,
decode_batch_latency_throughput: Vec<(f64, f64)>,
pub(crate) struct Data {
pub(crate) batch_size: Vec<u32>,
pub(crate) prefill_latencies: Vec<Vec<f64>>,
pub(crate) prefill_throughputs: Vec<Vec<f64>>,
pub(crate) decode_latencies: Vec<Vec<f64>>,
pub(crate) decode_token_latencies: Vec<Vec<f64>>,
pub(crate) decode_throughputs: Vec<Vec<f64>>,
pub(crate) prefill_batch_latency_throughput: Vec<(f64, f64)>,
pub(crate) decode_batch_latency_throughput: Vec<(f64, f64)>,
}
impl Data {
fn new(n_run: usize, n_batch: usize) -> Self {
let prefill_latencies: Vec<Vec<f64>> =
(0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
let prefill_latencies: Vec<Vec<f64>> = (0..batch_size.len())
.map(|_| Vec::with_capacity(n_run))
.collect();
let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
let prefill_batch_latency_throughput: Vec<(f64, f64)> =
Vec::with_capacity(batch_size.len());
let decode_batch_latency_throughput: Vec<(f64, f64)> =
prefill_batch_latency_throughput.clone();
Self {
batch_size,
prefill_latencies,
prefill_throughputs,
decode_latencies,
@ -401,14 +405,14 @@ impl Data {
}
fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
let latency = prefill.latency.as_millis() as f64;
let latency = prefill.latency.as_micros() as f64 / 1000.0;
self.prefill_latencies[batch_idx].push(latency);
self.prefill_throughputs[batch_idx].push(prefill.throughput);
}
fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
let latency = decode.latency.as_millis() as f64;
let token_latency = decode.token_latency.as_millis() as f64;
let latency = decode.latency.as_micros() as f64 / 1000.0;
let token_latency = decode.token_latency.as_micros() as f64 / 1000.0;
self.decode_latencies[batch_idx].push(latency);
self.decode_token_latencies[batch_idx].push(token_latency);
self.decode_throughputs[batch_idx].push(decode.throughput);

View File

@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
decode_length: u32,
n_runs: usize,
warmups: usize,
parameters: NextTokenChooserParameters,
client: ShardedClient,
run_sender: mpsc::Sender<Result<Message, ClientError>>,
mut shutdown_receiver: broadcast::Receiver<()>,
@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
// End task if a message is received on shutdown_receiver
// _shutdown_guard_sender will be dropped once the task is finished
tokio::select! {
res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, client, run_sender.clone()) => {
res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, parameters, client, run_sender.clone()) => {
if let Err(err) = res {
run_sender.send(Err(err)).await.unwrap_or(());
}
@ -65,6 +66,7 @@ async fn generate_runs(
decode_length: u32,
n_runs: usize,
warmups: usize,
parameters: NextTokenChooserParameters,
mut client: ShardedClient,
run_sender: mpsc::Sender<Result<Message, ClientError>>,
) -> Result<(), ClientError> {
@ -79,6 +81,7 @@ async fn generate_runs(
sequence_length,
b,
decode_length,
parameters.clone(),
&mut client,
)
.await?;
@ -93,6 +96,7 @@ async fn generate_runs(
sequence_length,
b,
decode_length,
parameters.clone(),
&mut client,
)
.await?;
@ -125,6 +129,7 @@ async fn prefill(
sequence_length: u32,
batch_size: u32,
decode_length: u32,
parameters: NextTokenChooserParameters,
client: &mut ShardedClient,
) -> Result<(Prefill, CachedBatch), ClientError> {
// Create requests
@ -133,16 +138,7 @@ async fn prefill(
id: id.into(),
inputs: sequence.clone(),
truncate: sequence_length,
parameters: Some(NextTokenChooserParameters {
temperature: 1.0,
top_k: 0,
top_p: 1.0,
typical_p: 1.0,
do_sample: false,
seed: 0,
repetition_penalty: 1.0,
watermark: false,
}),
parameters: Some(parameters.clone()),
stopping_parameters: Some(StoppingCriteriaParameters {
max_new_tokens: decode_length,
stop_sequences: vec![],

View File

@ -1,13 +1,14 @@
mod app;
mod event;
mod generation;
mod table;
mod utils;
use crate::app::App;
use crate::event::Event;
use crossterm::ExecutableCommand;
use std::io;
use text_generation_client::ShardedClient;
use text_generation_client::{NextTokenChooserParameters, ShardedClient};
use tokenizers::Tokenizer;
use tokio::sync::{broadcast, mpsc};
use tui::backend::CrosstermBackend;
@ -23,8 +24,26 @@ pub async fn run(
decode_length: u32,
n_runs: usize,
warmups: usize,
temperature: Option<f32>,
top_k: Option<u32>,
top_p: Option<f32>,
typical_p: Option<f32>,
repetition_penalty: Option<f32>,
watermark: bool,
do_sample: bool,
client: ShardedClient,
) -> Result<(), crossterm::ErrorKind> {
let parameters = NextTokenChooserParameters {
temperature: temperature.unwrap_or(1.0),
top_k: top_k.unwrap_or(0),
top_p: top_p.unwrap_or(1.0),
typical_p: typical_p.unwrap_or(1.0),
do_sample,
seed: 0,
repetition_penalty: repetition_penalty.unwrap_or(1.0),
watermark,
};
// Initialize terminal properties
crossterm::terminal::enable_raw_mode()?;
io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
@ -53,6 +72,7 @@ pub async fn run(
decode_length,
n_runs,
warmups,
parameters,
client,
run_sender,
shutdown_sender.subscribe(),
@ -73,7 +93,7 @@ pub async fn run(
// Create App
let mut app = App::new(
run_receiver,
tokenizer_name,
tokenizer_name.clone(),
sequence_length,
decode_length,
n_runs,
@ -106,5 +126,27 @@ pub async fn run(
crossterm::terminal::disable_raw_mode()?;
io::stdout().execute(crossterm::cursor::Show)?;
let parameters_table = table::parameters_table(
tokenizer_name,
sequence_length,
decode_length,
n_runs,
warmups,
temperature,
top_k,
top_p,
typical_p,
repetition_penalty,
watermark,
do_sample,
);
println!("\n{parameters_table}\n");
let latency_table = table::latency_table(&app.data);
println!("\n{latency_table}\n");
let throughput_table = table::throughput_table(&app.data);
println!("\n{throughput_table}\n");
Ok(())
}

View File

@ -28,11 +28,27 @@ struct Args {
runs: usize,
#[clap(default_value = "1", short, long, env)]
warmups: usize,
#[clap(long, env)]
temperature: Option<f32>,
#[clap(long, env)]
top_k: Option<u32>,
#[clap(long, env)]
top_p: Option<f32>,
#[clap(long, env)]
typical_p: Option<f32>,
#[clap(long, env)]
repetition_penalty: Option<f32>,
#[clap(long, env)]
watermark: bool,
#[clap(long, env)]
do_sample: bool,
#[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
master_shard_uds_path: String,
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
init_logging();
// Get args
let args = Args::parse();
// Pattern match configuration
@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
decode_length,
runs,
warmups,
temperature,
top_k,
top_p,
typical_p,
repetition_penalty,
watermark,
do_sample,
master_shard_uds_path,
} = args;
let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
init_logging();
// Tokenizer instance
// This will only be used to validate payloads
tracing::info!("Loading tokenizer");
@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
decode_length,
runs,
warmups,
temperature,
top_k,
top_p,
typical_p,
repetition_penalty,
watermark,
do_sample,
sharded_client,
)
.await

170
benchmark/src/table.rs Normal file
View File

@ -0,0 +1,170 @@
use crate::app::Data;
use tabled::settings::Merge;
use tabled::{builder::Builder, settings::Style, Table};
/// Build a two-column ("Parameter" / "Value") markdown-styled table that
/// echoes the settings the benchmark was launched with.
///
/// The optional sampling parameters are rendered through their `Debug`
/// representation, so an unset value is shown as `None` and a set one as
/// `Some(..)`.
// Each benchmark setting is passed as its own argument, so the long
// parameter list is intentional.
#[allow(clippy::too_many_arguments)]
pub(crate) fn parameters_table(
    tokenizer_name: String,
    sequence_length: u32,
    decode_length: u32,
    n_runs: usize,
    warmups: usize,
    temperature: Option<f32>,
    top_k: Option<u32>,
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
) -> Table {
    // (label, rendered value) pairs, in display order.
    let rows = [
        ("Model", tokenizer_name),
        ("Sequence Length", sequence_length.to_string()),
        ("Decode Length", decode_length.to_string()),
        ("N Runs", n_runs.to_string()),
        ("Warmups", warmups.to_string()),
        ("Temperature", format!("{temperature:?}")),
        ("Top K", format!("{top_k:?}")),
        ("Top P", format!("{top_p:?}")),
        ("Typical P", format!("{typical_p:?}")),
        ("Repetition Penalty", format!("{repetition_penalty:?}")),
        ("Watermark", watermark.to_string()),
        ("Do Sample", do_sample.to_string()),
    ];

    let mut builder = Builder::default();
    builder.set_header(["Parameter", "Value"]);
    for (label, value) in &rows {
        builder.push_record([*label, value.as_str()]);
    }

    let mut table = builder.build();
    table.with(Style::markdown());
    table
}
/// Build the latency summary table from the collected benchmark `data`.
///
/// For each of the three latency series (prefill, per-token decode, total
/// decode) one row per batch size is appended via `add_latencies`. The
/// result is styled as markdown with `Merge::vertical()` applied.
pub(crate) fn latency_table(data: &Data) -> Table {
    let mut builder = Builder::default();
    builder.set_header([
        "Step",
        "Batch Size",
        "Average",
        "Lowest",
        "Highest",
        "p50",
        "p90",
        "p99",
    ]);

    // (row label, per-batch samples), in the order the sections are printed.
    let sections: [(&'static str, &[Vec<f64>]); 3] = [
        ("Prefill", data.prefill_latencies.as_slice()),
        ("Decode (token)", data.decode_token_latencies.as_slice()),
        ("Decode (total)", data.decode_latencies.as_slice()),
    ];
    for (step, series) in sections {
        add_latencies(&mut builder, step, &data.batch_size, series);
    }

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
}
/// Build the throughput summary table from the collected benchmark `data`.
///
/// The prefill and decode throughput series each contribute one row per
/// batch size (appended via `add_throuhgputs`). The result is styled as
/// markdown with `Merge::vertical()` applied.
pub(crate) fn throughput_table(data: &Data) -> Table {
    let mut builder = Builder::default();
    builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);

    // (row label, per-batch samples), in the order the sections are printed.
    let sections: [(&'static str, &[Vec<f64>]); 2] = [
        ("Prefill", data.prefill_throughputs.as_slice()),
        ("Decode", data.decode_throughputs.as_slice()),
    ];
    for (step, series) in sections {
        add_throuhgputs(&mut builder, step, &data.batch_size, series);
    }

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
}
/// Append one latency row per batch size to `builder`.
///
/// `batch_latencies[i]` must hold the samples recorded for `batch_size[i]`;
/// indexing panics if `batch_latencies` is shorter than `batch_size`.
/// Every statistic is formatted with the `ms` unit.
fn add_latencies(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_latencies: &[Vec<f64>],
) {
    for (idx, size) in batch_size.iter().enumerate() {
        let samples = &batch_latencies[idx];
        let (avg, min, max) = avg_min_max(samples);

        // Render every cell up front so the record can borrow plain `&str`s.
        let size_cell = size.to_string();
        let avg_cell = format_value(avg, "ms");
        let min_cell = format_value(min, "ms");
        let max_cell = format_value(max, "ms");
        let p50_cell = format_value(px(samples, 50), "ms");
        let p90_cell = format_value(px(samples, 90), "ms");
        let p99_cell = format_value(px(samples, 99), "ms");

        builder.push_record([
            step,
            size_cell.as_str(),
            avg_cell.as_str(),
            min_cell.as_str(),
            max_cell.as_str(),
            p50_cell.as_str(),
            p90_cell.as_str(),
            p99_cell.as_str(),
        ]);
    }
}
/// Append one throughput row per batch size to `builder`.
///
/// `batch_throughputs[i]` must hold the samples recorded for
/// `batch_size[i]`; indexing panics if `batch_throughputs` is shorter than
/// `batch_size`. Every statistic is formatted with the `tokens/secs` unit.
fn add_throuhgputs(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_throughputs: &[Vec<f64>],
) {
    for (idx, size) in batch_size.iter().enumerate() {
        let samples = &batch_throughputs[idx];
        let (avg, min, max) = avg_min_max(samples);

        // Render every cell up front so the record can borrow plain `&str`s.
        let size_cell = size.to_string();
        let avg_cell = format_value(avg, "tokens/secs");
        let min_cell = format_value(min, "tokens/secs");
        let max_cell = format_value(max, "tokens/secs");

        builder.push_record([
            step,
            size_cell.as_str(),
            avg_cell.as_str(),
            min_cell.as_str(),
            max_cell.as_str(),
        ]);
    }
}
/// Return `(average, minimum, maximum)` of `data`.
///
/// Ordering uses `f64::total_cmp`, so NaN samples do not cause a panic.
/// For an empty slice all three values are `NaN`: the average is `0.0 / 0.0`
/// and the extrema fall back to `f64::NAN`.
fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
    let average = data.iter().sum::<f64>() / data.len() as f64;
    let min = data
        .iter()
        .copied()
        .min_by(f64::total_cmp)
        .unwrap_or(f64::NAN);
    let max = data
        .iter()
        .copied()
        .max_by(f64::total_cmp)
        .unwrap_or(f64::NAN);
    (average, min, max)
}
/// Return the `p`-th percentile (`p` in `0..=100`) of `data`.
///
/// The samples are sorted into a private copy first: callers hand over
/// measurements in the order they were recorded, and reading a percentile
/// by rank from unsorted data would just return an arbitrary sample.
///
/// The rank is `floor(p / 100 * len)`, clamped to the last element so that
/// `p = 100` yields the maximum instead of falling off the end.
/// Returns `NaN` when `data` is empty.
fn px(data: &[f64], p: u32) -> f64 {
    if data.is_empty() {
        return f64::NAN;
    }

    let mut sorted = data.to_vec();
    sorted.sort_unstable_by(|a, b| a.total_cmp(b));

    let rank = (f64::from(p) / 100.0 * sorted.len() as f64) as usize;
    sorted[rank.min(sorted.len() - 1)]
}
/// Render `value` with two decimal places, followed by a space and `unit`.
fn format_value(value: f64, unit: &'static str) -> String {
    format!("{value:.2} {unit}")
}