diff --git a/Cargo.lock b/Cargo.lock
index 1dcc7c97..62896a51 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -249,6 +249,12 @@ version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
 
+[[package]]
+name = "bytecount"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
+
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@@ -1706,6 +1712,17 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
+[[package]]
+name = "papergrid"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fdfe703c51ddc52887ad78fc69cd2ea78d895ffcd6e955c9d03566db8ab5bb1"
+dependencies = [
+ "bytecount",
+ "fnv",
+ "unicode-width",
+]
+
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@@ -2490,6 +2507,30 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "tabled"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da1a2e56bbf7bfdd08aaa7592157a742205459eff774b73bc01809ae2d99dc2a"
+dependencies = [
+ "papergrid",
+ "tabled_derive",
+ "unicode-width",
+]
+
+[[package]]
+name = "tabled_derive"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
+dependencies = [
+ "heck",
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+]
+
 [[package]]
 name = "tar"
 version = "0.4.38"
@@ -2525,6 +2566,7 @@ dependencies = [
  "ratatui",
  "serde",
  "serde_json",
+ "tabled",
  "text-generation-client",
  "thiserror",
  "tokenizers",
diff --git a/benchmark/Cargo.toml b/benchmark/Cargo.toml
index a4e215fc..67e04f0a 100644
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@@ -20,6 +20,7 @@ crossterm = "0.26"
 float-ord = "0.3.2"
 serde = {version = "1.0.142", features = ["derive"]}
 serde_json = "1.0"
+tabled = "0.12.0"
 text-generation-client = { path = "../router/client" }
 thiserror = "1.0.38"
 tokenizers = "0.13.3"
diff --git a/benchmark/src/app.rs b/benchmark/src/app.rs
index 85026a33..6a9881fb 100644
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@@ -15,6 +15,7 @@ use tui::{symbols, Frame};
 /// TUI powered App
 pub(crate) struct App {
     pub(crate) running: bool,
+    pub(crate) data: Data,
     completed_runs: Vec<usize>,
     completed_batch: usize,
     current_batch: usize,
@@ -22,12 +23,10 @@ pub(crate) struct App {
     touched_tab: bool,
     zoom: bool,
     is_error: bool,
-    data: Data,
     tokenizer_name: String,
     sequence_length: u32,
     decode_length: u32,
     n_run: usize,
-    batch_size: Vec<u32>,
     receiver: mpsc::Receiver<Result<Message, ClientError>>,
 }
 
@@ -40,7 +39,6 @@ impl App {
         n_run: usize,
         batch_size: Vec<u32>,
     ) -> Self {
-        let data = Data::new(n_run, batch_size.len());
         let current_tab = 0;
 
         let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
@@ -48,8 +46,11 @@ impl App {
         let current_batch = 0;
         let is_error = false;
 
+        let data = Data::new(n_run, batch_size);
+
         Self {
             running: true,
+            data,
             completed_runs,
             completed_batch,
             current_batch,
@@ -57,12 +58,10 @@ impl App {
             touched_tab: false,
             zoom: false,
             is_error,
-            data,
             tokenizer_name,
             sequence_length,
             decode_length,
             n_run,
-            batch_size,
             receiver,
         }
     }
@@ -79,7 +78,7 @@ impl App {
                 code: KeyCode::Tab,
                 ..
             } => {
                 self.touched_tab = true;
-                self.current_tab = (self.current_tab + 1) % self.batch_size.len();
+                self.current_tab = (self.current_tab + 1) % self.data.batch_size.len();
             }
             // Decrease and wrap tab
             KeyEvent {
@@ -90,7 +89,7 @@ impl App {
                 if self.current_tab > 0 {
                     self.current_tab -= 1;
                 } else {
-                    self.current_tab = self.batch_size.len() - 1;
+                    self.current_tab = self.data.batch_size.len() - 1;
                 }
             }
             // Zoom on throughput/latency fig
@@ -137,7 +136,7 @@ impl App {
                 self.data.end_batch(self.current_batch);
                 self.completed_batch += 1;
 
-                if self.current_batch < self.batch_size.len() - 1 {
+                if self.current_batch < self.data.batch_size.len() - 1 {
                     // Only go to next tab if the user never touched the tab keys
                     if !self.touched_tab {
                         self.current_tab += 1;
@@ -156,7 +155,7 @@ impl App {
     /// Render frame
     pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
         let batch_progress =
-            (self.completed_batch as f64 / self.batch_size.len() as f64).clamp(0.0, 1.0);
+            (self.completed_batch as f64 / self.data.batch_size.len() as f64).clamp(0.0, 1.0);
         let run_progress =
             (self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
 
@@ -241,6 +240,7 @@ impl App {
         // Batch tabs
         let titles = self
+            .data
             .batch_size
             .iter()
             .map(|b| {
@@ -269,7 +269,7 @@ impl App {
         };
         let batch_gauge = progress_gauge(
             "Total Progress",
-            format!("{} / {}", self.completed_batch, self.batch_size.len()),
+            format!("{} / {}", self.completed_batch, self.data.batch_size.len()),
             batch_progress,
             color,
         );
@@ -347,7 +347,7 @@ impl App {
         // Prefill latency/throughput chart
         let prefill_latency_throughput_chart = latency_throughput_chart(
             &self.data.prefill_batch_latency_throughput,
-            &self.batch_size,
+            &self.data.batch_size,
             self.zoom,
             "Prefill",
         );
@@ -356,7 +356,7 @@ impl App {
         // Decode latency/throughput chart
         let decode_latency_throughput_chart = latency_throughput_chart(
             &self.data.decode_batch_latency_throughput,
-            &self.batch_size,
+            &self.data.batch_size,
             self.zoom,
             "Decode",
         );
@@ -365,31 +365,35 @@ impl App {
 }
 
 /// App internal data struct
-struct Data {
-    prefill_latencies: Vec<Vec<f64>>,
-    prefill_throughputs: Vec<Vec<f64>>,
-    decode_latencies: Vec<Vec<f64>>,
-    decode_token_latencies: Vec<Vec<f64>>,
-    decode_throughputs: Vec<Vec<f64>>,
-    prefill_batch_latency_throughput: Vec<(f64, f64)>,
-    decode_batch_latency_throughput: Vec<(f64, f64)>,
+pub(crate) struct Data {
+    pub(crate) batch_size: Vec<u32>,
+    pub(crate) prefill_latencies: Vec<Vec<f64>>,
+    pub(crate) prefill_throughputs: Vec<Vec<f64>>,
+    pub(crate) decode_latencies: Vec<Vec<f64>>,
+    pub(crate) decode_token_latencies: Vec<Vec<f64>>,
+    pub(crate) decode_throughputs: Vec<Vec<f64>>,
+    pub(crate) prefill_batch_latency_throughput: Vec<(f64, f64)>,
+    pub(crate) decode_batch_latency_throughput: Vec<(f64, f64)>,
 }
 
 impl Data {
-    fn new(n_run: usize, n_batch: usize) -> Self {
-        let prefill_latencies: Vec<Vec<f64>> =
-            (0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
+    fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
+        let prefill_latencies: Vec<Vec<f64>> = (0..batch_size.len())
+            .map(|_| Vec::with_capacity(n_run))
+            .collect();
         let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
 
         let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
         let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
         let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
 
-        let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
+        let prefill_batch_latency_throughput: Vec<(f64, f64)> =
+            Vec::with_capacity(batch_size.len());
         let decode_batch_latency_throughput: Vec<(f64, f64)> =
             prefill_batch_latency_throughput.clone();
 
         Self {
+            batch_size,
             prefill_latencies,
             prefill_throughputs,
             decode_latencies,
@@ -401,14 +405,14 @@ impl Data {
     }
 
     fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
-        let latency = prefill.latency.as_millis() as f64;
+        let latency = prefill.latency.as_micros() as f64 / 1000.0;
         self.prefill_latencies[batch_idx].push(latency);
         self.prefill_throughputs[batch_idx].push(prefill.throughput);
     }
 
     fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
-        let latency = decode.latency.as_millis() as f64;
-        let token_latency = decode.token_latency.as_millis() as f64;
+        let latency = decode.latency.as_micros() as f64 / 1000.0;
+        let token_latency = decode.token_latency.as_micros() as f64 / 1000.0;
         self.decode_latencies[batch_idx].push(latency);
         self.decode_token_latencies[batch_idx].push(token_latency);
         self.decode_throughputs[batch_idx].push(decode.throughput);
diff --git a/benchmark/src/generation.rs b/benchmark/src/generation.rs
index d40a2e8d..17c72d26 100644
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
     decode_length: u32,
     n_runs: usize,
     warmups: usize,
+    parameters: NextTokenChooserParameters,
     client: ShardedClient,
     run_sender: mpsc::Sender<Result<Message, ClientError>>,
     mut shutdown_receiver: broadcast::Receiver<()>,
@@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
     // End task if a message is received on shutdown_receiver
     // _shutdown_guard_sender will be dropped once the task is finished
     tokio::select! {
-        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, client, run_sender.clone()) => {
+        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, parameters, client, run_sender.clone()) => {
            if let Err(err) = res {
                run_sender.send(Err(err)).await.unwrap_or(());
            }
@@ -65,6 +66,7 @@ async fn generate_runs(
     decode_length: u32,
     n_runs: usize,
     warmups: usize,
+    parameters: NextTokenChooserParameters,
     mut client: ShardedClient,
     run_sender: mpsc::Sender<Result<Message, ClientError>>,
 ) -> Result<(), ClientError> {
@@ -79,6 +81,7 @@ async fn generate_runs(
             sequence_length,
             b,
             decode_length,
+            parameters.clone(),
             &mut client,
         )
         .await?;
@@ -93,6 +96,7 @@ async fn generate_runs(
                 sequence_length,
                 b,
                 decode_length,
+                parameters.clone(),
                 &mut client,
             )
             .await?;
@@ -125,6 +129,7 @@ async fn prefill(
     sequence_length: u32,
     batch_size: u32,
     decode_length: u32,
+    parameters: NextTokenChooserParameters,
     client: &mut ShardedClient,
 ) -> Result<(Prefill, CachedBatch), ClientError> {
     // Create requests
@@ -133,16 +138,7 @@
             id: id.into(),
             inputs: sequence.clone(),
             truncate: sequence_length,
-            parameters: Some(NextTokenChooserParameters {
-                temperature: 1.0,
-                top_k: 0,
-                top_p: 1.0,
-                typical_p: 1.0,
-                do_sample: false,
-                seed: 0,
-                repetition_penalty: 1.0,
-                watermark: false,
-            }),
+            parameters: Some(parameters.clone()),
             stopping_parameters: Some(StoppingCriteriaParameters {
                 max_new_tokens: decode_length,
                 stop_sequences: vec![],
diff --git a/benchmark/src/lib.rs b/benchmark/src/lib.rs
index 4da0b573..fcad400c 100644
--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@@ -1,13 +1,14 @@
 mod app;
 mod event;
 mod generation;
+mod table;
 mod utils;
 
 use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::ShardedClient;
+use text_generation_client::{NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
@@ -23,8 +24,26 @@ pub async fn run(
     decode_length: u32,
     n_runs: usize,
     warmups: usize,
+    temperature: Option<f32>,
+    top_k: Option<u32>,
+    top_p: Option<f32>,
+    typical_p: Option<f32>,
+    repetition_penalty: Option<f32>,
+    watermark: bool,
+    do_sample: bool,
     client: ShardedClient,
 ) -> Result<(), crossterm::ErrorKind> {
+    let parameters = NextTokenChooserParameters {
+        temperature: temperature.unwrap_or(1.0),
+        top_k: top_k.unwrap_or(0),
+        top_p: top_p.unwrap_or(1.0),
+        typical_p: typical_p.unwrap_or(1.0),
+        do_sample,
+        seed: 0,
+        repetition_penalty: repetition_penalty.unwrap_or(1.0),
+        watermark,
+    };
+
     // Initialize terminal properties
     crossterm::terminal::enable_raw_mode()?;
     io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
@@ -53,6 +72,7 @@ pub async fn run(
         decode_length,
         n_runs,
         warmups,
+        parameters,
         client,
         run_sender,
         shutdown_sender.subscribe(),
@@ -73,7 +93,7 @@ pub async fn run(
     // Create App
     let mut app = App::new(
         run_receiver,
-        tokenizer_name,
+        tokenizer_name.clone(),
         sequence_length,
         decode_length,
         n_runs,
@@ -106,5 +126,27 @@ pub async fn run(
     crossterm::terminal::disable_raw_mode()?;
     io::stdout().execute(crossterm::cursor::Show)?;
 
+    let parameters_table = table::parameters_table(
+        tokenizer_name,
+        sequence_length,
+        decode_length,
+        n_runs,
+        warmups,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        repetition_penalty,
+        watermark,
+        do_sample,
+    );
+    println!("\n{parameters_table}\n");
+
+    let latency_table = table::latency_table(&app.data);
+    println!("\n{latency_table}\n");
+
+    let throughput_table = table::throughput_table(&app.data);
+    println!("\n{throughput_table}\n");
+
     Ok(())
 }
diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs
index 03f61dcd..6172d377 100644
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@@ -28,11 +28,27 @@ struct Args {
     runs: usize,
     #[clap(default_value = "1", short, long, env)]
     warmups: usize,
+    #[clap(long, env)]
+    temperature: Option<f32>,
+    #[clap(long, env)]
+    top_k: Option<u32>,
+    #[clap(long, env)]
+    top_p: Option<f32>,
+    #[clap(long, env)]
+    typical_p: Option<f32>,
+    #[clap(long, env)]
+    repetition_penalty: Option<f32>,
+    #[clap(long, env)]
+    watermark: bool,
+    #[clap(long, env)]
+    do_sample: bool,
     #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
     master_shard_uds_path: String,
 }
 
 fn main() -> Result<(), Box<dyn std::error::Error>> {
+    init_logging();
+
     // Get args
     let args = Args::parse();
     // Pattern match configuration
@@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         decode_length,
         runs,
         warmups,
+        temperature,
+        top_k,
+        top_p,
+        typical_p,
+        repetition_penalty,
+        watermark,
+        do_sample,
         master_shard_uds_path,
     } = args;
 
     let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
 
-    init_logging();
-
     // Tokenizer instance
     // This will only be used to validate payloads
     tracing::info!("Loading tokenizer");
@@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
             decode_length,
             runs,
             warmups,
+            temperature,
+            top_k,
+            top_p,
+            typical_p,
+            repetition_penalty,
+            watermark,
+            do_sample,
             sharded_client,
         )
         .await
diff --git a/benchmark/src/table.rs b/benchmark/src/table.rs
new file mode 100644
index 00000000..6b74bc36
--- /dev/null
+++ b/benchmark/src/table.rs
@@ -0,0 +1,175 @@
+use crate::app::Data;
+use tabled::settings::Merge;
+use tabled::{builder::Builder, settings::Style, Table};
+
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn parameters_table(
+    tokenizer_name: String,
+    sequence_length: u32,
+    decode_length: u32,
+    n_runs: usize,
+    warmups: usize,
+    temperature: Option<f32>,
+    top_k: Option<u32>,
+    top_p: Option<f32>,
+    typical_p: Option<f32>,
+    repetition_penalty: Option<f32>,
+    watermark: bool,
+    do_sample: bool,
+) -> Table {
+    let mut builder = Builder::default();
+
+    builder.set_header(["Parameter", "Value"]);
+
+    builder.push_record(["Model", &tokenizer_name]);
+    builder.push_record(["Sequence Length", &sequence_length.to_string()]);
+    builder.push_record(["Decode Length", &decode_length.to_string()]);
+    builder.push_record(["N Runs", &n_runs.to_string()]);
+    builder.push_record(["Warmups", &warmups.to_string()]);
+    builder.push_record(["Temperature", &format!("{temperature:?}")]);
+    builder.push_record(["Top K", &format!("{top_k:?}")]);
+    builder.push_record(["Top P", &format!("{top_p:?}")]);
+    builder.push_record(["Typical P", &format!("{typical_p:?}")]);
+    builder.push_record(["Repetition Penalty", &format!("{repetition_penalty:?}")]);
+    builder.push_record(["Watermark", &watermark.to_string()]);
+    builder.push_record(["Do Sample", &do_sample.to_string()]);
+
+    let mut table = builder.build();
+    table.with(Style::markdown());
+    table
+}
+
+pub(crate) fn latency_table(data: &Data) -> Table {
+    let mut builder = Builder::default();
+
+    builder.set_header([
+        "Step",
+        "Batch Size",
+        "Average",
+        "Lowest",
+        "Highest",
+        "p50",
+        "p90",
+        "p99",
+    ]);
+
+    add_latencies(
+        &mut builder,
+        "Prefill",
+        &data.batch_size,
+        &data.prefill_latencies,
+    );
+    add_latencies(
+        &mut builder,
+        "Decode (token)",
+        &data.batch_size,
+        &data.decode_token_latencies,
+    );
+    add_latencies(
+        &mut builder,
+        "Decode (total)",
+        &data.batch_size,
+        &data.decode_latencies,
+    );
+
+    let mut table = builder.build();
+    table.with(Style::markdown()).with(Merge::vertical());
+    table
+}
+
+pub(crate) fn throughput_table(data: &Data) -> Table {
+    let mut builder = Builder::default();
+
+    builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);
+
+    add_throughputs(
+        &mut builder,
+        "Prefill",
+        &data.batch_size,
+        &data.prefill_throughputs,
+    );
+    add_throughputs(
+        &mut builder,
+        "Decode",
+        &data.batch_size,
+        &data.decode_throughputs,
+    );
+
+    let mut table = builder.build();
+    table.with(Style::markdown()).with(Merge::vertical());
+    table
+}
+
+fn add_latencies(
+    builder: &mut Builder,
+    step: &'static str,
+    batch_size: &[u32],
+    batch_latencies: &[Vec<f64>],
+) {
+    for (i, b) in batch_size.iter().enumerate() {
+        let latencies = &batch_latencies[i];
+        let (avg, min, max) = avg_min_max(latencies);
+
+        let row = [
+            step,
+            &b.to_string(),
+            &format_value(avg, "ms"),
+            &format_value(min, "ms"),
+            &format_value(max, "ms"),
+            &format_value(px(latencies, 50), "ms"),
+            &format_value(px(latencies, 90), "ms"),
+            &format_value(px(latencies, 99), "ms"),
+        ];
+
+        builder.push_record(row);
+    }
+}
+
+fn add_throughputs(
+    builder: &mut Builder,
+    step: &'static str,
+    batch_size: &[u32],
+    batch_throughputs: &[Vec<f64>],
+) {
+    for (i, b) in batch_size.iter().enumerate() {
+        let throughputs = &batch_throughputs[i];
+        let (avg, min, max) = avg_min_max(throughputs);
+
+        let row = [
+            step,
+            &b.to_string(),
+            &format_value(avg, "tokens/sec"),
+            &format_value(min, "tokens/sec"),
+            &format_value(max, "tokens/sec"),
+        ];
+
+        builder.push_record(row);
+    }
+}
+
+fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
+    let average = data.iter().sum::<f64>() / data.len() as f64;
+    let min = data
+        .iter()
+        .min_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    let max = data
+        .iter()
+        .max_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    (average, *min, *max)
+}
+
+/// Nearest-rank percentile; the series is recorded in run order, so it
+/// has to be sorted before indexing into it
+fn px(data: &[f64], p: u32) -> f64 {
+    let mut sorted = data.to_vec();
+    sorted.sort_by(|a, b| a.total_cmp(b));
+    let i = (f64::from(p) / 100.0 * sorted.len() as f64) as usize;
+    *sorted.get(i).unwrap_or(&std::f64::NAN)
+}
+
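+/// Format a float to two decimal places followed by its unit, e.g. "12.34 ms"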
+fn format_value(value: f64, unit: &'static str) -> String {
+    format!("{value:.2} {unit}")
+}
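With this change the benchmark can exercise sampling from the command line. A typical invocation might look as follows (illustrative values; the long flag names follow from the clap fields added above, and the binary name text-generation-benchmark is assumed from the crate's bin target):

    text-generation-benchmark \
        --tokenizer-name bigscience/bloom-560m \
        --sequence-length 10 \
        --decode-length 8 \
        --runs 10 \
        --warmups 1 \
        --temperature 0.9 \
        --top-k 50 \
        --top-p 0.9 \
        --repetition-penalty 1.2 \
        --do-sample

On exit, the parameters, latency, and throughput tables are printed to stdout as markdown (Style::markdown), so they can be pasted directly into an issue or PR description.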