feat(benchmarker): add summary tables (#368)

2023-05-25 13:38:36 +02:00 · 2023-05-25 13:38:36 +02:00 · 951930fbff
parent 218c9adaa5
commit 951930fbff
7 changed files with 325 additions and 42 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -249,6 +249,12 @@ version = "3.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
 [[package]]
 name = "bytecount"
 version = "0.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
 [[package]]
 name = "byteorder"
 version = "1.4.3"
@ -1706,6 +1712,17 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 [[package]]
 name = "papergrid"
 version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1fdfe703c51ddc52887ad78fc69cd2ea78d895ffcd6e955c9d03566db8ab5bb1"
 dependencies = [
 "bytecount",
 "fnv",
 "unicode-width",
 ]
 [[package]]
 name = "parking_lot"
 version = "0.12.1"
@ -2490,6 +2507,30 @@ dependencies = [
 "winapi",
 ]
 [[package]]
 name = "tabled"
 version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da1a2e56bbf7bfdd08aaa7592157a742205459eff774b73bc01809ae2d99dc2a"
 dependencies = [
 "papergrid",
 "tabled_derive",
 "unicode-width",
 ]
 [[package]]
 name = "tabled_derive"
 version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
 dependencies = [
 "heck",
 "proc-macro-error",
 "proc-macro2",
 "quote",
 "syn 1.0.109",
 ]
 [[package]]
 name = "tar"
 version = "0.4.38"
@ -2525,6 +2566,7 @@ dependencies = [
 "ratatui",
 "serde",
 "serde_json",
 "tabled",
 "text-generation-client",
 "thiserror",
 "tokenizers",
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@ -20,6 +20,7 @@ crossterm = "0.26"
 float-ord = "0.3.2"
 serde = {version = "1.0.142", features = ["derive"]}
 serde_json = "1.0"
 tabled = "0.12.0"
 text-generation-client = { path = "../router/client" }
 thiserror = "1.0.38"
 tokenizers = "0.13.3"
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@ -15,6 +15,7 @@ use tui::{symbols, Frame};
 /// TUI powered App
 pub(crate) struct App {
    pub(crate) running: bool,
    pub(crate) data: Data,
    completed_runs: Vec<usize>,
    completed_batch: usize,
    current_batch: usize,
@ -22,12 +23,10 @@ pub(crate) struct App {
    touched_tab: bool,
    zoom: bool,
    is_error: bool,
    data: Data,
    tokenizer_name: String,
    sequence_length: u32,
    decode_length: u32,
    n_run: usize,
    batch_size: Vec<u32>,
    receiver: mpsc::Receiver<Result<Message, ClientError>>,
 }
@ -40,7 +39,6 @@ impl App {
        n_run: usize,
        batch_size: Vec<u32>,
    ) -> Self {
        let data = Data::new(n_run, batch_size.len());
        let current_tab = 0;
        let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
@ -48,8 +46,11 @@ impl App {
        let current_batch = 0;
        let is_error = false;
        let data = Data::new(n_run, batch_size);
        Self {
            running: true,
            data,
            completed_runs,
            completed_batch,
            current_batch,
@ -57,12 +58,10 @@ impl App {
            touched_tab: false,
            zoom: false,
            is_error,
            data,
            tokenizer_name,
            sequence_length,
            decode_length,
            n_run,
            batch_size,
            receiver,
        }
    }
@ -79,7 +78,7 @@ impl App {
                code: KeyCode::Tab, ..
            } => {
                self.touched_tab = true;
-                self.current_tab = (self.current_tab + 1) % self.batch_size.len();
+                self.current_tab = (self.current_tab + 1) % self.data.batch_size.len();
            }
            // Decrease and wrap tab
            KeyEvent {
@ -90,7 +89,7 @@ impl App {
                if self.current_tab > 0 {
                    self.current_tab -= 1;
                } else {
-                    self.current_tab = self.batch_size.len() - 1;
+                    self.current_tab = self.data.batch_size.len() - 1;
                }
            }
            // Zoom on throughput/latency fig
@ -137,7 +136,7 @@ impl App {
                        self.data.end_batch(self.current_batch);
                        self.completed_batch += 1;
-                        if self.current_batch < self.batch_size.len() - 1 {
+                        if self.current_batch < self.data.batch_size.len() - 1 {
                            // Only go to next tab if the user never touched the tab keys
                            if !self.touched_tab {
                                self.current_tab += 1;
@ -156,7 +155,7 @@ impl App {
    /// Render frame
    pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
        let batch_progress =
-            (self.completed_batch as f64 / self.batch_size.len() as f64).clamp(0.0, 1.0);
+            (self.completed_batch as f64 / self.data.batch_size.len() as f64).clamp(0.0, 1.0);
        let run_progress =
            (self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
@ -241,6 +240,7 @@ impl App {
        // Batch tabs
        let titles = self
            .data
            .batch_size
            .iter()
            .map(|b| {
@ -269,7 +269,7 @@ impl App {
        };
        let batch_gauge = progress_gauge(
            "Total Progress",
-            format!("{} / {}", self.completed_batch, self.batch_size.len()),
+            format!("{} / {}", self.completed_batch, self.data.batch_size.len()),
            batch_progress,
            color,
        );
@ -347,7 +347,7 @@ impl App {
        // Prefill latency/throughput chart
        let prefill_latency_throughput_chart = latency_throughput_chart(
            &self.data.prefill_batch_latency_throughput,
-            &self.batch_size,
+            &self.data.batch_size,
            self.zoom,
            "Prefill",
        );
@ -356,7 +356,7 @@ impl App {
        // Decode latency/throughput chart
        let decode_latency_throughput_chart = latency_throughput_chart(
            &self.data.decode_batch_latency_throughput,
-            &self.batch_size,
+            &self.data.batch_size,
            self.zoom,
            "Decode",
        );
@ -365,31 +365,35 @@ impl App {
 }
 /// App internal data struct
-struct Data {
+pub(crate) struct Data {
-    prefill_latencies: Vec<Vec<f64>>,
+    pub(crate) batch_size: Vec<u32>,
-    prefill_throughputs: Vec<Vec<f64>>,
+    pub(crate) prefill_latencies: Vec<Vec<f64>>,
-    decode_latencies: Vec<Vec<f64>>,
+    pub(crate) prefill_throughputs: Vec<Vec<f64>>,
-    decode_token_latencies: Vec<Vec<f64>>,
+    pub(crate) decode_latencies: Vec<Vec<f64>>,
-    decode_throughputs: Vec<Vec<f64>>,
+    pub(crate) decode_token_latencies: Vec<Vec<f64>>,
-    prefill_batch_latency_throughput: Vec<(f64, f64)>,
+    pub(crate) decode_throughputs: Vec<Vec<f64>>,
-    decode_batch_latency_throughput: Vec<(f64, f64)>,
+    pub(crate) prefill_batch_latency_throughput: Vec<(f64, f64)>,
    pub(crate) decode_batch_latency_throughput: Vec<(f64, f64)>,
 }
 impl Data {
-    fn new(n_run: usize, n_batch: usize) -> Self {
+    fn new(n_run: usize, batch_size: Vec<u32>) -> Self {
-        let prefill_latencies: Vec<Vec<f64>> =
+        let prefill_latencies: Vec<Vec<f64>> = (0..batch_size.len())
-            (0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
+            .map(|_| Vec::with_capacity(n_run))
            .collect();
        let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
        let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
        let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
        let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
-        let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
+        let prefill_batch_latency_throughput: Vec<(f64, f64)> =
            Vec::with_capacity(batch_size.len());
        let decode_batch_latency_throughput: Vec<(f64, f64)> =
            prefill_batch_latency_throughput.clone();
        Self {
            batch_size,
            prefill_latencies,
            prefill_throughputs,
            decode_latencies,
@ -401,14 +405,14 @@ impl Data {
    }
    fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
-        let latency = prefill.latency.as_millis() as f64;
+        let latency = prefill.latency.as_micros() as f64 / 1000.0;
        self.prefill_latencies[batch_idx].push(latency);
        self.prefill_throughputs[batch_idx].push(prefill.throughput);
    }
    fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
-        let latency = decode.latency.as_millis() as f64;
+        let latency = decode.latency.as_micros() as f64 / 1000.0;
-        let token_latency = decode.token_latency.as_millis() as f64;
+        let token_latency = decode.token_latency.as_micros() as f64 / 1000.0;
        self.decode_latencies[batch_idx].push(latency);
        self.decode_token_latencies[batch_idx].push(token_latency);
        self.decode_throughputs[batch_idx].push(decode.throughput);
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
    decode_length: u32,
    n_runs: usize,
    warmups: usize,
    parameters: NextTokenChooserParameters,
    client: ShardedClient,
    run_sender: mpsc::Sender<Result<Message, ClientError>>,
    mut shutdown_receiver: broadcast::Receiver<()>,
@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
    // End task if a message is received on shutdown_receiver
    // _shutdown_guard_sender will be dropped once the task is finished
    tokio::select! {
-        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, client, run_sender.clone())  => {
+        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, parameters, client, run_sender.clone())  => {
            if let Err(err) = res {
                run_sender.send(Err(err)).await.unwrap_or(());
            }
@ -65,6 +66,7 @@ async fn generate_runs(
    decode_length: u32,
    n_runs: usize,
    warmups: usize,
    parameters: NextTokenChooserParameters,
    mut client: ShardedClient,
    run_sender: mpsc::Sender<Result<Message, ClientError>>,
 ) -> Result<(), ClientError> {
@ -79,6 +81,7 @@ async fn generate_runs(
                sequence_length,
                b,
                decode_length,
                parameters.clone(),
                &mut client,
            )
            .await?;
@ -93,6 +96,7 @@ async fn generate_runs(
                sequence_length,
                b,
                decode_length,
                parameters.clone(),
                &mut client,
            )
            .await?;
@ -125,6 +129,7 @@ async fn prefill(
    sequence_length: u32,
    batch_size: u32,
    decode_length: u32,
    parameters: NextTokenChooserParameters,
    client: &mut ShardedClient,
 ) -> Result<(Prefill, CachedBatch), ClientError> {
    // Create requests
@ -133,16 +138,7 @@ async fn prefill(
            id: id.into(),
            inputs: sequence.clone(),
            truncate: sequence_length,
-            parameters: Some(NextTokenChooserParameters {
+            parameters: Some(parameters.clone()),
                temperature: 1.0,
                top_k: 0,
                top_p: 1.0,
                typical_p: 1.0,
                do_sample: false,
                seed: 0,
                repetition_penalty: 1.0,
                watermark: false,
            }),
            stopping_parameters: Some(StoppingCriteriaParameters {
                max_new_tokens: decode_length,
                stop_sequences: vec![],
--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@ -1,13 +1,14 @@
 mod app;
 mod event;
 mod generation;
 mod table;
 mod utils;
 use crate::app::App;
 use crate::event::Event;
 use crossterm::ExecutableCommand;
 use std::io;
-use text_generation_client::ShardedClient;
+use text_generation_client::{NextTokenChooserParameters, ShardedClient};
 use tokenizers::Tokenizer;
 use tokio::sync::{broadcast, mpsc};
 use tui::backend::CrosstermBackend;
@ -23,8 +24,26 @@ pub async fn run(
    decode_length: u32,
    n_runs: usize,
    warmups: usize,
    temperature: Option<f32>,
    top_k: Option<u32>,
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
    client: ShardedClient,
 ) -> Result<(), crossterm::ErrorKind> {
    let parameters = NextTokenChooserParameters {
        temperature: temperature.unwrap_or(1.0),
        top_k: top_k.unwrap_or(0),
        top_p: top_p.unwrap_or(1.0),
        typical_p: typical_p.unwrap_or(1.0),
        do_sample,
        seed: 0,
        repetition_penalty: repetition_penalty.unwrap_or(1.0),
        watermark,
    };
    // Initialize terminal properties
    crossterm::terminal::enable_raw_mode()?;
    io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
@ -53,6 +72,7 @@ pub async fn run(
        decode_length,
        n_runs,
        warmups,
        parameters,
        client,
        run_sender,
        shutdown_sender.subscribe(),
@ -73,7 +93,7 @@ pub async fn run(
    // Create App
    let mut app = App::new(
        run_receiver,
-        tokenizer_name,
+        tokenizer_name.clone(),
        sequence_length,
        decode_length,
        n_runs,
@ -106,5 +126,27 @@ pub async fn run(
    crossterm::terminal::disable_raw_mode()?;
    io::stdout().execute(crossterm::cursor::Show)?;
    let parameters_table = table::parameters_table(
        tokenizer_name,
        sequence_length,
        decode_length,
        n_runs,
        warmups,
        temperature,
        top_k,
        top_p,
        typical_p,
        repetition_penalty,
        watermark,
        do_sample,
    );
    println!("\n{parameters_table}\n");
    let latency_table = table::latency_table(&app.data);
    println!("\n{latency_table}\n");
    let throughput_table = table::throughput_table(&app.data);
    println!("\n{throughput_table}\n");
    Ok(())
 }
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@ -28,11 +28,27 @@ struct Args {
    runs: usize,
    #[clap(default_value = "1", short, long, env)]
    warmups: usize,
    #[clap(long, env)]
    temperature: Option<f32>,
    #[clap(long, env)]
    top_k: Option<u32>,
    #[clap(long, env)]
    top_p: Option<f32>,
    #[clap(long, env)]
    typical_p: Option<f32>,
    #[clap(long, env)]
    repetition_penalty: Option<f32>,
    #[clap(long, env)]
    watermark: bool,
    #[clap(long, env)]
    do_sample: bool,
    #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
    master_shard_uds_path: String,
 }
 fn main() -> Result<(), Box<dyn std::error::Error>> {
    init_logging();
    // Get args
    let args = Args::parse();
    // Pattern match configuration
@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
        decode_length,
        runs,
        warmups,
        temperature,
        top_k,
        top_p,
        typical_p,
        repetition_penalty,
        watermark,
        do_sample,
        master_shard_uds_path,
    } = args;
    let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
    init_logging();
    // Tokenizer instance
    // This will only be used to validate payloads
    tracing::info!("Loading tokenizer");
@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
                decode_length,
                runs,
                warmups,
                temperature,
                top_k,
                top_p,
                typical_p,
                repetition_penalty,
                watermark,
                do_sample,
                sharded_client,
            )
            .await
--- a/benchmark/src/table.rs
+++ b/benchmark/src/table.rs
@ -0,0 +1,170 @@
 use crate::app::Data;
 use tabled::settings::Merge;
 use tabled::{builder::Builder, settings::Style, Table};
 /// Builds a two-column ("Parameter" / "Value") markdown-styled table listing
 /// the benchmark configuration.
 ///
 /// Optional sampling parameters are rendered with their `Debug`
 /// representation, so an unset value shows up as `None`.
 // One argument per reported parameter; mirrors the arguments of `run`.
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn parameters_table(
    tokenizer_name: String,
    sequence_length: u32,
    decode_length: u32,
    n_runs: usize,
    warmups: usize,
    temperature: Option<f32>,
    top_k: Option<u32>,
    top_p: Option<f32>,
    typical_p: Option<f32>,
    repetition_penalty: Option<f32>,
    watermark: bool,
    do_sample: bool,
 ) -> Table {
    // Rows in display order: label first, already-rendered value second.
    let rows: [(&str, String); 12] = [
        ("Model", tokenizer_name),
        ("Sequence Length", sequence_length.to_string()),
        ("Decode Length", decode_length.to_string()),
        ("N Runs", n_runs.to_string()),
        ("Warmups", warmups.to_string()),
        ("Temperature", format!("{temperature:?}")),
        ("Top K", format!("{top_k:?}")),
        ("Top P", format!("{top_p:?}")),
        ("Typical P", format!("{typical_p:?}")),
        ("Repetition Penalty", format!("{repetition_penalty:?}")),
        ("Watermark", watermark.to_string()),
        ("Do Sample", do_sample.to_string()),
    ];

    let mut builder = Builder::default();
    builder.set_header(["Parameter", "Value"]);
    for (label, value) in &rows {
        builder.push_record([*label, value.as_str()]);
    }

    let mut table = builder.build();
    table.with(Style::markdown());
    table
 }
 /// Builds the latency summary table from the collected benchmark `data`.
 ///
 /// Rows are emitted per batch size for the prefill step, then the per-token
 /// decode step, then the total decode step. The table uses the markdown
 /// style with `Merge::vertical()` applied.
 pub(crate) fn latency_table(data: &Data) -> Table {
    let mut builder = Builder::default();
    builder.set_header([
        "Step",
        "Batch Size",
        "Average",
        "Lowest",
        "Highest",
        "p50",
        "p90",
        "p99",
    ]);

    // Section label paired with the per-batch samples it reports on.
    let sections = [
        ("Prefill", &data.prefill_latencies),
        ("Decode (token)", &data.decode_token_latencies),
        ("Decode (total)", &data.decode_latencies),
    ];
    for (step, samples) in sections {
        add_latencies(&mut builder, step, &data.batch_size, samples);
    }

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
 }
 /// Builds the throughput summary table from the collected benchmark `data`.
 ///
 /// Rows are emitted per batch size for the prefill step and then for the
 /// decode step. The table uses the markdown style with `Merge::vertical()`
 /// applied.
 pub(crate) fn throughput_table(data: &Data) -> Table {
    let mut builder = Builder::default();
    builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);

    // Section label paired with the per-batch samples it reports on.
    let sections = [
        ("Prefill", &data.prefill_throughputs),
        ("Decode", &data.decode_throughputs),
    ];
    for (step, samples) in sections {
        add_throuhgputs(&mut builder, step, &data.batch_size, samples);
    }

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
 }
 /// Appends one latency row per batch size to `builder`.
 ///
 /// Each row holds the `step` label, the batch size, and the average, lowest,
 /// highest, p50, p90 and p99 of that batch's samples, all formatted in `ms`.
 ///
 /// # Panics
 ///
 /// Panics if `batch_latencies` has fewer entries than `batch_size`.
 fn add_latencies(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_latencies: &[Vec<f64>],
 ) {
    for (idx, size) in batch_size.iter().enumerate() {
        // Indexing (rather than zipping) keeps the panic on mismatched lengths.
        let samples = &batch_latencies[idx];
        let (mean, lowest, highest) = avg_min_max(samples);

        let size_cell = size.to_string();
        let mean_cell = format_value(mean, "ms");
        let lowest_cell = format_value(lowest, "ms");
        let highest_cell = format_value(highest, "ms");
        let p50_cell = format_value(px(samples, 50), "ms");
        let p90_cell = format_value(px(samples, 90), "ms");
        let p99_cell = format_value(px(samples, 99), "ms");

        builder.push_record([
            step,
            size_cell.as_str(),
            mean_cell.as_str(),
            lowest_cell.as_str(),
            highest_cell.as_str(),
            p50_cell.as_str(),
            p90_cell.as_str(),
            p99_cell.as_str(),
        ]);
    }
 }
 /// Appends one throughput row per batch size to `builder`.
 ///
 /// Each row holds the `step` label, the batch size, and the average, lowest
 /// and highest of that batch's samples, all formatted in `tokens/secs`.
 ///
 /// # Panics
 ///
 /// Panics if `batch_throughputs` has fewer entries than `batch_size`.
 fn add_throuhgputs(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_throughputs: &[Vec<f64>],
 ) {
    for (idx, size) in batch_size.iter().enumerate() {
        // Indexing (rather than zipping) keeps the panic on mismatched lengths.
        let samples = &batch_throughputs[idx];
        let (mean, lowest, highest) = avg_min_max(samples);

        let size_cell = size.to_string();
        let mean_cell = format_value(mean, "tokens/secs");
        let lowest_cell = format_value(lowest, "tokens/secs");
        let highest_cell = format_value(highest, "tokens/secs");

        builder.push_record([
            step,
            size_cell.as_str(),
            mean_cell.as_str(),
            lowest_cell.as_str(),
            highest_cell.as_str(),
        ]);
    }
 }
 /// Returns `(average, minimum, maximum)` of `data`.
 ///
 /// Minimum and maximum are chosen with `f64::total_cmp`. For an empty input
 /// all three values are NaN (the average is a division by zero length).
 fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) {
    let count = data.len() as f64;
    let total: f64 = data.iter().sum();
    let lowest = data
        .iter()
        .copied()
        .min_by(f64::total_cmp)
        .unwrap_or(f64::NAN);
    let highest = data
        .iter()
        .copied()
        .max_by(f64::total_cmp)
        .unwrap_or(f64::NAN);
    (total / count, lowest, highest)
 }
 /// Returns the `p`-th percentile of `data`.
 ///
 /// The samples are ranked in ascending order (using `f64::total_cmp`) and
 /// the element at index `floor(p / 100 * len)` is returned. `data` itself is
 /// left untouched; a sorted copy is used.
 ///
 /// * An empty `data` yields NaN.
 /// * `p >= 100` yields the largest sample instead of indexing past the end.
 fn px(data: &[f64], p: u32) -> f64 {
    if data.is_empty() {
        return f64::NAN;
    }
    // Samples are recorded in run order, so they must be ranked before a
    // positional lookup means anything as a percentile.
    let mut sorted = data.to_vec();
    sorted.sort_unstable_by(f64::total_cmp);
    let rank = (f64::from(p) / 100.0 * sorted.len() as f64) as usize;
    // Clamp so that the 100th percentile maps onto the last element.
    sorted[rank.min(sorted.len() - 1)]
 }
 /// Renders `value` with two decimal places followed by a space and `unit`.
 fn format_value(value: f64, unit: &'static str) -> String {
    let mut rendered = format!("{value:.2}");
    rendered.push(' ');
    rendered.push_str(unit);
    rendered
 }