feat(benchmark): tui based benchmarking tool (#149)

2023-03-30 15:26:27 +02:00 · 2023-03-30 15:26:27 +02:00 · 610bb1f978
parent 55106ec476
commit 610bb1f978
20 changed files with 4133 additions and 7 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -5,6 +5,9 @@ members = [
    "router/grpc-metadata",
    "launcher"
 ]
+exclude = [
+    "benchmark"
+]

 [profile.release]
 debug = 1
--- a/3
+++ b/3
@ -7,6 +7,9 @@ install-router:
 install-launcher:
 	cd launcher && cargo install --path .

+install-benchmark:
+	cd benchmark && cargo install --path .
+
 install: install-server install-router install-launcher

 server-dev:
--- a/assets/benchmark.png
+++ b/assets/benchmark.png
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@ -0,0 +1 @@
+target
--- a/benchmark/Cargo.lock
+++ b/benchmark/Cargo.lock
--- a/benchmark/Cargo.toml
+++ b/benchmark/Cargo.toml
@ -0,0 +1,35 @@
+[package]
+name = "text-generation-benchmark"
+version = "0.1.0"
+edition = "2021"
+authors = ["Olivier Dehaene"]
+description = "Text Generation Benchmarking tool"
+
+[profile.release]
+debug = 1
+incremental = true
+lto = "off"
+panic = "abort"
+
+[lib]
+path = "src/lib.rs"
+
+[[bin]]
+name = "text-generation-benchmark"
+path = "src/main.rs"
+
+[dependencies]
+average = "0.13"
+clap = { version = "4.1.4", features = ["derive", "env"] }
+crossterm = "0.26"
+float-ord = "0.3.2"
+serde = {version = "1.0.142", features = ["derive"]}
+serde_json = "1.0"
+text-generation-client = { path = "../router/client" }
+thiserror = "1.0.38"
+tokenizers = "0.13.2"
+tokio = { version = "1.25.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tui = {package = "ratatui", version = "0.20", default-features = false, features = ["crossterm"]}
+tracing = "0.1.37"
+tracing-subscriber = { version = "0.3.16", features = ["json", "env-filter"] }
+
--- a/benchmark/README.md
+++ b/benchmark/README.md
@ -0,0 +1,30 @@
+<div align="center">
+
+# Text Generation Inference benchmarking tool
+
+![benchmark](../assets/benchmark.png)
+
+</div>
+
+A lightweight benchmarking tool based inspired by [oha](https://github.com/hatoo/oha) 
+and powered by [tui](https://github.com/tui-rs-revival/ratatui).
+
+## Install 
+
+```shell 
+make install-benchmark
+```
+
+## Run
+
+First, start `text-generation-inference`:
+
+```shell
+text-generation-launcher --model-id bigscience/bloom-560m
+```
+
+Then run the benchmarking tool:
+
+```shell
+text-generation-benchmark --tokenizer-name bigscience/bloom-560m
+```
--- a/benchmark/rust-toolchain.toml
+++ b/benchmark/rust-toolchain.toml
@ -0,0 +1,3 @@
+[toolchain]
+channel = "1.67.0"
+components = ["rustfmt", "clippy"]
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@ -0,0 +1,688 @@
+/// Inspired by https://github.com/hatoo/oha/blob/bb989ea3cd77727e7743e7daa60a19894bb5e901/src/monitor.rs
+use crate::generation::{Decode, Message, Prefill};
+use crossterm::event::{KeyCode, KeyEvent, KeyModifiers};
+use text_generation_client::ClientError;
+use tokio::sync::mpsc;
+use tui::backend::Backend;
+use tui::layout::{Alignment, Constraint, Direction, Layout};
+use tui::style::{Color, Modifier, Style};
+use tui::text::{Span, Spans};
+use tui::widgets::{
+    Axis, BarChart, Block, Borders, Chart, Dataset, Gauge, GraphType, Paragraph, Tabs,
+};
+use tui::{symbols, Frame};
+
+/// TUI powered App
+pub(crate) struct App {
+    pub(crate) running: bool,
+    completed_runs: Vec<usize>,
+    completed_batch: usize,
+    current_batch: usize,
+    current_tab: usize,
+    touched_tab: bool,
+    zoom: bool,
+    is_error: bool,
+    data: Data,
+    tokenizer_name: String,
+    sequence_length: u32,
+    decode_length: u32,
+    n_run: usize,
+    batch_size: Vec<u32>,
+    receiver: mpsc::Receiver<Result<Message, ClientError>>,
+}
+
+impl App {
+    pub(crate) fn new(
+        receiver: mpsc::Receiver<Result<Message, ClientError>>,
+        tokenizer_name: String,
+        sequence_length: u32,
+        decode_length: u32,
+        n_run: usize,
+        batch_size: Vec<u32>,
+    ) -> Self {
+        let data = Data::new(n_run, batch_size.len());
+        let current_tab = 0;
+
+        let completed_runs: Vec<usize> = (0..batch_size.len()).map(|_| 0).collect();
+        let completed_batch = 0;
+        let current_batch = 0;
+        let is_error = false;
+
+        Self {
+            running: true,
+            completed_runs,
+            completed_batch,
+            current_batch,
+            current_tab,
+            touched_tab: false,
+            zoom: false,
+            is_error,
+            data,
+            tokenizer_name,
+            sequence_length,
+            decode_length,
+            n_run,
+            batch_size,
+            receiver,
+        }
+    }
+
+    /// Handle crossterm key events
+    pub(crate) fn handle_key_event(&mut self, key_event: KeyEvent) {
+        match key_event {
+            // Increase and wrap tab
+            KeyEvent {
+                code: KeyCode::Right,
+                ..
+            }
+            | KeyEvent {
+                code: KeyCode::Tab, ..
+            } => {
+                self.touched_tab = true;
+                self.current_tab = (self.current_tab + 1) % self.batch_size.len();
+            }
+            // Decrease and wrap tab
+            KeyEvent {
+                code: KeyCode::Left,
+                ..
+            } => {
+                self.touched_tab = true;
+                if self.current_tab > 0 {
+                    self.current_tab -= 1;
+                } else {
+                    self.current_tab = self.batch_size.len() - 1;
+                }
+            }
+            // Zoom on throughput/latency fig
+            KeyEvent {
+                code: KeyCode::Char('+'),
+                ..
+            } => {
+                self.zoom = true;
+            }
+            // Unzoom on throughput/latency fig
+            KeyEvent {
+                code: KeyCode::Char('-'),
+                ..
+            } => {
+                self.zoom = false;
+            }
+            // Quit
+            KeyEvent {
+                code: KeyCode::Char('q'),
+                ..
+            }
+            | KeyEvent {
+                code: KeyCode::Char('c'),
+                modifiers: KeyModifiers::CONTROL,
+                ..
+            } => {
+                self.running = false;
+            }
+            _ => (),
+        }
+    }
+
+    /// Get all pending messages from generation task
+    pub(crate) fn tick(&mut self) {
+        while let Ok(message) = self.receiver.try_recv() {
+            match message {
+                Ok(message) => match message {
+                    Message::Prefill(step) => self.data.push_prefill(step, self.current_batch),
+                    Message::Decode(step) => self.data.push_decode(step, self.current_batch),
+                    Message::EndRun => {
+                        self.completed_runs[self.current_batch] += 1;
+                    }
+                    Message::EndBatch => {
+                        self.data.end_batch(self.current_batch);
+                        self.completed_batch += 1;
+
+                        if self.current_batch < self.batch_size.len() - 1 {
+                            // Only go to next tab if the user never touched the tab keys
+                            if !self.touched_tab {
+                                self.current_tab += 1;
+                            }
+
+                            self.current_batch += 1;
+                        }
+                    }
+                    Message::Warmup => {}
+                },
+                Err(_) => self.is_error = true,
+            }
+        }
+    }
+
+    /// Render frame
+    pub fn render<B: Backend>(&mut self, f: &mut Frame<'_, B>) {
+        let batch_progress =
+            (self.completed_batch as f64 / self.batch_size.len() as f64).clamp(0.0, 1.0);
+        let run_progress =
+            (self.completed_runs[self.current_batch] as f64 / self.n_run as f64).clamp(0.0, 1.0);
+
+        // Vertical layout
+        let row5 = Layout::default()
+            .direction(Direction::Vertical)
+            .constraints(
+                [
+                    Constraint::Length(1),
+                    Constraint::Length(3),
+                    Constraint::Length(3),
+                    Constraint::Length(13),
+                    Constraint::Min(10),
+                ]
+                .as_ref(),
+            )
+            .split(f.size());
+
+        // Top row horizontal layout
+        let top = Layout::default()
+            .direction(Direction::Horizontal)
+            .constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
+            .split(row5[2]);
+
+        // Mid row horizontal layout
+        let mid = Layout::default()
+            .direction(Direction::Horizontal)
+            .constraints(
+                [
+                    Constraint::Percentage(25),
+                    Constraint::Percentage(25),
+                    Constraint::Percentage(25),
+                    Constraint::Percentage(25),
+                ]
+                .as_ref(),
+            )
+            .split(row5[3]);
+
+        // Left mid row vertical layout
+        let prefill_text = Layout::default()
+            .direction(Direction::Vertical)
+            .constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
+            .split(mid[0]);
+
+        // Right mid row vertical layout
+        let decode_text = Layout::default()
+            .direction(Direction::Vertical)
+            .constraints([Constraint::Length(8), Constraint::Length(5)].as_ref())
+            .split(mid[2]);
+        let decode_text_latency = Layout::default()
+            .direction(Direction::Horizontal)
+            .constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
+            .split(decode_text[0]);
+
+        // Bottom row horizontal layout
+        let bottom = Layout::default()
+            .direction(Direction::Horizontal)
+            .constraints([Constraint::Percentage(50), Constraint::Percentage(50)].as_ref())
+            .split(row5[4]);
+
+        // Title
+        let title = Block::default()
+            .borders(Borders::NONE)
+            .title(format!(
+                "Model: {} | Sequence Length: {} | Decode Length: {}",
+                self.tokenizer_name, self.sequence_length, self.decode_length
+            ))
+            .style(
+                Style::default()
+                    .add_modifier(Modifier::BOLD)
+                    .fg(Color::White),
+            );
+        f.render_widget(title, row5[0]);
+
+        // Helper
+        let helper = Block::default()
+            .borders(Borders::NONE)
+            .title("<- | tab | ->: change batch tab | q / CTRL + c: quit | +/-: zoom")
+            .title_alignment(Alignment::Right)
+            .style(Style::default().fg(Color::White));
+        f.render_widget(helper, row5[0]);
+
+        // Batch tabs
+        let titles = self
+            .batch_size
+            .iter()
+            .map(|b| {
+                Spans::from(vec![Span::styled(
+                    format!("Batch: {b}"),
+                    Style::default().fg(Color::White),
+                )])
+            })
+            .collect();
+        let tabs = Tabs::new(titles)
+            .block(Block::default().borders(Borders::ALL).title("Tabs"))
+            .select(self.current_tab)
+            .style(Style::default().fg(Color::LightCyan))
+            .highlight_style(
+                Style::default()
+                    .add_modifier(Modifier::BOLD)
+                    .bg(Color::Black),
+            );
+        f.render_widget(tabs, row5[1]);
+
+        // Total progress bar
+        let color = if self.is_error {
+            Color::Red
+        } else {
+            Color::LightGreen
+        };
+        let batch_gauge = progress_gauge(
+            "Total Progress",
+            format!("{} / {}", self.completed_batch, self.batch_size.len()),
+            batch_progress,
+            color,
+        );
+        f.render_widget(batch_gauge, top[0]);
+
+        // Batch progress Bar
+        let color = if self.is_error {
+            Color::Red
+        } else {
+            Color::LightBlue
+        };
+        let run_gauge = progress_gauge(
+            "Batch Progress",
+            format!(
+                "{} / {}",
+                self.completed_runs[self.current_batch], self.n_run
+            ),
+            run_progress,
+            color,
+        );
+        f.render_widget(run_gauge, top[1]);
+
+        // Prefill text infos
+        let prefill_latency_block = latency_paragraph(
+            &mut self.data.prefill_latencies[self.current_tab],
+            "Prefill",
+        );
+        let prefill_throughput_block =
+            throughput_paragraph(&self.data.prefill_throughputs[self.current_tab], "Prefill");
+
+        f.render_widget(prefill_latency_block, prefill_text[0]);
+        f.render_widget(prefill_throughput_block, prefill_text[1]);
+
+        // Prefill latency histogram
+        let histo_width = 7;
+        let bins = if mid[1].width < 2 {
+            0
+        } else {
+            (mid[1].width as usize - 2) / (histo_width + 1)
+        }
+        .max(2);
+
+        let histo_data =
+            latency_histogram_data(&self.data.prefill_latencies[self.current_tab], bins);
+        let histo_data_str: Vec<(&str, u64)> =
+            histo_data.iter().map(|(l, v)| (l.as_str(), *v)).collect();
+        let prefill_histogram =
+            latency_histogram(&histo_data_str, "Prefill").bar_width(histo_width as u16);
+        f.render_widget(prefill_histogram, mid[1]);
+
+        // Decode text info
+        let decode_latency_block = latency_paragraph(
+            &mut self.data.decode_latencies[self.current_tab],
+            "Decode Total",
+        );
+        let decode_token_latency_block = latency_paragraph(
+            &mut self.data.decode_token_latencies[self.current_tab],
+            "Decode Token",
+        );
+        let decode_throughput_block =
+            throughput_paragraph(&self.data.decode_throughputs[self.current_tab], "Decode");
+        f.render_widget(decode_latency_block, decode_text_latency[0]);
+        f.render_widget(decode_token_latency_block, decode_text_latency[1]);
+        f.render_widget(decode_throughput_block, decode_text[1]);
+
+        // Decode latency histogram
+        let histo_data =
+            latency_histogram_data(&self.data.decode_latencies[self.current_tab], bins);
+        let histo_data_str: Vec<(&str, u64)> =
+            histo_data.iter().map(|(l, v)| (l.as_str(), *v)).collect();
+        let decode_histogram =
+            latency_histogram(&histo_data_str, "Decode").bar_width(histo_width as u16);
+        f.render_widget(decode_histogram, mid[3]);
+
+        // Prefill latency/throughput chart
+        let prefill_latency_throughput_chart = latency_throughput_chart(
+            &self.data.prefill_batch_latency_throughput,
+            &self.batch_size,
+            self.zoom,
+            "Prefill",
+        );
+        f.render_widget(prefill_latency_throughput_chart, bottom[0]);
+
+        // Decode latency/throughput chart
+        let decode_latency_throughput_chart = latency_throughput_chart(
+            &self.data.decode_batch_latency_throughput,
+            &self.batch_size,
+            self.zoom,
+            "Decode",
+        );
+        f.render_widget(decode_latency_throughput_chart, bottom[1]);
+    }
+}
+
+/// App internal data struct
+struct Data {
+    prefill_latencies: Vec<Vec<f64>>,
+    prefill_throughputs: Vec<Vec<f64>>,
+    decode_latencies: Vec<Vec<f64>>,
+    decode_token_latencies: Vec<Vec<f64>>,
+    decode_throughputs: Vec<Vec<f64>>,
+    prefill_batch_latency_throughput: Vec<(f64, f64)>,
+    decode_batch_latency_throughput: Vec<(f64, f64)>,
+}
+
+impl Data {
+    fn new(n_run: usize, n_batch: usize) -> Self {
+        let prefill_latencies: Vec<Vec<f64>> =
+            (0..n_batch).map(|_| Vec::with_capacity(n_run)).collect();
+        let prefill_throughputs: Vec<Vec<f64>> = prefill_latencies.clone();
+
+        let decode_latencies: Vec<Vec<f64>> = prefill_latencies.clone();
+        let decode_token_latencies: Vec<Vec<f64>> = decode_latencies.clone();
+        let decode_throughputs: Vec<Vec<f64>> = prefill_throughputs.clone();
+
+        let prefill_batch_latency_throughput: Vec<(f64, f64)> = Vec::with_capacity(n_batch);
+        let decode_batch_latency_throughput: Vec<(f64, f64)> =
+            prefill_batch_latency_throughput.clone();
+
+        Self {
+            prefill_latencies,
+            prefill_throughputs,
+            decode_latencies,
+            decode_token_latencies,
+            decode_throughputs,
+            prefill_batch_latency_throughput,
+            decode_batch_latency_throughput,
+        }
+    }
+
+    fn push_prefill(&mut self, prefill: Prefill, batch_idx: usize) {
+        let latency = prefill.latency.as_millis() as f64;
+        self.prefill_latencies[batch_idx].push(latency);
+        self.prefill_throughputs[batch_idx].push(prefill.throughput);
+    }
+
+    fn push_decode(&mut self, decode: Decode, batch_idx: usize) {
+        let latency = decode.latency.as_millis() as f64;
+        let token_latency = decode.token_latency.as_millis() as f64;
+        self.decode_latencies[batch_idx].push(latency);
+        self.decode_token_latencies[batch_idx].push(token_latency);
+        self.decode_throughputs[batch_idx].push(decode.throughput);
+    }
+
+    fn end_batch(&mut self, batch_idx: usize) {
+        self.prefill_batch_latency_throughput.push((
+            self.prefill_latencies[batch_idx].iter().sum::<f64>()
+                / self.prefill_latencies[batch_idx].len() as f64,
+            self.prefill_throughputs[batch_idx].iter().sum::<f64>()
+                / self.prefill_throughputs[batch_idx].len() as f64,
+        ));
+        self.decode_batch_latency_throughput.push((
+            self.decode_latencies[batch_idx].iter().sum::<f64>()
+                / self.decode_latencies[batch_idx].len() as f64,
+            self.decode_throughputs[batch_idx].iter().sum::<f64>()
+                / self.decode_throughputs[batch_idx].len() as f64,
+        ));
+    }
+}
+
+/// Progress bar
+fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Gauge {
+    Gauge::default()
+        .block(Block::default().title(title).borders(Borders::ALL))
+        .gauge_style(Style::default().fg(color))
+        .label(Span::raw(label))
+        .ratio(progress)
+}
+
+/// Throughput paragraph
+fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> {
+    // Throughput average/high/low texts
+    let throughput_texts = statis_spans(throughput, "tokens/secs");
+
+    // Throughput block
+    Paragraph::new(throughput_texts).block(
+        Block::default()
+            .title(Span::raw(format!("{name} Throughput")))
+            .borders(Borders::ALL),
+    )
+}
+
+/// Latency paragraph
+fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> {
+    // Latency average/high/low texts
+    let mut latency_texts = statis_spans(latency, "ms");
+
+    // Sort latency for percentiles
+    float_ord::sort(latency);
+    let latency_percentiles = crate::utils::percentiles(latency, &[50, 90, 99]);
+
+    // Latency p50/p90/p99 texts
+    let colors = vec![Color::LightGreen, Color::LightYellow, Color::LightRed];
+    for (i, (name, value)) in latency_percentiles.iter().enumerate() {
+        let span = Spans::from(vec![Span::styled(
+            format!("{name}:     {value:.2} ms"),
+            Style::default().fg(colors[i]),
+        )]);
+        latency_texts.push(span);
+    }
+
+    Paragraph::new(latency_texts).block(
+        Block::default()
+            .title(Span::raw(format!("{name} Latency")))
+            .borders(Borders::ALL),
+    )
+}
+
+/// Average/High/Low spans
+fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Spans<'a>> {
+    vec![
+        Spans::from(vec![Span::styled(
+            format!(
+                "Average: {:.2} {unit}",
+                data.iter().sum::<f64>() / data.len() as f64
+            ),
+            Style::default().fg(Color::LightBlue),
+        )]),
+        Spans::from(vec![Span::styled(
+            format!(
+                "Lowest:  {:.2} {unit}",
+                data.iter()
+                    .min_by(|a, b| a.total_cmp(b))
+                    .unwrap_or(&std::f64::NAN)
+            ),
+            Style::default().fg(Color::Reset),
+        )]),
+        Spans::from(vec![Span::styled(
+            format!(
+                "Highest: {:.2} {unit}",
+                data.iter()
+                    .max_by(|a, b| a.total_cmp(b))
+                    .unwrap_or(&std::f64::NAN)
+            ),
+            Style::default().fg(Color::Reset),
+        )]),
+    ]
+}
+
+/// Latency histogram data
+fn latency_histogram_data(latency: &[f64], bins: usize) -> Vec<(String, u64)> {
+    let histo_data: Vec<(String, u64)> = {
+        let histo = crate::utils::histogram(latency, bins);
+        histo
+            .into_iter()
+            .map(|(label, v)| (format!("{label:.2}"), v as u64))
+            .collect()
+    };
+
+    histo_data
+}
+
+/// Latency Histogram
+fn latency_histogram<'a>(
+    histo_data_str: &'a Vec<(&'a str, u64)>,
+    name: &'static str,
+) -> BarChart<'a> {
+    BarChart::default()
+        .block(
+            Block::default()
+                .title(format!("{name} latency histogram"))
+                .style(Style::default().fg(Color::LightYellow).bg(Color::Reset))
+                .borders(Borders::ALL),
+        )
+        .data(histo_data_str.as_slice())
+}
+
+/// Latency/Throughput chart
+fn latency_throughput_chart<'a>(
+    latency_throughput: &'a Vec<(f64, f64)>,
+    batch_sizes: &'a [u32],
+    zoom: bool,
+    name: &'static str,
+) -> Chart<'a> {
+    let latency_iter = latency_throughput.iter().map(|(l, _)| l);
+    let throughput_iter = latency_throughput.iter().map(|(_, t)| t);
+
+    // Get extreme values
+    let min_latency: f64 = *latency_iter
+        .clone()
+        .min_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    let max_latency: f64 = *latency_iter
+        .max_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    let min_throughput: f64 = *throughput_iter
+        .clone()
+        .min_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+    let max_throughput: f64 = *throughput_iter
+        .max_by(|a, b| a.total_cmp(b))
+        .unwrap_or(&std::f64::NAN);
+
+    // Char min max values
+    let min_x = if zoom {
+        ((min_latency - 0.05 * min_latency) / 100.0).floor() * 100.0
+    } else {
+        0.0
+    };
+    let max_x = ((max_latency + 0.05 * max_latency) / 100.0).ceil() * 100.0;
+    let step_x = (max_x - min_x) / 4.0;
+
+    // Chart min max values
+    let min_y = if zoom {
+        ((min_throughput - 0.05 * min_throughput) / 100.0).floor() * 100.0
+    } else {
+        0.0
+    };
+    let max_y = ((max_throughput + 0.05 * max_throughput) / 100.0).ceil() * 100.0;
+    let step_y = (max_y - min_y) / 4.0;
+
+    // Labels
+    let mut x_labels = vec![Span::styled(
+        format!("{min_x:.2}"),
+        Style::default()
+            .add_modifier(Modifier::BOLD)
+            .fg(Color::Gray)
+            .bg(Color::Reset),
+    )];
+    for i in 0..3 {
+        x_labels.push(Span::styled(
+            format!("{:.2}", min_x + ((i + 1) as f64 * step_x)),
+            Style::default().fg(Color::Gray).bg(Color::Reset),
+        ));
+    }
+    x_labels.push(Span::styled(
+        format!("{max_x:.2}"),
+        Style::default()
+            .add_modifier(Modifier::BOLD)
+            .fg(Color::Gray)
+            .bg(Color::Reset),
+    ));
+
+    // Labels
+    let mut y_labels = vec![Span::styled(
+        format!("{min_y:.2}"),
+        Style::default()
+            .add_modifier(Modifier::BOLD)
+            .fg(Color::Gray)
+            .bg(Color::Reset),
+    )];
+    for i in 0..3 {
+        y_labels.push(Span::styled(
+            format!("{:.2}", min_y + ((i + 1) as f64 * step_y)),
+            Style::default().fg(Color::Gray).bg(Color::Reset),
+        ));
+    }
+    y_labels.push(Span::styled(
+        format!("{max_y:.2}"),
+        Style::default()
+            .add_modifier(Modifier::BOLD)
+            .fg(Color::Gray)
+            .bg(Color::Reset),
+    ));
+
+    // Chart dataset
+    let colors = color_vec();
+    let datasets: Vec<Dataset> = (0..latency_throughput.len())
+        .map(|i| {
+            let color_idx = i % colors.len();
+
+            Dataset::default()
+                .name(batch_sizes[i].to_string())
+                .marker(symbols::Marker::Block)
+                .style(Style::default().fg(colors[color_idx]))
+                .graph_type(GraphType::Scatter)
+                .data(&latency_throughput[i..(i + 1)])
+        })
+        .collect();
+
+    // Chart
+    Chart::new(datasets)
+        .style(Style::default().fg(Color::Cyan).bg(Color::Reset))
+        .block(
+            Block::default()
+                .title(Span::styled(
+                    format!("{name} throughput over latency"),
+                    Style::default().fg(Color::Gray).bg(Color::Reset),
+                ))
+                .borders(Borders::ALL),
+        )
+        .x_axis(
+            Axis::default()
+                .title("ms")
+                .style(Style::default().fg(Color::Gray).bg(Color::Reset))
+                .labels(x_labels)
+                .bounds([min_x, max_x]),
+        )
+        .y_axis(
+            Axis::default()
+                .title("tokens/secs")
+                .style(Style::default().fg(Color::Gray).bg(Color::Reset))
+                .labels(y_labels)
+                .bounds([min_y, max_y]),
+        )
+}
+
+// Colors for latency/throughput chart
+fn color_vec() -> Vec<Color> {
+    vec![
+        Color::Red,
+        Color::Green,
+        Color::Yellow,
+        Color::Blue,
+        Color::Magenta,
+        Color::Cyan,
+        Color::Gray,
+        Color::DarkGray,
+        Color::LightRed,
+        Color::LightGreen,
+        Color::LightYellow,
+        Color::LightBlue,
+        Color::LightMagenta,
+        Color::LightCyan,
+    ]
+}
--- a/benchmark/src/event.rs
+++ b/benchmark/src/event.rs
@ -0,0 +1,65 @@
+/// Inspired by https://github.com/orhun/rust-tui-template/blob/472aa515119d4c94903eac12d9784417281dc7f5/src/event.rs
+use crossterm::event;
+use std::time::{Duration, Instant};
+use tokio::sync::{broadcast, mpsc};
+
+/// Events
+#[derive(Debug)]
+pub(crate) enum Event {
+    /// Terminal tick.
+    Tick,
+    /// Key press.
+    Key(event::KeyEvent),
+    /// Terminal resize.
+    Resize(u16, u16),
+}
+
+pub(crate) async fn terminal_event_task(
+    fps: u32,
+    event_sender: mpsc::Sender<Event>,
+    mut shutdown_receiver: broadcast::Receiver<()>,
+    _shutdown_guard_sender: mpsc::Sender<()>,
+) {
+    // End task if a message is received on shutdown_receiver
+    // _shutdown_guard_sender will be dropped once the task is finished
+    tokio::select! {
+        _ = event_loop(fps, event_sender)  => {
+        },
+        _ = shutdown_receiver.recv() => {}
+    }
+}
+
+/// Main event loop
+async fn event_loop(fps: u32, event_sender: mpsc::Sender<Event>) {
+    // Frame budget
+    let per_frame = Duration::from_secs(1) / fps;
+
+    // When was last frame executed
+    let mut last_frame = Instant::now();
+
+    loop {
+        // Sleep to avoid blocking the thread for too long
+        if let Some(sleep) = per_frame.checked_sub(last_frame.elapsed()) {
+            tokio::time::sleep(sleep).await;
+        }
+
+        // Get crossterm event and send a new one over the channel
+        if event::poll(Duration::from_secs(0)).expect("no events available") {
+            match event::read().expect("unable to read event") {
+                event::Event::Key(e) => event_sender.send(Event::Key(e)).await.unwrap_or(()),
+                event::Event::Resize(w, h) => {
+                    event_sender.send(Event::Resize(w, h)).await.unwrap_or(())
+                }
+                _ => (),
+            }
+        }
+
+        // Frame budget exceeded
+        if last_frame.elapsed() >= per_frame {
+            // Send tick
+            event_sender.send(Event::Tick).await.unwrap_or(());
+            // Rest last_frame time
+            last_frame = Instant::now();
+        }
+    }
+}
--- a/benchmark/src/generation.rs
+++ b/benchmark/src/generation.rs
@ -0,0 +1,211 @@
+use std::time::{Duration, Instant};
+use text_generation_client::{
+    Batch, ClientError, NextTokenChooserParameters, Request, ShardedClient,
+    StoppingCriteriaParameters,
+};
+use tokenizers::{Tokenizer, TruncationDirection};
+use tokio::sync::{broadcast, mpsc};
+
+const LOREM_IPSUM: &str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
+
+#[derive(Debug, Clone)]
+pub(crate) struct Prefill {
+    pub(crate) latency: Duration,
+    pub(crate) throughput: f64,
+}
+
+#[derive(Debug, Clone)]
+pub(crate) struct Decode {
+    pub(crate) latency: Duration,
+    pub(crate) token_latency: Duration,
+    pub(crate) throughput: f64,
+}
+
+#[derive(Debug)]
+pub(crate) enum Message {
+    Warmup,
+    Prefill(Prefill),
+    Decode(Decode),
+    EndRun,
+    EndBatch,
+}
+
+/// Benchmarking task
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn generation_task(
+    tokenizer: Tokenizer,
+    batch_size: Vec<u32>,
+    sequence_length: u32,
+    decode_length: u32,
+    n_runs: usize,
+    warmups: usize,
+    client: ShardedClient,
+    run_sender: mpsc::Sender<Result<Message, ClientError>>,
+    mut shutdown_receiver: broadcast::Receiver<()>,
+    _shutdown_guard_sender: mpsc::Sender<()>,
+) {
+    // End task if a message is received on shutdown_receiver
+    // _shutdown_guard_sender will be dropped once the task is finished
+    tokio::select! {
+        res = generate_runs(tokenizer, batch_size, sequence_length, decode_length, n_runs, warmups, client, run_sender.clone())  => {
+            if let Err(err) = res {
+                run_sender.send(Err(err)).await.unwrap_or(());
+            }
+        },
+        _ = shutdown_receiver.recv() => {}
+    }
+}
+
+/// Benchmark prefill/decode
+#[allow(clippy::too_many_arguments)]
+async fn generate_runs(
+    tokenizer: Tokenizer,
+    batch_size: Vec<u32>,
+    sequence_length: u32,
+    decode_length: u32,
+    n_runs: usize,
+    warmups: usize,
+    mut client: ShardedClient,
+    run_sender: mpsc::Sender<Result<Message, ClientError>>,
+) -> Result<(), ClientError> {
+    // Create a dummy sequence
+    let sequence = create_sequence(sequence_length, tokenizer);
+
+    for b in batch_size {
+        // Warmups on batch size
+        for _ in 0..warmups {
+            let (_, decode_batch) =
+                prefill(sequence.clone(), b, decode_length, &mut client).await?;
+            let _ = decode(decode_batch, &mut client).await?;
+            // Send warmup message
+            run_sender.send(Ok(Message::Warmup)).await.unwrap_or(());
+        }
+
+        for _ in 0..n_runs {
+            let (prefill, decode_batch) =
+                prefill(sequence.clone(), b, decode_length, &mut client).await?;
+            // Send prefill message
+            run_sender
+                .send(Ok(Message::Prefill(prefill)))
+                .await
+                .unwrap_or(());
+
+            let decode = decode(decode_batch, &mut client).await?;
+
+            // Send decode message
+            run_sender
+                .send(Ok(Message::Decode(decode)))
+                .await
+                .unwrap_or(());
+
+            // Send run ended message
+            run_sender.send(Ok(Message::EndRun)).await.unwrap_or(());
+        }
+        // Batch ended
+        run_sender.send(Ok(Message::EndBatch)).await.unwrap_or(());
+    }
+    Ok(())
+}
+
+// Run a prefill step
+async fn prefill(
+    sequence: String,
+    batch_size: u32,
+    decode_length: u32,
+    client: &mut ShardedClient,
+) -> Result<(Prefill, Batch), ClientError> {
+    // Create requests
+    let requests = (0..batch_size)
+        .map(|id| Request {
+            id: id.into(),
+            inputs: sequence.clone(),
+            parameters: Some(NextTokenChooserParameters {
+                temperature: 1.0,
+                top_k: 0,
+                top_p: 1.0,
+                typical_p: 1.0,
+                do_sample: false,
+                seed: 0,
+                repetition_penalty: 1.0,
+                watermark: false,
+            }),
+            stopping_parameters: Some(StoppingCriteriaParameters {
+                max_new_tokens: decode_length,
+                stop_sequences: vec![],
+                ignore_eos_token: true, // Will not stop even if a eos token is generated
+            }),
+        })
+        .collect();
+
+    let batch = Batch {
+        id: 0,
+        requests,
+        size: batch_size,
+    };
+
+    // Run prefill
+    let start_time = Instant::now();
+    let (_, decode_batch) = client.prefill(batch.clone()).await?;
+
+    // Get latency
+    let latency = start_time.elapsed();
+
+    // Compute throughput from latency and batch size
+    let throughput = batch_size as f64 / latency.as_secs_f64();
+
+    // Decode batch cannot be empty
+    let decode_batch = decode_batch.expect("decode_batch is None. This is a bug.");
+
+    let step = Prefill {
+        latency,
+        throughput,
+    };
+
+    Ok((step, decode_batch))
+}
+
+/// Run a full decode
+async fn decode(batch: Batch, client: &mut ShardedClient) -> Result<Decode, ClientError> {
+    let mut decode_length = 0;
+    let batch_size = batch.size;
+
+    let start_time = Instant::now();
+
+    // Full decode over decode length
+    let mut next_batch = Some(batch);
+    while let Some(batch) = next_batch {
+        let result = client.decode(vec![batch]).await?;
+        next_batch = result.1;
+        decode_length += 1;
+    }
+
+    // Get latency
+    let latency = start_time.elapsed();
+    let token_latency = latency / decode_length;
+
+    // Compute throughput from latency, batch size and decode length
+    let throughput = (batch_size * decode_length) as f64 / latency.as_secs_f64();
+
+    let step = Decode {
+        latency,
+        token_latency,
+        throughput,
+    };
+    Ok(step)
+}
+
+/// Create a dummy sequence of the correct length
+fn create_sequence(sequence_length: u32, tokenizer: Tokenizer) -> String {
+    let lorem_ipsum_length = tokenizer.encode(LOREM_IPSUM, true).unwrap().len();
+    // Repeat lorem ipsum to cover sequence length
+    let string_sequence =
+        LOREM_IPSUM.repeat((0..sequence_length).step_by(lorem_ipsum_length).len());
+    // Encode sequence
+    let mut encoding = tokenizer.encode(string_sequence, true).unwrap();
+    // Truncate to sequence_length
+    encoding.truncate(sequence_length as usize, 0, TruncationDirection::Left);
+    // Decode
+    tokenizer
+        .decode(Vec::from(encoding.get_ids()), false)
+        .unwrap()
+}
--- a/benchmark/src/lib.rs
+++ b/benchmark/src/lib.rs
@ -0,0 +1,110 @@
+mod app;
+mod event;
+mod generation;
+mod utils;
+
+use crate::app::App;
+use crate::event::Event;
+use crossterm::ExecutableCommand;
+use std::io;
+use text_generation_client::ShardedClient;
+use tokenizers::Tokenizer;
+use tokio::sync::{broadcast, mpsc};
+use tui::backend::CrosstermBackend;
+use tui::Terminal;
+
+/// Run benchmarking app
+#[allow(clippy::too_many_arguments)]
+pub async fn run(
+    tokenizer_name: String,
+    tokenizer: Tokenizer,
+    batch_size: Vec<u32>,
+    sequence_length: u32,
+    decode_length: u32,
+    n_runs: usize,
+    warmups: usize,
+    client: ShardedClient,
+) -> Result<(), crossterm::ErrorKind> {
+    // Initialize terminal properties
+    crossterm::terminal::enable_raw_mode()?;
+    io::stdout().execute(crossterm::terminal::EnterAlternateScreen)?;
+    io::stdout().execute(crossterm::cursor::Hide)?;
+
+    // Initialize terminal
+    let mut terminal = {
+        let backend = CrosstermBackend::new(io::stdout());
+        Terminal::new(backend)?
+    };
+
+    // Create message channel between generation_task and app
+    let (run_sender, run_receiver) = mpsc::channel(8);
+    // Crossterm event channel
+    let (event_sender, mut event_receiver) = mpsc::channel(8);
+    // Shutdown channel to terminate tasks
+    let (shutdown_sender, _) = broadcast::channel(1);
+    // Channel to check if tasks terminated
+    let (shutdown_guard_sender, mut shutdown_guard_receiver) = mpsc::channel(1);
+
+    // Create generation task
+    tokio::spawn(generation::generation_task(
+        tokenizer,
+        batch_size.clone(),
+        sequence_length,
+        decode_length,
+        n_runs,
+        warmups,
+        client,
+        run_sender,
+        shutdown_sender.subscribe(),
+        shutdown_guard_sender.clone(),
+    ));
+
+    // Create event task
+    tokio::spawn(event::terminal_event_task(
+        250,
+        event_sender,
+        shutdown_sender.subscribe(),
+        shutdown_guard_sender.clone(),
+    ));
+
+    // Drop our end of shutdown sender
+    drop(shutdown_guard_sender);
+
+    // Create App
+    let mut app = App::new(
+        run_receiver,
+        tokenizer_name,
+        sequence_length,
+        decode_length,
+        n_runs,
+        batch_size,
+    );
+
+    while app.running {
+        // Draw frame
+        terminal.draw(|frame| app.render(frame))?;
+
+        // Await a new event from event handling task
+        match event_receiver.recv().await {
+            None => break,
+            // Update app state
+            Some(event) => match event {
+                Event::Tick => app.tick(),
+                Event::Key(key_event) => app.handle_key_event(key_event),
+                _ => {}
+            },
+        }
+    }
+
+    // Ask tasks to shutdown
+    let _ = shutdown_sender.send(());
+    // Wait for tasks to shutdown
+    let _ = shutdown_guard_receiver.recv().await;
+
+    // Revert terminal to original view
+    io::stdout().execute(crossterm::terminal::LeaveAlternateScreen)?;
+    crossterm::terminal::disable_raw_mode()?;
+    io::stdout().execute(crossterm::cursor::Show)?;
+
+    Ok(())
+}
--- a/benchmark/src/main.rs
+++ b/benchmark/src/main.rs
@ -0,0 +1,119 @@
+/// Text Generation Inference benchmarking tool
+///
+/// Inspired by the great Oha app: https://github.com/hatoo/oha
+/// and: https://github.com/orhun/rust-tui-template
+use clap::Parser;
+use std::path::Path;
+use text_generation_client::ShardedClient;
+use tokenizers::Tokenizer;
+use tracing_subscriber::layer::SubscriberExt;
+use tracing_subscriber::util::SubscriberInitExt;
+use tracing_subscriber::EnvFilter;
+
+/// App Configuration
+#[derive(Parser, Debug)]
+#[clap(author, version, about, long_about = None)]
+struct Args {
+    #[clap(short, long, env)]
+    tokenizer_name: String,
+    #[clap(short, long)]
+    batch_size: Option<Vec<u32>>,
+    #[clap(default_value = "10", short, long, env)]
+    sequence_length: u32,
+    #[clap(default_value = "8", short, long, env)]
+    decode_length: u32,
+    #[clap(default_value = "10", short, long, env)]
+    runs: usize,
+    #[clap(default_value = "1", short, long, env)]
+    warmups: usize,
+    #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
+    master_shard_uds_path: String,
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Get args
+    let args = Args::parse();
+    // Pattern match configuration
+    let Args {
+        tokenizer_name,
+        batch_size,
+        sequence_length,
+        decode_length,
+        runs,
+        warmups,
+        master_shard_uds_path,
+    } = args;
+
+    let batch_size = batch_size.unwrap_or(vec![1, 2, 4, 8, 16, 32]);
+
+    init_logging();
+
+    // Tokenizer instance
+    // This will only be used to validate payloads
+    tracing::info!("Loading tokenizer");
+    let local_path = Path::new(&tokenizer_name);
+    let tokenizer =
+        if local_path.exists() && local_path.is_dir() && local_path.join("tokenizer.json").exists()
+        {
+            // Load local tokenizer
+            tracing::info!("Found local tokenizer");
+            Tokenizer::from_file(local_path.join("tokenizer.json")).unwrap()
+        } else {
+            // Download and instantiate tokenizer
+            // We need to download it outside of the Tokio runtime
+            tracing::info!("Downloading tokenizer");
+            Tokenizer::from_pretrained(tokenizer_name.clone(), None).unwrap()
+        };
+    tracing::info!("Tokenizer loaded");
+
+    // Launch Tokio runtime
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap()
+        .block_on(async {
+            // Instantiate sharded client from the master unix socket
+            tracing::info!("Connect to model server");
+            let mut sharded_client = ShardedClient::connect_uds(master_shard_uds_path)
+                .await
+                .expect("Could not connect to server");
+            // Clear the cache; useful if the webserver rebooted
+            sharded_client
+                .clear_cache(None)
+                .await
+                .expect("Unable to clear cache");
+            tracing::info!("Connected");
+
+            // Run app
+            text_generation_benchmark::run(
+                tokenizer_name,
+                tokenizer,
+                batch_size,
+                sequence_length,
+                decode_length,
+                runs,
+                warmups,
+                sharded_client,
+            )
+            .await
+            .unwrap();
+        });
+    Ok(())
+}
+
+/// Init logging using LOG_LEVEL
+fn init_logging() {
+    // STDOUT/STDERR layer
+    let fmt_layer = tracing_subscriber::fmt::layer()
+        .with_file(true)
+        .with_line_number(true);
+
+    // Filter events with LOG_LEVEL
+    let env_filter =
+        EnvFilter::try_from_env("LOG_LEVEL").unwrap_or_else(|_| EnvFilter::new("info"));
+
+    tracing_subscriber::registry()
+        .with(env_filter)
+        .with(fmt_layer)
+        .init();
+}
--- a/benchmark/src/utils.rs
+++ b/benchmark/src/utils.rs
@ -0,0 +1,43 @@
+/// MIT License
+//
+// Copyright (c) 2020 hatoo
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+use std::collections::BTreeMap;
+
+pub(crate) fn histogram(values: &[f64], bins: usize) -> Vec<(f64, usize)> {
+    assert!(bins >= 2);
+    let mut bucket: Vec<usize> = vec![0; bins];
+    let min = values.iter().collect::<average::Min>().min();
+    let max = values.iter().collect::<average::Max>().max();
+    let step = (max - min) / (bins - 1) as f64;
+
+    for &v in values {
+        let i = std::cmp::min(((v - min) / step).ceil() as usize, bins - 1);
+        bucket[i] += 1;
+    }
+
+    bucket
+        .into_iter()
+        .enumerate()
+        .map(|(i, v)| (min + step * i as f64, v))
+        .collect()
+}
+
+pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f64> {
+    pecents
+        .iter()
+        .map(|&p| {
+            let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
+            (format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
+        })
+        .collect()
+}
--- a/proto/generate.proto
+++ b/proto/generate.proto
@ -53,6 +53,9 @@ message StoppingCriteriaParameters {
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences
    repeated string stop_sequences = 2;
+    /// Ignore end of sequence token
+    /// used for benchmarking
+    bool ignore_eos_token = 3;
 }

 message Request {
--- a/router/src/main.rs
+++ b/router/src/main.rs
@ -37,7 +37,7 @@ struct Args {
    max_waiting_tokens: usize,
    #[clap(default_value = "3000", long, short, env)]
    port: u16,
-    #[clap(default_value = "/tmp/text-generation-0", long, env)]
+    #[clap(default_value = "/tmp/text-generation-server-0", long, env)]
    master_shard_uds_path: String,
    #[clap(default_value = "bigscience/bloom", long, env)]
    tokenizer_name: String,
@ -76,6 +76,8 @@ fn main() -> Result<(), std::io::Error> {
        panic!("validation_workers must be > 0");
    }

+    init_logging(otlp_endpoint, json_output);
+
    // CORS allowed origins
    // map to go inside the option and then map to parse from String to HeaderValue
    // Finally, convert to AllowOrigin
@ -89,17 +91,21 @@ fn main() -> Result<(), std::io::Error> {

    // Tokenizer instance
    // This will only be used to validate payloads
+    tracing::info!("Loading tokenizer");
    let local_path = Path::new(&tokenizer_name);
    let tokenizer =
        if local_path.exists() && local_path.is_dir() && local_path.join("tokenizer.json").exists()
        {
            // Load local tokenizer
+            tracing::info!("Found local tokenizer");
            Tokenizer::from_file(local_path.join("tokenizer.json")).unwrap()
        } else {
            // Download and instantiate tokenizer
            // We need to download it outside of the Tokio runtime
+            tracing::info!("Downloading tokenizer");
            Tokenizer::from_pretrained(tokenizer_name.clone(), None).unwrap()
        };
+    tracing::info!("Tokenizer loaded");

    // Launch Tokio runtime
    tokio::runtime::Builder::new_multi_thread()
@ -107,8 +113,6 @@ fn main() -> Result<(), std::io::Error> {
        .build()
        .unwrap()
        .block_on(async {
-            init_logging(otlp_endpoint, json_output);
-
            // Get pipeline tag
            let model_info = reqwest::get(format!(
                "https://huggingface.co/api/models/{tokenizer_name}"
--- a/router/src/queue.rs
+++ b/router/src/queue.rs
@ -237,6 +237,7 @@ mod tests {
                    watermark: false,
                },
                stopping_parameters: StoppingCriteriaParameters {
+                    ignore_eos_token: false,
                    max_new_tokens: 0,
                    stop_sequences: vec![],
                },
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -315,6 +315,7 @@ fn validate(
    let stopping_parameters = StoppingCriteriaParameters {
        max_new_tokens,
        stop_sequences,
+        ignore_eos_token: false,
    };

    metrics::histogram!("tgi_request_input_length", input_length as f64);
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@ -18,7 +18,7 @@ def serve(
    revision: Optional[str] = None,
    sharded: bool = False,
    quantize: bool = False,
-    uds_path: Path = "/tmp/text-generation",
+    uds_path: Path = "/tmp/text-generation-server",
    logger_level: str = "INFO",
    json_output: bool = False,
    otlp_endpoint: Optional[str] = None,
--- a/server/text_generation_server/utils/tokens.py
+++ b/server/text_generation_server/utils/tokens.py
@ -123,20 +123,22 @@ class StoppingCriteria:
        self,
        eos_token_id: int,
        stop_sequence_criterias: List[StopSequenceCriteria],
-        max_new_tokens=20,
+        max_new_tokens: int = 20,
+        ignore_eos_token: bool = False,
    ):
        self.eos_token_id = eos_token_id
        self.stop_sequence_criterias = stop_sequence_criterias
        self.max_new_tokens = max_new_tokens
        self.current_tokens = 0
        self.current_output = ""
+        self.ignore_eos_token = ignore_eos_token

    def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]:
        self.current_tokens += 1
        if self.current_tokens >= self.max_new_tokens:
            return True, FinishReason.FINISH_REASON_LENGTH

-        if last_token == self.eos_token_id:
+        if not self.ignore_eos_token and last_token == self.eos_token_id:
            return True, FinishReason.FINISH_REASON_EOS_TOKEN

        self.current_output += last_output
@ -156,5 +158,8 @@ class StoppingCriteria:
            StopSequenceCriteria(sequence) for sequence in pb.stop_sequences
        ]
        return StoppingCriteria(
-            tokenizer.eos_token_id, stop_sequence_criterias, pb.max_new_tokens
+            tokenizer.eos_token_id,
+            stop_sequence_criterias,
+            pb.max_new_tokens,
+            pb.ignore_eos_token,
        )