Add a new `/tokenize` route to get the tokenized input (#1471)

# What does this PR do?

Tokenization is ideally done client side, but this is a recurring request, so we implemented it as a server route.

- Runs only if the Rust tokenizer is present (it is important not to encumber the main inference pipeline).
- Returns simple results: the token ID, its text (extracted from the original string using the offsets), and the offsets themselves (so users can do things like highlighting text).

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
2024-01-25 14:19:03 +01:00 · 2024-01-25 14:19:03 +01:00 · 86c8335f1b
parent 7872b8c55b
commit 86c8335f1b
5 changed files with 115 additions and 903 deletions
--- a/docs/openapi.json
+++ b/docs/openapi.json
--- a/router/src/infer.rs
+++ b/router/src/infer.rs
@ -165,6 +165,28 @@ impl Infer {
        ))
    }
    /// Tokenize the inputs of a generate request.
    ///
    /// Forwards `request.inputs` and `request.parameters.truncate` to the
    /// validation layer and keeps only the resulting encoding. Yields
    /// `Ok(None)` when the validation layer produces no encoding.
    ///
    /// # Errors
    ///
    /// Any error returned by the validation layer is logged and then
    /// converted into an [`InferError`].
    #[instrument(skip_all)]
    pub(crate) async fn tokenize(
        &self,
        request: GenerateRequest,
    ) -> Result<Option<tokenizers::Encoding>, InferError> {
        let tokenized = self
            .validation
            .tokenize(request.inputs, request.parameters.truncate)
            .await;

        match tokenized {
            // Only the encoding is returned; the accompanying `String` is dropped
            Ok(maybe_encoding) => Ok(maybe_encoding.map(|(encoding, _inputs)| encoding)),
            Err(err) => {
                tracing::error!("Tokenization {err}");
                Err(err.into())
            }
        }
    }
    /// Apply the chat template to the chat request
    #[instrument(skip_all)]
    pub(crate) fn apply_chat_template(&self, messages: Vec<Message>) -> Result<String, InferError> {
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@ -444,6 +444,18 @@ pub struct Token {
    special: bool,
 }
 /// A single token as returned by the `/tokenize` route.
 #[derive(Debug, Serialize, ToSchema)]
 pub struct SimpleToken {
    /// Token id taken from the tokenizer encoding
    #[schema(example = 0)]
    id: u32,
    /// Piece of the request input located between `start` and `stop`
    #[schema(example = "test")]
    text: String,
    /// First element of the offset pair the encoding reports for this token
    // NOTE(review): the offset unit (bytes vs. chars) depends on how the
    // tokenizer was invoked; confirm before documenting it for API users.
    #[schema(example = 0)]
    start: usize,
    /// Second element of the offset pair the encoding reports for this token
    #[schema(example = 2)]
    stop: usize,
 }
 #[derive(Serialize, ToSchema)]
 #[serde(rename_all(serialize = "snake_case"))]
 pub(crate) enum FinishReason {
--- a/router/src/server.rs
+++ b/router/src/server.rs
@ -5,8 +5,8 @@ use crate::validation::ValidationError;
 use crate::{
    BestOfSequence, ChatCompletion, ChatCompletionChunk, ChatRequest, CompatGenerateRequest,
    Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, GenerateResponse,
-    HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, StreamDetails, StreamResponse,
+    HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, SimpleToken, StreamDetails,
-    Token, Validation,
+    StreamResponse, Token, Validation,
 };
 use axum::extract::Extension;
 use axum::http::{HeaderMap, Method, StatusCode};
@ -532,7 +532,7 @@ async fn generate_stream_internal(
    path = "/v1/chat/completions",
    request_body = ChatRequest,
    responses(
-    (status = 200, description = "Generated Text", body = GenerateResponse),
+    (status = 200, description = "Generated Text", body = ChatCompletionChunk),
    (status = 424, description = "Generation Error", body = ErrorResponse,
    example = json ! ({"error": "Request failed during generation"})),
    (status = 429, description = "Model is overloaded", body = ErrorResponse,
@ -672,6 +672,52 @@ async fn chat_completions(
    }
 }
 /// Tokenize inputs
 ///
 /// Responds with one `SimpleToken` per token of the encoded input (id, text
 /// and offsets), or with a 404 error when no encoding can be produced for
 /// this model.
 #[utoipa::path(
    post,
    tag = "Text Generation Inference",
    path = "/tokenize",
    request_body = TokenizeRequest,
    responses(
    (status = 200, description = "Tokenized ids", body = TokenizeResponse),
    (status = 404, description = "No tokenizer found", body = ErrorResponse,
    example = json ! ({"error": "No fast tokenizer available"})),
    )
    )]
 #[instrument(skip_all)]
 async fn tokenize(
    Extension(infer): Extension<Infer>,
    Json(req): Json<GenerateRequest>,
 ) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
    // Keep a copy of the raw input: `req` is consumed by `infer.tokenize`,
    // but the token texts are sliced out of the original string below.
    let input = req.inputs.clone();
    let encoding = infer.tokenize(req).await?;
    if let Some(encoding) = encoding {
        let input_bytes = input.as_bytes();
        let tokens: Vec<SimpleToken> = encoding
            .get_ids()
            .iter()
            .zip(encoding.get_offsets())
            .map(|(&id, &(start, stop))| {
                // The encoding is produced with `Tokenizer::encode`, whose
                // offsets are byte offsets, so slice the UTF-8 bytes rather
                // than counting chars. A token may cover only part of a
                // multi-byte character, hence the lossy decoding; a range
                // that is reversed or out of bounds yields an empty text
                // instead of panicking.
                let text = input_bytes
                    .get(start..stop)
                    .map(|bytes| String::from_utf8_lossy(bytes).into_owned())
                    .unwrap_or_default();
                SimpleToken {
                    id,
                    text,
                    start,
                    stop,
                }
            })
            .collect();
        Ok(Json(tokens).into_response())
    } else {
        // No encoding means the router has no tokenizer for this model
        Err((
            StatusCode::NOT_FOUND,
            Json(ErrorResponse {
                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
                error_type: "no fast tokenizer".to_string(),
            }),
        ))
    }
 }
 /// Prometheus metrics scrape endpoint
 #[utoipa::path(
 get,
@ -719,6 +765,8 @@ pub async fn run(
    compat_generate,
    generate,
    generate_stream,
    chat_completions,
    tokenize,
    metrics,
    ),
    components(
@ -867,6 +915,7 @@ pub async fn run(
        .route("/generate", post(generate))
        .route("/generate_stream", post(generate_stream))
        .route("/v1/chat/completions", post(chat_completions))
        .route("/tokenize", post(tokenize))
        .route("/health", get(health))
        .route("/ping", get(health))
        .route("/metrics", get(metrics));
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@ -70,12 +70,11 @@ impl Validation {
    }
    #[instrument(skip(self, inputs))]
-    async fn validate_input(
+    pub async fn tokenize(
        &self,
        inputs: String,
        truncate: Option<usize>,
-        max_new_tokens: Option<u32>,
+    ) -> Result<Option<(tokenizers::Encoding, String)>, ValidationError> {
    ) -> Result<(String, usize, u32), ValidationError> {
        // If we have a fast tokenizer
        if let Some(sender) = &self.sender {
            // Create response channel
@ -88,7 +87,24 @@ impl Validation {
            // Await on response channel
            // Unwrap is safe here
-            let (inputs, input_length) = response_receiver.await.unwrap()?;
+            let encoding = response_receiver.await.unwrap()?;
            Ok(Some(encoding))
        } else {
            Ok(None)
        }
    }
    #[instrument(skip(self, inputs))]
    async fn validate_input(
        &self,
        inputs: String,
        truncate: Option<usize>,
        max_new_tokens: Option<u32>,
    ) -> Result<(String, usize, u32), ValidationError> {
        // If we have a fast tokenizer
        if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
            // Create response channel
            let input_length = encoding.len();
            // Get total tokens
            let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
@ -343,36 +359,31 @@ fn tokenizer_worker(tokenizer: Tokenizer, mut receiver: mpsc::UnboundedReceiver<
 /// Get input length and optionally truncate it
 fn prepare_input(
-    inputs: String,
+    mut inputs: String,
    truncate: Option<usize>,
    tokenizer: &Tokenizer,
-) -> Result<(String, usize), ValidationError> {
+) -> Result<(tokenizers::Encoding, String), ValidationError> {
    // Get the number of tokens in the input
    let mut encoding = tokenizer
        .encode(inputs.clone(), true)
        .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
    // Optionally truncate
-    let (inputs, input_length) = match truncate {
+    if let Some(truncate) = truncate {
-        // Truncate is some and < encoding length
+        if truncate < encoding.len() {
        Some(truncate) if truncate < encoding.len() => {
            // truncate encoding and decode new inputs
            encoding.truncate(truncate, 0, TruncationDirection::Left);
-            let inputs = tokenizer
+            inputs = tokenizer
                .decode(encoding.get_ids(), false)
                .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
            (inputs, encoding.len())
        }
-        // Nothing to do
+    }
        _ => (inputs, encoding.len()),
    };
-    Ok((inputs, input_length))
+    Ok((encoding, inputs))
 }
 type TokenizerRequest = (
    (String, Option<usize>),
-    oneshot::Sender<Result<(String, usize), ValidationError>>,
+    oneshot::Sender<Result<(tokenizers::Encoding, String), ValidationError>>,
    Span,
 );