feat(backend): add number of generated tokens in the callback
parent 188442f67d
commit 05ff551950
@@ -120,8 +120,8 @@ namespace huggingface::tgi::backends::llamacpp {
             }
 
             // Bubble up the generated token if a callback is provided
-            std::invoke(
-                    std::forward<const llama_decode_callback>(callback_), new_token_id, new_token_logits, is_eos);
+            std::invoke(std::forward<const llama_decode_callback>(callback_),
+                        new_token_id, new_token_logits, is_eos, n_decoded_tokens);
 
             batch = llama_batch_get_one(&new_token_id, 1);
         }
@@ -29,8 +29,8 @@ namespace huggingface::tgi::backends::llamacpp {
    static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); };
    typedef std::unique_ptr<llama_sampler, decltype(llama_sampler_deleter)> llama_sampler_ptr;
 
-    typedef std::function<void(llama_token, float_t, bool)> llama_decode_callback;
-    static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {};
+    typedef std::function<void(llama_token, float_t, bool, size_t)> llama_decode_callback;
+    static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) {};
 
    /**
     *
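As a rough illustration of the new four-argument callback shape, a toy decode loop driving it might look like the sketch below. This is not the backend's actual loop: llama_token, sample_next_token(), and generate() are placeholders, and only the callback signature mirrors the typedef from the hunk above.

// Sketch only: a toy decode loop driving the updated four-argument
// llama_decode_callback. llama_token and sample_next_token() are stand-ins;
// only the callback signature mirrors the typedef introduced by this commit.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <math.h>   // float_t

typedef int32_t llama_token;  // placeholder for llama.h's typedef
typedef std::function<void(llama_token, float_t, bool, size_t)> llama_decode_callback;

static llama_token sample_next_token(size_t step) {
    return static_cast<llama_token>(100 + step);  // dummy sampler
}

static size_t generate(size_t max_new_tokens, const llama_decode_callback &callback) {
    size_t n_decoded_tokens = 0;
    while (n_decoded_tokens < max_new_tokens) {
        const llama_token new_token_id = sample_next_token(n_decoded_tokens);
        const float_t new_token_logits = 0.0f;                       // placeholder logit
        const bool is_eos = (n_decoded_tokens + 1 == max_new_tokens);
        ++n_decoded_tokens;

        // Bubble up the generated token together with the running count,
        // mirroring the extra argument added in this commit.
        callback(new_token_id, new_token_logits, is_eos, n_decoded_tokens);

        if (is_eos) break;
    }
    return n_decoded_tokens;
}

int main() {
    generate(3, [](llama_token id, float_t logit, bool eos, size_t n) {
        std::printf("token=%d logit=%f eos=%d generated=%zu\n",
                    static_cast<int>(id), static_cast<double>(logit), eos, n);
    });
    return 0;
}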
@@ -68,14 +68,14 @@ namespace huggingface::tgi::backends::llamacpp {
            const generation_params_t generation_params,
            const sampling_params_t &sampling_params,
            OpaqueStream *stream,
-            rust::Fn<void(OpaqueStream *, uint32_t, float_t, bool)> callback
+            rust::Fn<void(OpaqueStream *, uint32_t, float_t, bool, size_t)> callback
    ) {
        // Define the visitor lambda function which requires the has_emplace_generate constraint on T
        auto inner_fw = [=, &sampling_params, &stream, &callback]<has_emplace_generate T>(T &&backend)
                -> std::expected<size_t, backend_error_t> {
 
-            auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){
-                callback(stream, new_token_id, logits, is_eos);
+            auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){
+                callback(stream, new_token_id, logits, is_eos, n_generated_tokens);
            };
 
            // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t*
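The FFI bridge change above only widens the context-forwarding lambda. As a self-contained analogy, written against a plain function pointer instead of cxx's rust::Fn and with a hypothetical OpaqueStream, the forwarding pattern looks roughly like this:

// Sketch only: the forwarding pattern from the hunk above, using a plain
// function pointer in place of rust::Fn and a hypothetical OpaqueStream.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <math.h>   // float_t

struct OpaqueStream { uint32_t id; };  // stand-in for the real opaque handle

using stream_callback_t = void (*)(OpaqueStream *, uint32_t, float_t, bool, size_t);

static void rust_side_callback(OpaqueStream *stream, uint32_t token, float_t logit,
                               bool is_eos, size_t n_generated) {
    std::printf("stream=%u token=%u logit=%f eos=%d generated=%zu\n",
                static_cast<unsigned>(stream->id), static_cast<unsigned>(token),
                static_cast<double>(logit), is_eos, n_generated);
}

int main() {
    OpaqueStream stream{7};
    stream_callback_t callback = &rust_side_callback;

    // Re-attach the stream pointer and pass every argument through, including
    // the new n_generated_tokens count, just like context_forwarding_callback.
    auto context_forwarding_callback =
        [&stream, callback](uint32_t new_token_id, float_t logits, bool is_eos,
                            size_t n_generated_tokens) {
            callback(&stream, new_token_id, logits, is_eos, n_generated_tokens);
        };

    context_forwarding_callback(42, 0.25f, false, 1);
    return 0;
}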
@@ -102,6 +102,7 @@ fn llama_generate_callback(
     new_token_id: u32,
     new_token_logit: f32,
     is_eos: bool,
+    n_generated_tokens: usize,
 ) {
     let response = InferStreamResponse::Intermediate {
         token: Token {
@@ -112,7 +113,7 @@ fn llama_generate_callback(
         },
         top_tokens: vec![],
     };
-    debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}");
+    info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos} ({n_generated_tokens})");
 
     unsafe {
         if let Err(ref err) = (*channel).0.send(Ok(response)) {
@@ -70,7 +70,7 @@ mod ffi {
            generation_params: GenerationParams,
            sampling_params: &SamplingParams,
            stream: *mut OpaqueStream,
-            callback: unsafe fn(*mut OpaqueStream, u32, f32, bool),
+            callback: unsafe fn(*mut OpaqueStream, u32, f32, bool, usize),
        ) -> Result<usize>;
    }
 }