flashinfer: reminder to remove contiguous call in the future (#2685)
parent 41c2623735
commit 1b914f37e7
@@ -55,6 +55,7 @@ def paged_attention(
         from text_generation_server.layers.attention.flashinfer import decode_state

         return decode_state.get().forward(
+            # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged.
             query.contiguous(),
             paged_kv_cache=(kv_cache.key, kv_cache.value),
             logits_soft_cap=softcap,
@@ -220,6 +221,7 @@ def attention(
             softcap = 0.0

         return prefill_with_paged_kv_state.get().forward(
+            # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged.
             query.contiguous(),
             causal=causal,
             paged_kv_cache=(kv_cache.key, kv_cache.value),
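For context, `query.contiguous()` forces a copy whenever the query tensor is a non-contiguous view (for example, one produced by a transpose or slice), which the flashinfer kernels required before flashinfer-ai/flashinfer#553. A minimal PyTorch sketch of that behavior follows; the tensor names and shapes are illustrative assumptions, not taken from this repository:

    import torch

    # Illustrative only: a query built by transposing a
    # [num_heads, num_tokens, head_dim] tensor is a non-contiguous view.
    query = torch.randn(8, 16, 64).transpose(0, 1)
    print(query.is_contiguous())         # False

    # .contiguous() materializes a contiguous copy; flashinfer needed this
    # layout before upstream PR #553, hence the TODO comments in the diff above.
    query_contig = query.contiguous()
    print(query_contig.is_contiguous())  # True

Once the upstream change lands, the `.contiguous()` calls (and the extra copies they imply) can be dropped from `paged_attention` and `attention`, which is what the TODO comments are meant to flag.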