45 lines
1.3 KiB
Python
45 lines
1.3 KiB
Python
from dataclasses import dataclass
|
|
from text_generation_server.models.globals import FLASH_DECODING, FLASH_INFER
|
|
import torch
|
|
from typing import Optional
|
|
|
|
|
|
if FLASH_DECODING or FLASH_INFER:
|
|
|
|
@dataclass
class Seqlen:
    """Sequence-length bookkeeping for the FLASH_DECODING / FLASH_INFER path.

    Attributes:
        input_lengths: Tensor of per-sequence lengths, stored exactly as
            passed to the constructor.
        cu_seqlen_q: int32 tensor ``[0, 1, ..., input_lengths.shape[0]]``
            living on the same device as ``input_lengths``.
            (Presumably one query token per sequence -- confirm with callers.)
        cu_seqlen_k: int32 tensor whose first element is 0 and whose
            remaining elements are the running sum of ``input_lengths``
            along its last dimension.
    """

    input_lengths: torch.Tensor
    cu_seqlen_q: Optional[torch.Tensor]
    cu_seqlen_k: Optional[torch.Tensor]

    def __init__(self, input_lengths):
        """Store ``input_lengths`` and derive both cumulative offset tensors.

        This hand-written initializer replaces the one ``@dataclass`` would
        otherwise generate, so only ``input_lengths`` is accepted.
        """
        self.input_lengths = input_lengths
        lengths = self.input_lengths

        # Both offset tensors share the device of the lengths and are int32.
        int32_on_device = {"device": lengths.device, "dtype": torch.int32}
        dims = lengths.shape

        query_offsets = torch.arange(dims[0] + 1, **int32_on_device)

        # Original author's note: cuda graphs don't like this, and it is
        # necessary to clamp within mistral (although FA2 might not want the
        # clamping). The explicit `cu_seqlen_k[0] = 0` store stays disabled;
        # the zero-initialised buffer already supplies the leading 0.
        key_offsets = torch.zeros(dims[-1] + 1, **int32_on_device)
        # Write the running sum straight into the tail of the buffer.
        torch.cumsum(lengths, -1, out=key_offsets[1:])

        self.cu_seqlen_q = query_offsets
        self.cu_seqlen_k = key_offsets

    def clamp(self, max):
        """Return ``self`` unchanged; ``max`` is ignored.

        Flash decoding doesn't need to clamp.
        """
        return self
|
|
|
|
else:
|
|
|
|
@dataclass
class Seqlen:
    """Sequence-length holder used when neither FLASH_DECODING nor FLASH_INFER is set.

    Attributes:
        input_lengths: Tensor of per-sequence lengths.
    """

    input_lengths: torch.Tensor

    def clamp(self, max):
        """Return a new ``Seqlen`` whose lengths are capped at ``max``.

        The current instance is left untouched; the cap is applied with
        ``torch.clamp`` using only an upper bound.
        """
        capped_lengths = torch.clamp(self.input_lengths, max=max)
        return Seqlen(input_lengths=capped_lengths)
|