parent 422bf1f986, commit b5f1c9de06
@@ -72,4 +72,4 @@ Once Prometheus data source is configured, we can finally create our dashboard!
 
 Community contributed dashboard templates are also available, for example [here](https://grafana.com/grafana/dashboards/19831-text-generation-inference-dashboard/) or [here](https://grafana.com/grafana/dashboards/20246-text-generation-inference/).
 
 Load your dashboard configuration, and your TGI dashboard should be ready to go!
@@ -391,6 +391,28 @@ class BaseFlashMistral(FlashCausalLM):
     def batch_type(self) -> Type[FlashMistralBatch]:
         return FlashMistralBatch
 
+    def tunableop_warmup(self, seqlen: int):
+        input_ids = torch.zeros(seqlen, dtype=torch.int64, device=self.device)
+        position_ids = torch.zeros(seqlen, dtype=torch.int32, device=self.device)
+        slots = torch.arange(seqlen, dtype=torch.int64, device=self.device)
+        kv_cache = get_cache_manager().kv_cache
+
+        # We pass a `cu_seqlen_prefill` in order not to have to deal with paged attention cache allocation/deallocation.
+        self.model.forward(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlen_prefill=torch.tensor(
+                [0, seqlen], device=self.device, dtype=torch.int32
+            ),
+            kv_cache=get_cache_manager().kv_cache,
+            block_tables=None,
+            input_lengths=None,
+            slots=slots,
+            max_s=seqlen,
+            lm_head_indices=None,
+            prefill_cache_indices=None,
+        )
+
     def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
         input_ids = torch.zeros(bs, dtype=torch.int64, device=self.device)
         position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device)
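For context, TunableOp is PyTorch's runtime GEMM auto-tuning (most useful on ROCm): the first time a matrix multiplication of a given shape runs, several backend algorithms are benchmarked and the fastest one is cached for subsequent calls. Running one fake prefill per expected sequence length during server warmup therefore moves the tuning cost out of the request path. Below is a minimal sketch of how a warmup loop could drive the new method; the warmup_tunableop helper, the list of sequence lengths, and the torch.cuda.tunable calls (exposed by recent PyTorch builds) are illustrative assumptions, not part of this commit.

import os

import torch


def warmup_tunableop(model, seqlens=(1, 2, 4, 8, 16, 32)):
    # Only worth doing when TunableOp has been requested through the usual env var.
    if os.environ.get("PYTORCH_TUNABLEOP_ENABLED", "0") != "1":
        return

    # Make sure tuning is active while the fake prefills run.
    torch.cuda.tunable.enable(True)
    torch.cuda.tunable.tuning_enable(True)

    for seqlen in seqlens:
        # One dummy prefill per shape so the best GEMM algorithm is picked
        # now instead of on the first real request of that size.
        model.tunableop_warmup(seqlen)
        torch.cuda.synchronize()

    # Persist the selected algorithms so later runs can skip re-tuning.
    torch.cuda.tunable.write_file()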
@@ -522,6 +522,30 @@ class Mamba(Model):
         }
         self.cuda_graphs[batch_size] = graph_dict
 
+    def tunableop_warmup(self, seqlen: int):
+        input_ids = torch.zeros((seqlen, 1), dtype=torch.int64, device=self.device)
+        n_blocks = len(self.model.blocks)
+
+        d_state = self.model.config.d_state
+        d_conv = self.model.config.d_conv
+        # d_inner already includes the expand multiplier
+        d_inner = self.model.config.d_inner
+
+        # A non-zero seqlen_offset is important so that the forward pass goes through the state update mechanism
+        seqlen_offset = 1
+        inference_params = new_inference_params(
+            n_blocks=n_blocks,
+            batch_size=seqlen,
+            d_state=d_state,
+            d_conv=d_conv,
+            d_inner=d_inner,
+            seqlen_offset=seqlen_offset,
+            device=self.device,
+            dtype=self.dtype,
+        )
+
+        self.model.forward(input_ids=input_ids, inference_params=inference_params)
+
     def forward(
         self, input_ids: torch.Tensor, inference_params: Any
     ) -> Tuple[torch.Tensor, torch.Tensor]:
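Because seqlen_offset is non-zero, the Mamba blocks take the single-token decode path and read and write the recurrent state carried by inference_params; note that the warmup reuses seqlen as the batch size, since in decode mode each sequence contributes a single token and the GEMM shapes depend on the batch dimension rather than on sequence length. The sketch below shows roughly what new_inference_params() has to allocate for that path; the class and field names and the per-block (conv_state, ssm_state) layout are assumptions drawn from the standard Mamba decoding scheme, not from this diff.

from dataclasses import dataclass, field
from typing import Dict, Tuple

import torch


@dataclass
class InferenceParamsSketch:
    # Offset into the sequence: > 0 selects the recurrent (decode) code path.
    seqlen_offset: int
    # Per-block recurrent state: block index -> (conv_state, ssm_state).
    key_value_memory_dict: Dict[int, Tuple[torch.Tensor, torch.Tensor]] = field(
        default_factory=dict
    )


def new_inference_params_sketch(
    n_blocks, batch_size, d_state, d_conv, d_inner, seqlen_offset, device, dtype
):
    params = InferenceParamsSketch(seqlen_offset=seqlen_offset)
    for i in range(n_blocks):
        # Rolling buffer holding the last d_conv inputs of the depthwise convolution.
        conv_state = torch.zeros(batch_size, d_inner, d_conv, device=device, dtype=dtype)
        # Hidden state of the selective state-space model for each channel.
        ssm_state = torch.zeros(batch_size, d_inner, d_state, device=device, dtype=dtype)
        params.key_value_memory_dict[i] = (conv_state, ssm_state)
    return params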