hf_text-generation-inference/server/text_generation_server/utils/sgmv.py

# Origin:   https://github.com/predibase/lorax
# Path:     lorax/server/lorax_server/utils/sgmv.py
# License:  Apache License Version 2.0, January 2004

import os
import warnings
from functools import lru_cache
from typing import List, Tuple

import torch
import torch.nn.functional as F

# Optional native dependency: the Punica kernels. If the import fails, a
# warning is emitted, `_kernels` is set to None and HAS_SGMV is False.
try:
    import punica_kernels as _kernels

    # Any non-empty DISABLE_SGMV value turns SGMV off even though the
    # kernels imported successfully.
    HAS_SGMV = not bool(os.environ.get("DISABLE_SGMV", ""))
except ImportError:
    warnings.warn("Could not import SGMV kernel from Punica, falling back to loop.")
    _kernels = None
    HAS_SGMV = False


# Base minimum rank used by pad_rank(); it is multiplied by world_size there.
MIN_SGMV_RANK = 8
# Inclusive rank bounds inside which the custom `sgmv_shrink` kernel is used
# for the shrink step; ranks outside this range go through `sgmv_cutlass`.
MIN_RANK_CUSTOM = 16
MAX_RANK_CUSTOM = 128
# pad_rank() rounds ranks above the minimum up to a multiple of this value.
SGMV_BLOCK_SIZE = 16
# Not referenced in this file; presumably the largest rank the BGMV kernels
# handle -- TODO confirm against the callers that import it.
BGMV_MAX_RANK = 64


def has_sgmv() -> bool:
    """Return the module-level ``HAS_SGMV`` flag.

    The flag is True only when ``punica_kernels`` imported successfully and
    the ``DISABLE_SGMV`` environment variable was empty or unset at import time.
    """
    return HAS_SGMV


def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:
    """Zero-pad one dimension of ``t`` up to a rank usable by the SGMV kernels.

    Args:
        t: Tensor whose ``dim`` dimension holds the LoRA rank.
        dim: Dimension to pad. Negative values are accepted and count from
            the last dimension, as with ``Tensor.size``.
        world_size: Tensor-parallel world size; scales the minimum rank.

    Returns:
        ``t`` itself when SGMV is unavailable or no padding is needed,
        otherwise a new tensor with zeros appended at the end of ``dim``.
    """
    if not has_sgmv():
        return t

    # tensor parallelism will result in effective rank being divided by world_size,
    # so we need to scale the min rank to offset that effect
    min_rank = MIN_SGMV_RANK * world_size

    # if we're at or below the min rank, pad up to the min rank
    # otherwise, pad to the nearest multiple of the block size
    current_rank = t.size(dim)  # also validates that `dim` is in range
    if current_rank <= min_rank:
        target_rank = min_rank
    else:
        target_rank = (
            (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE
        )
    if current_rank == target_rank:
        return t

    # Map a negative `dim` onto its non-negative equivalent; without this the
    # pad-slot index computed below would fall outside the pad list.
    ndim = t.dim()
    dim = dim % ndim

    # see complicated pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
    # F.pad lists (left, right) pairs starting from the LAST dimension, so the
    # "right" slot for `dim` sits at index (ndim - dim - 1) * 2 + 1.
    pad = [0] * (2 * ndim)
    pad[(ndim - dim - 1) * 2 + 1] = target_rank - current_rank

    return F.pad(t, tuple(pad), mode="constant", value=0.0)


def use_cutlass_shrink(lora_rank: int) -> bool:
    """Return True when ``lora_rank`` is below ``MIN_RANK_CUSTOM``."""
    below_custom_range = lora_rank < MIN_RANK_CUSTOM
    return below_custom_range


def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:
    """Swap the first two dimensions of ``t`` when ``rank`` is in the custom-kernel range.

    For ranks within ``[MIN_RANK_CUSTOM, MAX_RANK_CUSTOM]`` (inclusive) the
    result is ``t.transpose(0, 1)``; for any other rank ``t`` is returned as is.
    """
    in_custom_range = MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM
    return t.transpose(0, 1) if in_custom_range else t


# Source: https://github.com/punica-ai/punica/blob/master/src/punica/ops/__init__.py
def add_lora_sgmv_cutlass(
    y: torch.Tensor,
    x: torch.Tensor,
    wa_ptr: torch.Tensor,
    wb_ptr: torch.Tensor,
    s_start: torch.Tensor,
    s_end: torch.Tensor,
    layer_idx: int,
    lora_rank: int,
):
    """
    Run the shrink (LoRA A) and expand (LoRA B) kernels back to back on ``y``.

    Semantics:
        y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])

    Args:
        y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
        x: Shape: `[B, H1]`. Input vectors.
        wa_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.
            Weight matrix shape: `[num_layers, R, H1]`.
        wb_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.
            Weight matrix shape: `[num_layers, R, H2]`.
        s_start: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices start indices.
        s_end: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices end indices.
        layer_idx: Layer index of the weight matrices.
        lora_rank: Width of the intermediate `[B, lora_rank]` buffer. Ranks outside
            `[MIN_RANK_CUSTOM, MAX_RANK_CUSTOM]` are routed to the legacy path.
    """
    custom_shrink_supported = MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM
    if not custom_shrink_supported:
        # Custom SGMV shrink only supports rank 16, 32, 64, 128
        _add_lora_sgmv_cutlass_legacy(
            y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank
        )
        return

    device = x.device
    num_segments = wa_ptr.size(0)

    # Scratch space: a fixed 8 MiB buffer for the shrink kernel and a
    # kernel-reported size for the cutlass expand kernel.
    shrink_tmp = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)
    expand_tmp = torch.empty(
        (_kernels.sgmv_cutlass_tmp_size(num_segments),),
        dtype=torch.uint8,
        device=device,
    )

    # Zero-initialised intermediate handed from the shrink step to the expand step.
    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=device)
    _kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, shrink_tmp, layer_idx)
    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, expand_tmp, layer_idx)


def _add_lora_sgmv_cutlass_legacy(
    y: torch.Tensor,
    x: torch.Tensor,
    wa_ptr: torch.Tensor,
    wb_ptr: torch.Tensor,
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
    layer_idx: int,
    lora_rank: int,
):
    """Shrink-then-expand using ``sgmv_cutlass`` for both steps.

    A single scratch buffer, sized by ``sgmv_cutlass_tmp_size`` for the number
    of entries in ``wa_ptr``, is shared by the two kernel calls. ``y`` is the
    output argument of the second call.
    """
    device = x.device
    scratch = torch.empty(
        (_kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0)),),
        dtype=torch.uint8,
        device=device,
    )
    # Zero-initialised `[rows of x, lora_rank]` intermediate between the two calls.
    v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=device)
    _kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, scratch, layer_idx)
    _kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, scratch, layer_idx)


@lru_cache(maxsize=1)
def get_tmp_tensor(device: torch.device) -> torch.Tensor:
    """Return an uninitialised 8 MiB ``uint8`` scratch tensor on ``device``.

    The result is memoised with a cache of size one, so repeated calls with
    the same device hand back the same tensor object.
    """
    scratch_shape = (8 * 1024 * 1024,)
    return torch.empty(scratch_shape, dtype=torch.uint8, device=device)


@lru_cache(maxsize=32)
def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
    """Return a ``uint8`` scratch tensor sized by ``sgmv_cutlass_tmp_size(size)``.

    Results are memoised per ``(size, device)`` pair (up to 32 entries), so
    the same tensor object is reused for repeated arguments.
    """
    scratch_shape = (_kernels.sgmv_cutlass_tmp_size(size),)
    return torch.empty(scratch_shape, dtype=torch.uint8, device=device)


def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
    """Return a fresh, uninitialised ``uint8`` tensor of exactly ``size`` elements.

    Unlike ``get_tmp_tensor_for_size`` this does not consult the native
    kernels and is not cached.
    """
    scratch_shape = (size,)
    return torch.empty(scratch_shape, dtype=torch.uint8, device=device)


def get_tmp_expand_size(size: int) -> int:
    """Return the value of ``_kernels.sgmv_cutlass_tmp_size(size)``."""
    expand_size = _kernels.sgmv_cutlass_tmp_size(size)
    return expand_size


def get_tmp_tensors(
    nsegments: int, lora_rank: int, device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return the ``(shrink, expand)`` scratch tensors for one LoRA pass.

    Args:
        nsegments: Number of adapter segments in the batch.
        lora_rank: LoRA rank; decides whether the shrink step uses cutlass.
        device: Device on which the scratch tensors are allocated.

    Returns:
        A ``(tmp_shrink, tmp_expand)`` pair:

        * SGMV unavailable: the fixed-size shrink buffer and an
          ``nsegments``-element placeholder (no kernel is queried).
        * SGMV available, rank below ``MIN_RANK_CUSTOM``: one cutlass-sized
          buffer returned for both roles.
        * SGMV available, any other rank: the fixed-size shrink buffer and a
          cutlass-sized expand buffer.
    """
    if not has_sgmv():
        # Without kernels the cutlass workspace size cannot be queried.
        tmp_shrink = get_tmp_tensor(device)
        tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)
        return tmp_shrink, tmp_expand

    if use_cutlass_shrink(lora_rank):
        tmp = get_tmp_tensor_for_size(nsegments, device)
        return tmp, tmp

    # The expand step always runs through `sgmv_cutlass`, so its workspace
    # must be sized by `sgmv_cutlass_tmp_size`, not by the raw segment count.
    tmp_shrink = get_tmp_tensor(device)
    tmp_expand = get_tmp_tensor_for_size(nsegments, device)
    return tmp_shrink, tmp_expand


def lora_a_sgmv_cutlass(
    x: torch.Tensor,
    tmp: torch.Tensor,
    wa_ptr: torch.Tensor,
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
    layer_idx: int,
    lora_rank: int,
) -> torch.Tensor:
    """Run the shrink (LoRA A) step and return the intermediate tensor.

    A zero-initialised tensor of shape ``(x.size(0), lora_rank)`` with the
    dtype and device of ``x`` is allocated and passed as the kernel output.
    Ranks within ``[MIN_RANK_CUSTOM, MAX_RANK_CUSTOM]`` use ``sgmv_shrink``;
    all other ranks use ``sgmv_cutlass``. ``tmp`` is the scratch buffer
    handed to whichever kernel runs.
    """
    out_shape = (x.size(0), lora_rank)
    v = torch.zeros(out_shape, dtype=x.dtype, device=x.device)

    custom_rank = MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM
    shrink_kernel = _kernels.sgmv_shrink if custom_rank else _kernels.sgmv_cutlass
    shrink_kernel(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)
    return v


def lora_b_sgmv_cutlass(
    y: torch.Tensor,
    v: torch.Tensor,
    tmp: torch.Tensor,
    wb_ptr: torch.Tensor,
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
    layer_idx: int,
):
    """Run the expand (LoRA B) step through ``sgmv_cutlass``.

    ``y`` is passed as the kernel's output argument, ``v`` as its input and
    ``tmp`` as its scratch buffer. Nothing is returned.
    """
    _kernels.sgmv_cutlass(
        y,
        v,
        wb_ptr,
        s_start,
        s_end,
        tmp,
        layer_idx,
    )


"""
Combined semantics of `add_lora_a_bgmv` followed by `add_lora_b_bgmv`
(both helpers call the kernel with a scale of 1.0):
    y[i] += (
        x[i].unsqueeze(0)
        @ wa_T_all[indicies[i], layer_idx, :, :].transpose(-1, -2)
        @ wb_T_all[indicies[i], layer_idx, :, :].transpose(-1, -2)
        * scale
    ).squeeze(0)

Args:
    y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
    v: Shape: `[B, R]`. Temporary vector.
    x: Shape: `[B, H1]`. Input vectors.
    wa_T_all: Shape: `[None, L, R, H1]`. All of the transposed LoRA A matrices.
    wb_T_all: Shape: `[None, L, H2, R]`. All of the transposed LoRA B matrices.
    indicies: Shape: `[B]`. Indices of the LoRA weights.
    layer_idx: Layer index of LoRA weights.
    scale: Scaling factor.
"""


def add_lora_a_bgmv(
    v: torch.Tensor,
    x: torch.Tensor,
    wa_T_all: torch.Tensor,
    indicies: torch.LongTensor,
    layer_idx: int,
):
    """Dispatch the BGMV kernel for the LoRA A step.

    ``v`` is passed as the kernel's output argument and ``x`` as its input;
    the scale is fixed at 1.0. Nothing is returned.
    """
    unit_scale = 1.0
    _kernels.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, unit_scale)


def add_lora_b_bgmv(
    y: torch.Tensor,
    v: torch.Tensor,
    wb_T_all: torch.Tensor,
    indicies: torch.LongTensor,
    layer_idx: int,
):
    """Dispatch the BGMV kernel for the LoRA B step.

    ``y`` is passed as the kernel's output argument and ``v`` as its input;
    the scale is fixed at 1.0. Nothing is returned.
    """
    unit_scale = 1.0
    _kernels.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, unit_scale)


def segmented_matmul(
    y: torch.Tensor,
    x: torch.Tensor,
    w: List[torch.Tensor],
    b: List[torch.Tensor],
    s_start: torch.IntTensor,
    s_end: torch.IntTensor,
):
    """Apply a separate linear layer to each row segment of ``x``, writing into ``y``.

    For every index ``i`` of ``w``, rows ``s_start[i]:s_end[i]`` of ``y`` are
    overwritten with ``F.linear`` of the same rows of ``x`` using weight
    ``w[i]`` and bias ``b[i]``. Segments whose end is not past their start
    are skipped. ``y`` is modified in place; nothing is returned.
    """
    for seg, weight in enumerate(w):
        lo, hi = s_start[seg], s_end[seg]
        if hi - lo <= 0:
            # Empty segment: leave the corresponding rows of y untouched.
            continue

        y[lo:hi] = F.linear(x[lo:hi], weight, b[seg])
Enable multiple LoRa adapters (#2010) * feat: first draft load multiple lora * feat: load weights within layer and refactor lora pass * fix: refactor and reduce lora math * feat: baseline impl single request multi lora support * feat: prefer lorax implementation and port loading logic * fix: prefer adapter_data and refactors * feat: perfer loraxs custom punica kernels and add mlp loras * fix: adjust batch for bgmv * fix: adjust adapter_segments logic when in batch * fix: refactor and move changes to v3 proto * fix: pass model_id for all flash causal lms * fix: pass model_id for all causal and seq2seq lms * fix: add model_id to model test * feat: add lora support to mistral and refactors * feat: prefer model id in request * fix: include rust code for adapter id * feat: bump launcher and add new lora docs * feat: support base model generation and refactors * fix: rename doc to retry ci build * feat: support if vlm models * fix: add adapter_data param and avoid missing layers * fix: add adapter_data param to phi and neox * fix: update all models forwards to include adapter_data * fix: add model_id to IdeficsCausalLM * Update lora.md Fixed a typo * Update lora.md Fixing spam image * fix: add lora kernel to dockerfile, support running without kernels and refactors * fix: avoid dockerfile conflict * fix: refactors and adjust flash llama lora logic * fix: skip llama test due to CI issue (temp) * fix: skip llama test CI (temp) 2 * fix: revert skips and prefer updated ci token for tests * fix: refactors and helpful comments * fix: add noop in TensorParallelAdapterRowLinear too * fix: refactor and move shard_lora_weights logic * fix: exit early if no adapter_data --------- Co-authored-by: Derek <datavistics@gmail.com> 2024-06-25 12:46:27 -06:00			`# Origin: https://github.com/predibase/lorax`
			`# Path: lorax/server/lorax_server/utils/sgmv.py`
			`# License: Apache License Version 2.0, January 2004`

			`import os`
			`import warnings`
			`from functools import lru_cache`
			`from typing import List, Tuple`

			`import torch`
			`import torch.nn.functional as F`

			`try:`
			`import punica_kernels as _kernels`

			`HAS_SGMV = not bool(os.environ.get("DISABLE_SGMV", ""))`
			`except ImportError:`
			`warnings.warn("Could not import SGMV kernel from Punica, falling back to loop.")`
			`_kernels = None`
			`HAS_SGMV = False`


			`MIN_SGMV_RANK = 8`
			`MIN_RANK_CUSTOM = 16`
			`MAX_RANK_CUSTOM = 128`
			`SGMV_BLOCK_SIZE = 16`
			`BGMV_MAX_RANK = 64`


			`def has_sgmv() -> bool:`
			`return HAS_SGMV`


			`def pad_rank(t: torch.Tensor, dim: int, world_size: int) -> torch.Tensor:`
			`"""Pad a tensor to the minimum rank for SGMV and the nearest multiple of the SGMV block size."""`
			`if not has_sgmv():`
			`return t`

			`# tensor parallelism will result in effective rank being divided by world_size,`
			`# so we need to scale the min rank to offset that effect`
			`min_rank = MIN_SGMV_RANK * world_size`

			`# if we're at or below the min rank, pad up to the min rank`
			`# otherwise, pad to the nearest multiple of the block size`
			`current_rank = t.size(dim)`
			`target_rank = (`
			`min_rank`
			`if current_rank <= min_rank`
			`else (current_rank + SGMV_BLOCK_SIZE - 1) // SGMV_BLOCK_SIZE * SGMV_BLOCK_SIZE`
			`)`
			`if current_rank == target_rank:`
			`return t`

			`pad_size = target_rank - current_rank`

			`# see complicatd pad syntax here: https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html`
			`pad = [0, 0] * t.dim()`
			`pad[(t.dim() - dim - 1) * 2 + 1] = pad_size`
			`pad = tuple(pad)`

			`return F.pad(t, pad, mode="constant", value=0.0)`


			`def use_cutlass_shrink(lora_rank: int) -> bool:`
			`return lora_rank < MIN_RANK_CUSTOM`


			`def orient_for_rank(t: torch.Tensor, rank: int) -> torch.Tensor:`
			`if MIN_RANK_CUSTOM <= rank <= MAX_RANK_CUSTOM:`
			`return t.transpose(0, 1)`
			`return t`


			`# Source: https://github.com/punica-ai/punica/blob/master/src/punica/ops/__init__.py`
			`def add_lora_sgmv_cutlass(`
			`y: torch.Tensor,`
			`x: torch.Tensor,`
			`wa_ptr: torch.Tensor,`
			`wb_ptr: torch.Tensor,`
			`s_start: torch.Tensor,`
			`s_end: torch.Tensor,`
			`layer_idx: int,`
			`lora_rank: int,`
			`):`
			`"""`
			`Semantics:`
			`y[s[i]:s[i+1]] += x[s[i]:s[i+1]] @ deref(wa_ptr[i]).T @ deref(wb_ptr[i])`

			`Args:`
			y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
			x: Shape: `[B, H1]`. Input vectors.
			wa_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\
			Weight matrix shape: `[num_layers, R, H1]`.
			wb_ptr: Shape: `[S]`. DType: torch.int64. Pointer to the weight matrices.\
			Weight matrix shape: `[num_layers, R, H2]`.
			s_start: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices start indices.
			s_end: Shape: `[S]`, DType: torch.int32. Indptr of the weight matrices end indices.
			`layer_idx: Layer index of the weight matrices.`
			`"""`
			`if lora_rank < MIN_RANK_CUSTOM or lora_rank > MAX_RANK_CUSTOM:`
			`# Custom SGMV shrink only supports rank 16, 32, 64, 128`
			`_add_lora_sgmv_cutlass_legacy(`
			`y, x, wa_ptr, wb_ptr, s_start, s_end, layer_idx, lora_rank`
			`)`
			`return`

			`tmp1 = torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=x.device)`
			`tmp2_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0))`
			`tmp2 = torch.empty((tmp2_size,), dtype=torch.uint8, device=x.device)`
			`v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)`
			`_kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp1, layer_idx)`
			`_kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp2, layer_idx)`


			`def _add_lora_sgmv_cutlass_legacy(`
			`y: torch.Tensor,`
			`x: torch.Tensor,`
			`wa_ptr: torch.Tensor,`
			`wb_ptr: torch.Tensor,`
			`s_start: torch.IntTensor,`
			`s_end: torch.IntTensor,`
			`layer_idx: int,`
			`lora_rank: int,`
			`):`
			`tmp_size = _kernels.sgmv_cutlass_tmp_size(wa_ptr.size(0))`
			`tmp = torch.empty((tmp_size,), dtype=torch.uint8, device=x.device)`
			`v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)`
			`_kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)`
			`_kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)`


			`@lru_cache(maxsize=1)`
			`def get_tmp_tensor(device: torch.device) -> torch.Tensor:`
			`return torch.empty((8 * 1024 * 1024,), dtype=torch.uint8, device=device)`


			`@lru_cache(maxsize=32)`
			`def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:`
			`tmp_size = _kernels.sgmv_cutlass_tmp_size(size)`
			`return torch.empty((tmp_size,), dtype=torch.uint8, device=device)`


			`def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:`
			`return torch.empty((size,), dtype=torch.uint8, device=device)`


			`def get_tmp_expand_size(size: int) -> int:`
			`return _kernels.sgmv_cutlass_tmp_size(size)`


			`def get_tmp_tensors(`
			`nsegments: int, lora_rank: int, device: torch.device`
			`) -> Tuple[torch.Tensor, torch.Tensor]:`
			`if use_cutlass_shrink(lora_rank) and has_sgmv():`
			`tmp = get_tmp_tensor_for_size(nsegments, device)`
			`return tmp, tmp`
			`else:`
			`tmp_shrink = get_tmp_tensor(device)`
			`tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)`
			`return tmp_shrink, tmp_expand`


			`def lora_a_sgmv_cutlass(`
			`x: torch.Tensor,`
			`tmp: torch.Tensor,`
			`wa_ptr: torch.Tensor,`
			`s_start: torch.IntTensor,`
			`s_end: torch.IntTensor,`
			`layer_idx: int,`
			`lora_rank: int,`
			`) -> torch.Tensor:`
			`v = torch.zeros((x.size(0), lora_rank), dtype=x.dtype, device=x.device)`
			`if MIN_RANK_CUSTOM <= lora_rank <= MAX_RANK_CUSTOM:`
			`_kernels.sgmv_shrink(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)`
			`else:`
			`_kernels.sgmv_cutlass(v, x, wa_ptr, s_start, s_end, tmp, layer_idx)`
			`return v`


			`def lora_b_sgmv_cutlass(`
			`y: torch.Tensor,`
			`v: torch.Tensor,`
			`tmp: torch.Tensor,`
			`wb_ptr: torch.Tensor,`
			`s_start: torch.IntTensor,`
			`s_end: torch.IntTensor,`
			`layer_idx: int,`
			`):`
			`_kernels.sgmv_cutlass(y, v, wb_ptr, s_start, s_end, tmp, layer_idx)`


			`"""`
			`Semantics:`
			`y[i] += (`
			`x[i].unsqueeze(0)`
			`@ wa_T_all[indices[i], layer_idx, :, :].transpose(-1, -2)`
			`@ wb_T_all[indices[i], layer_idx, :, :].transpose(-1, -2)`
			`* scale`
			`).squeeze(0)`

			`Args:`
			y: Shape: `[B, H2]`. Output vectors. Will be changed in-place.
			v: Shape: `[B, R]`. Temporary vector.
			x: Shape: `[B, H1]`. Input vectors.
			wa_T_all: Shape: `[None, L, R, H1]`. All of the transposed LoRA A matrices.
			wb_T_all: Shape: `[None, L, H2, R]`. All of the transposed LoRA B matrices.
			indicies: Shape: `[B]`. Indices of the LoRA weights.
			`layer_idx: Layer index of LoRA weights.`
			`scale: Scaling factor.`
			`"""`


			`def add_lora_a_bgmv(`
			`v: torch.Tensor,`
			`x: torch.Tensor,`
			`wa_T_all: torch.Tensor,`
			`indicies: torch.LongTensor,`
			`layer_idx: int,`
			`):`
			`_kernels.dispatch_bgmv(v, x, wa_T_all, indicies, layer_idx, 1.0)`


			`def add_lora_b_bgmv(`
			`y: torch.Tensor,`
			`v: torch.Tensor,`
			`wb_T_all: torch.Tensor,`
			`indicies: torch.LongTensor,`
			`layer_idx: int,`
			`):`
			`_kernels.dispatch_bgmv(y, v, wb_T_all, indicies, layer_idx, 1.0)`


			`def segmented_matmul(`
			`y: torch.Tensor,`
			`x: torch.Tensor,`
			`w: List[torch.Tensor],`
			`b: List[torch.Tensor],`
			`s_start: torch.IntTensor,`
			`s_end: torch.IntTensor,`
			`):`
			`for i in range(len(w)):`
			`if s_end[i] - s_start[i] <= 0:`
			`continue`

			`xi = x[s_start[i] : s_end[i]]`
			`wi = w[i]`
			`bi = b[i]`
			`y[s_start[i] : s_end[i]] = F.linear(xi, wi, bi)`