hf_text-generation-inference/server/text_generation_server/layers/moe/unquantized.py

from typing import Optional

import torch
import torch.nn as nn

from text_generation_server.utils.import_utils import SYSTEM
from text_generation_server.utils.weights import UnquantizedWeight, Weights

if SYSTEM == "ipex":
    from intel_extension_for_pytorch.llm.modules import GatedMLPMOE
else:
    from moe_kernels.fused_moe import fused_moe


class UnquantizedSparseMoELayer(nn.Module):
    """Sparse mixture-of-experts layer backed by unquantized expert weights.

    At construction time the gate/up and down projections of every expert are
    loaded and stacked into two tensors with a leading expert dimension. The
    forward pass hands those tensors, together with the router output, to a
    fused MoE implementation selected by ``SYSTEM``.
    """

    def __init__(
        self,
        *,
        n_expert_group: Optional[int],
        n_experts: int,
        prefix: str,
        renormalize: bool,
        topk: int,
        topk_group: Optional[int],
        weights: Weights,
        gate_proj_name: str = "gate_proj",
        up_proj_name: str = "up_proj",
        down_proj_name: str = "down_proj",
    ):
        """Load and stack the expert weights and store the routing settings.

        Args:
            n_expert_group: Number of expert groups for grouped top-k, or
                ``None`` to disable grouping. Must be ``None`` exactly when
                ``topk_group`` is ``None``.
            n_experts: Number of experts to load; expert ``i`` is read from
                ``f"{prefix}.{i}.<proj_name>"``.
            prefix: Weight-name prefix shared by all experts.
            renormalize: Forwarded unchanged to the fused MoE call.
            topk: Top-k value forwarded to the fused MoE call.
            topk_group: Group-level top-k for grouped routing, or ``None``.
            weights: Weight loader used to fetch the expert tensors.
            gate_proj_name: Name of the gate projection within an expert.
            up_proj_name: Name of the up projection within an expert.
            down_proj_name: Name of the down projection within an expert.
        """
        super().__init__()

        # The two grouping settings only make sense as a pair.
        has_expert_groups = n_expert_group is not None
        has_topk_group = topk_group is not None
        assert (
            has_expert_groups == has_topk_group
        ), "n_expert_group and topk_group must both be None or have some value"

        # Routing configuration, consumed by forward().
        self.topk = topk
        self.renormalize = renormalize
        self.n_expert_group = n_expert_group
        self.topk_group = topk_group

        # Gate and up projections are fetched together per expert and stacked
        # over experts.
        self.gate_up_proj = _load_expert_multi_weights_col(
            weights=weights,
            prefix=prefix,
            n_experts=n_experts,
            gate_proj_name=gate_proj_name,
            up_proj_name=up_proj_name,
        )

        # Down projection, stacked over experts in the same way.
        self.down_proj = _load_expert_weights_row(
            weights=weights,
            prefix=prefix,
            n_experts=n_experts,
            name=down_proj_name,
        )

        # On IPEX the stacked tensors are wrapped once in the vendor module,
        # which forward() then calls instead of `fused_moe`.
        if SYSTEM == "ipex":
            self.ipex_fused_moe = GatedMLPMOE(
                W13=self.gate_up_proj, W2=self.down_proj, use_prepack=True
            )

    def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:
        """Run the fused MoE computation for the current ``SYSTEM``.

        Args:
            x: Input hidden states.
            gating_output: Router output passed to the fused implementation.

        Returns:
            Whatever the selected fused MoE implementation returns.
        """
        # Grouped top-k is enabled exactly when an expert-group count was given.
        grouped_topk = self.n_expert_group is not None

        if SYSTEM == "ipex":
            return self.ipex_fused_moe(
                hidden_states=x,
                router_logits=gating_output,
                top_k=self.topk,
                renormalize=self.renormalize,
                use_grouped_topk=grouped_topk,
                num_expert_group=self.n_expert_group,
                topk_group=self.topk_group,
            )

        if SYSTEM == "rocm":
            # NOTE: this path does not forward the grouped top-k settings
            # (n_expert_group / topk_group); arguments are passed positionally
            # exactly as the kernel is invoked on ROCm.
            return fused_moe(
                x,
                self.gate_up_proj,
                self.down_proj,
                gating_output,
                self.topk,
                renormalize=self.renormalize,
                inplace=True,
            )

        # Default path: full argument set including grouped top-k options.
        return fused_moe(
            x,
            w1=self.gate_up_proj,
            w2=self.down_proj,
            gating_output=gating_output,
            topk=self.topk,
            renormalize=self.renormalize,
            inplace=True,
            use_grouped_topk=grouped_topk,
            num_expert_group=self.n_expert_group,
            topk_group=self.topk_group,
        )


def _load_expert_multi_weights_col(
    *,
    prefix: str,
    n_experts: int,
    gate_proj_name: str,
    up_proj_name: str,
    weights: Weights,
) -> torch.Tensor:
    """Load the gate/up projections of every expert into one stacked tensor.

    For each expert ``i`` the gate and up projections are fetched in a single
    ``get_multi_weights_col`` call (second argument ``0``) and copied into
    slot ``i`` of the result.

    Args:
        prefix: Weight-name prefix; expert ``i`` lives under ``f"{prefix}.{i}"``.
        n_experts: Number of experts to load.
        gate_proj_name: Name of the gate projection within an expert.
        up_proj_name: Name of the up projection within an expert.
        weights: Weight loader providing ``get_multi_weights_col``.

    Returns:
        Tensor of shape ``(n_experts, *per_expert_shape)`` with the dtype and
        device of the first expert's weight.

    Raises:
        AssertionError: If a loaded weight is not an ``UnquantizedWeight`` or
            if no expert was loaded (``n_experts`` is 0).
    """
    stacked = None
    for expert_idx in range(n_experts):
        expert_prefix = f"{prefix}.{expert_idx}"
        loaded = weights.get_multi_weights_col(
            [f"{expert_prefix}.{gate_proj_name}", f"{expert_prefix}.{up_proj_name}"],
            0,
        )

        assert isinstance(loaded, UnquantizedWeight)
        expert_tensor = loaded.weight

        # The output buffer is allocated lazily: its per-expert shape, dtype
        # and device are only known once the first expert has been loaded.
        if stacked is None:
            stacked = torch.empty(
                (n_experts, *expert_tensor.shape),
                dtype=expert_tensor.dtype,
                device=expert_tensor.device,
            )

        stacked[expert_idx] = expert_tensor

    assert stacked is not None

    return stacked


def _load_expert_weights_row(
    *,
    prefix: str,
    n_experts: int,
    name: str,
    weights: Weights,
) -> torch.Tensor:
    """Load one named projection of every expert into one stacked tensor.

    For each expert ``i`` the weight ``f"{prefix}.{i}.{name}"`` is fetched via
    ``get_weights_row`` and copied into slot ``i`` of the result.

    Args:
        prefix: Weight-name prefix; expert ``i`` lives under ``f"{prefix}.{i}"``.
        n_experts: Number of experts to load.
        name: Name of the projection within an expert.
        weights: Weight loader providing ``get_weights_row``.

    Returns:
        Tensor of shape ``(n_experts, *per_expert_shape)`` with the dtype and
        device of the first expert's weight.

    Raises:
        AssertionError: If a loaded weight is not an ``UnquantizedWeight`` or
            if no expert was loaded (``n_experts`` is 0).
    """
    stacked = None
    for expert_idx in range(n_experts):
        loaded = weights.get_weights_row(f"{prefix}.{expert_idx}.{name}")

        assert isinstance(loaded, UnquantizedWeight)
        expert_tensor = loaded.weight

        # The output buffer is allocated lazily: its per-expert shape, dtype
        # and device are only known once the first expert has been loaded.
        if stacked is None:
            stacked = torch.empty(
                (n_experts, *expert_tensor.shape),
                dtype=expert_tensor.dtype,
                device=expert_tensor.device,
            )

        stacked[expert_idx] = expert_tensor

    assert stacked is not None

    return stacked
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 10:08:58 -06:00			`from typing import Optional`

			`import torch`
			`import torch.nn as nn`

			`from text_generation_server.utils.import_utils import SYSTEM`
			`from text_generation_server.utils.weights import UnquantizedWeight, Weights`

Update vllm kernels for ROCM (#2826) * (vllm) updated vllm rocm kernels * revert silu * update partition size * remove grouped_topk * (nit) remove log * update moe-kernels commit 2024-12-18 04:44:42 -07:00			`if SYSTEM == "ipex":`
add ipex moe implementation to support Mixtral and PhiMoe (#2707) * add ipex moe implementation to support Mixtral and PhiMoe Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * update to ipex xpu 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * torch has xpu support in 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix oneapi basekit version Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * Apply suggestions from code review Co-authored-by: Daniël de Kok <me@github.danieldk.eu> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Daniël de Kok <me@github.danieldk.eu> 2024-11-18 09:16:55 -07:00			`from intel_extension_for_pytorch.llm.modules import GatedMLPMOE`
Simplify two ipex conditions (#2755) 2024-11-19 00:04:23 -07:00			`else:`
			`from moe_kernels.fused_moe import fused_moe`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 10:08:58 -06:00

			`class UnquantizedSparseMoELayer(nn.Module):`
			`def __init__(`
			`self,`
			`*,`
			`n_expert_group: Optional[int],`
			`n_experts: int,`
			`prefix: str,`
			`renormalize: bool,`
			`topk: int,`
			`topk_group: Optional[int],`
			`weights: Weights,`
			`gate_proj_name: str = "gate_proj",`
			`up_proj_name: str = "up_proj",`
			`down_proj_name: str = "down_proj",`
			`):`
			`super().__init__()`

			`assert (n_expert_group is None) == (`
			`topk_group is None`
			`), "n_expert_group and topk_group must both be None or have some value"`

			`self.n_expert_group = n_expert_group`
			`self.topk = topk`
			`self.topk_group = topk_group`
			`self.renormalize = renormalize`

			`self.gate_up_proj = _load_expert_multi_weights_col(`
			`prefix=prefix,`
			`n_experts=n_experts,`
			`gate_proj_name=gate_proj_name,`
			`up_proj_name=up_proj_name,`
			`weights=weights,`
			`)`

			`self.down_proj = _load_expert_weights_row(`
			`prefix=prefix,`
			`n_experts=n_experts,`
			`name=down_proj_name,`
			`weights=weights,`
			`)`
add ipex moe implementation to support Mixtral and PhiMoe (#2707) * add ipex moe implementation to support Mixtral and PhiMoe Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * update to ipex xpu 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * torch has xpu support in 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix oneapi basekit version Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * Apply suggestions from code review Co-authored-by: Daniël de Kok <me@github.danieldk.eu> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Daniël de Kok <me@github.danieldk.eu> 2024-11-18 09:16:55 -07:00			`if SYSTEM == "ipex":`
			`self.ipex_fused_moe = GatedMLPMOE(`
			`W13=self.gate_up_proj, W2=self.down_proj, use_prepack=True`
			`)`
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 10:08:58 -06:00
			`def forward(self, x: torch.Tensor, *, gating_output: torch.Tensor) -> torch.Tensor:`
Update ROCM libs and improvements (#2579) * style * update torch * ix issues * fix clone * revert mkl * added custom PA * style * fix style * style * hide env vart * fix mixtral model * add skinny kernel and merge fixes * fixed style * fix issue for sliding window models * addressed review comments * fix import * improved error messag * updated default value * remove import * fix imports after rebase * float16 dep * improve dockerfile * cleaned dockerfile 2024-09-30 02:54:32 -06:00			`if SYSTEM == "rocm":`
			`return fused_moe(`
			`x,`
			`self.gate_up_proj,`
			`self.down_proj,`
			`gating_output,`
			`self.topk,`
			`renormalize=self.renormalize,`
			`inplace=True,`
			`)`
add ipex moe implementation to support Mixtral and PhiMoe (#2707) * add ipex moe implementation to support Mixtral and PhiMoe Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * update to ipex xpu 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * torch has xpu support in 2.5 Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * fix oneapi basekit version Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> * Apply suggestions from code review Co-authored-by: Daniël de Kok <me@github.danieldk.eu> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Daniël de Kok <me@github.danieldk.eu> 2024-11-18 09:16:55 -07:00			`elif SYSTEM == "ipex":`
			`return self.ipex_fused_moe(`
			`hidden_states=x,`
			`router_logits=gating_output,`
			`top_k=self.topk,`
			`renormalize=self.renormalize,`
			`use_grouped_topk=self.n_expert_group is not None,`
			`num_expert_group=self.n_expert_group,`
			`topk_group=self.topk_group,`
			`)`
Update ROCM libs and improvements (#2579) * style * update torch * ix issues * fix clone * revert mkl * added custom PA * style * fix style * style * hide env vart * fix mixtral model * add skinny kernel and merge fixes * fixed style * fix issue for sliding window models * addressed review comments * fix import * improved error messag * updated default value * remove import * fix imports after rebase * float16 dep * improve dockerfile * cleaned dockerfile 2024-09-30 02:54:32 -06:00
Move to moe-kernels package and switch to common MoE layer (#2511) * Move to moe-kernels package and switch to common MoE layer This change introduces the new `moe-kernels` package: - Add `moe-kernels` as a dependency. - Introduce a `SparseMoELayer` module that can be used by MoE models. - Port over Mixtral and Deepseek. * Make `cargo check` pass * Update runner 2024-09-17 10:08:58 -06:00			`return fused_moe(`
			`x,`
			`w1=self.gate_up_proj,`
			`w2=self.down_proj,`
			`gating_output=gating_output,`
			`topk=self.topk,`
			`renormalize=self.renormalize,`
			`inplace=True,`
			`use_grouped_topk=self.n_expert_group is not None,`
			`num_expert_group=self.n_expert_group,`
			`topk_group=self.topk_group,`
			`)`


			`def _load_expert_multi_weights_col(`
			`*,`
			`prefix: str,`
			`n_experts: int,`
			`gate_proj_name: str,`
			`up_proj_name: str,`
			`weights: Weights,`
			`) -> torch.Tensor:`
			`all_weight = None`
			`for i in range(n_experts):`
			`weight = weights.get_multi_weights_col(`
			`[f"{prefix}.{i}.{gate_proj_name}", f"{prefix}.{i}.{up_proj_name}"], 0`
			`)`

			`assert isinstance(weight, UnquantizedWeight)`

			`if all_weight is None:`
			`all_weight = torch.empty(`
			`(n_experts,) + weight.weight.shape,`
			`dtype=weight.weight.dtype,`
			`device=weight.weight.device,`
			`)`

			`all_weight[i] = weight.weight`

			`assert all_weight is not None`

			`return all_weight`


			`def _load_expert_weights_row(`
			`*,`
			`prefix: str,`
			`n_experts: int,`
			`name: str,`
			`weights: Weights,`
			`) -> torch.Tensor:`
			`all_weight = None`
			`for i in range(n_experts):`
			`weight = weights.get_weights_row(`
			`f"{prefix}.{i}.{name}",`
			`)`

			`assert isinstance(weight, UnquantizedWeight)`

			`if all_weight is None:`
			`all_weight = torch.empty(`
			`(n_experts,) + weight.weight.shape,`
			`dtype=weight.weight.dtype,`
			`device=weight.weight.device,`
			`)`

			`all_weight[i] = weight.weight`

			`assert all_weight is not None`

			`return all_weight`