diff --git a/server/text_generation_server/layers/lora.py b/server/text_generation_server/layers/lora.py
index 7adfbb29..bb672feb 100644
--- a/server/text_generation_server/layers/lora.py
+++ b/server/text_generation_server/layers/lora.py
@@ -1,12 +1,13 @@
 import math
 import os
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING, Optional, Tuple, List
 
 import torch
 import torch.distributed
 from accelerate import init_empty_weights
 from torch import nn
 from torch.nn import functional as F
+from torch.distributed import ProcessGroup
 
 from text_generation_server.utils.sgmv import (
     add_lora_a_bgmv,
@@ -26,7 +27,9 @@ if TYPE_CHECKING:
 
 
 class LoraLinear(nn.Module):
-    def __init__(self, base_layer, layer_id, process_group):
+    def __init__(
+        self, base_layer: nn.Module, layer_id: int, process_group: ProcessGroup
+    ):
         super().__init__()
         self.base_layer = base_layer
         self.layer_id = layer_id
@@ -49,6 +52,18 @@ class LoraLinear(nn.Module):
         )
 
         if has_sgmv() and data is not None and data.can_vectorize(self.process_group):
+            # In tensor-parallel configurations, each GPU processes a specific segment of the output.
+            # The 'result' tensor represents the full output, which can vary in size based on
+            # the layer type (e.g., attention vs. feed-forward layers). We define the current
+            # segment using start_idx and end_idx. If the segment size doesn't match this GPU's
+            # slice of 'result', we create a zero tensor of the correct size for LoRA computation.
+            # This approach ensures accurate LoRA application across various layer sizes and
+            # configurations, adapting to different model architectures and parallelization strategies.
+            #
+            # Example scenarios where this is necessary:
+            # 1. The adapter's size doesn't evenly divide across GPUs.
+            # 2. We're processing the last segment which might be smaller.
+            # 3. Different projection layers (q, k, v) have different sizes.
             if end_idx - start_idx != result.shape[1]:
                 proj = torch.zeros_like(result[:, start_idx:end_idx])
             else:
@@ -149,13 +164,27 @@ class LoraLinear(nn.Module):
 
 
 class TensorParallelMultiAdapterLinear(LoraLinear):
-    def __init__(self, base_layer, layer_id, layer_names, sizes, process_group):
+    def __init__(
+        self,
+        base_layer: nn.Module,
+        layer_id: int,
+        layer_names: List[str],
+        sizes: List[int],
+        process_group: ProcessGroup,
+    ):
         super().__init__(base_layer, layer_id, process_group)
         self.layer_names = layer_names
         self.sizes = sizes
 
     @classmethod
-    def load(cls, base_layer, layer_id, layer_names, sizes, process_group):
+    def load(
+        cls,
+        base_layer: nn.Module,
+        layer_id: int,
+        layer_names: List[str],
+        sizes: List[int],
+        process_group: ProcessGroup,
+    ):
         return TensorParallelMultiAdapterLinear(
             base_layer, layer_id, layer_names, sizes, process_group
         )
@@ -178,7 +207,12 @@ class TensorParallelMultiAdapterLinear(LoraLinear):
         offset = 0
         for i, layer_name in enumerate(self.layer_names):
             start_idx = offset // self.process_group.size()
-
+            # The 'sizes' parameter is essential in tensor-parallel setups for handling multiple
+            # projection layers (q_proj, k_proj, v_proj) by defining their output dimensions. It
+            # ensures correct slicing of the result tensor, accommodating variations like grouped-query
+            # attention where k_proj and v_proj differ from q_proj. This allows precise application of
+            # LoRA adapters to each sub-component of the multi-head attention mechanism, managing the
+            # different projection sizes across layers and model architectures.
             if self.sizes is not None:
                 offset += self.sizes[i]
                 end_idx = offset // self.process_group.size()
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index 8da44273..7a579478 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -292,31 +292,3 @@ class Model(ABC):
         ]
 
         return weights_a, weights_b
-
-    def offload_adapter(
-        self,
-        adapter_parameters: AdapterParameters,
-        adapter_source: AdapterSource,
-        adapter_index: int,
-    ):
-        """Offloads the adapter weights from GPU to CPU or disk."""
-        if adapter_index not in self.loaded_adapters:
-            # Adapter already offloaded
-            return
-
-        if not self.supports_adapter_loading:
-            raise ValueError("This model does not support adapter loading.")
-
-        if not self.dynamic_adapter_loading_enabled:
-            raise ValueError(
-                f"This model was initialized with the adapter {self.static_adapter_id} "
-                f"and therefore does not support dynamic adapter loading. "
-                f"Please initialize a new model instance from the base model in "
-                f"order to use the dynamic adapter loading feature."
-            )
-
-        for layer_name in self.adapter_layers:
-            if layer_name in self.layer_to_adapter_weights:
-                self.layer_to_adapter_weights[layer_name].remove_adapter(adapter_index)
-
-        self.loaded_adapters.remove(adapter_index)
diff --git a/server/text_generation_server/utils/merges/strategies.py b/server/text_generation_server/utils/merges/strategies.py
index 3b885313..9f8387ac 100644
--- a/server/text_generation_server/utils/merges/strategies.py
+++ b/server/text_generation_server/utils/merges/strategies.py
@@ -1,3 +1,7 @@
+# Origin: https://github.com/predibase/lorax
+# Path: lorax/server/lorax_server/utils/merges/strategies.py
+# License: Apache License Version 2.0, January 2004
+
 import copy
 from abc import ABC
 from collections import defaultdict
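To make the slicing logic documented in the `TensorParallelMultiAdapterLinear` comments concrete, here is a minimal standalone sketch (not part of the patch) of how `start_idx`/`end_idx` segments are derived per projection when the fused output is sharded across GPUs. The `compute_segments` helper and the example layer sizes are hypothetical illustrations; `world_size` stands in for `process_group.size()`.

```python
from typing import List, Tuple


def compute_segments(sizes: List[int], world_size: int) -> List[Tuple[int, int]]:
    """Return (start_idx, end_idx) per projection for one GPU's shard.

    Mirrors the loop in TensorParallelMultiAdapterLinear.forward: offsets are
    accumulated over the full (unsharded) output dimension and divided by the
    world size to get this rank's slice of the fused output tensor.
    """
    segments = []
    offset = 0
    for size in sizes:
        start_idx = offset // world_size
        offset += size
        end_idx = offset // world_size
        segments.append((start_idx, end_idx))
    return segments


if __name__ == "__main__":
    # Grouped-query attention example (hypothetical dims): q_proj is larger
    # than k_proj/v_proj, so each rank's shard of the fused qkv output is
    # sliced into unequal segments.
    sizes = [4096, 1024, 1024]  # q_proj, k_proj, v_proj output dims
    print(compute_segments(sizes, world_size=4))
    # -> [(0, 1024), (1024, 1280), (1280, 1536)]
```

When a segment's width differs from the LoRA projection this rank holds (for instance, when an adapter's rank does not divide evenly across GPUs), the patched `LoraLinear.forward` falls back to a zero tensor of the segment's shape before applying the adapter, as described in the first comment block above.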