# hf_text-generation-inference/server/text_generation_server/models/model.py

import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type, Dict
from collections import defaultdict
from transformers import PreTrainedTokenizerBase

from text_generation_server.models.types import Batch, Generation
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.pb.generate_pb2 import InfoResponse
from text_generation_server.adapters.weights import LayerAdapterWeights

# Adapter id used as the default value of `adapter_id` in `Model.__init__`.
# NOTE(review): presumably this denotes "no adapter, plain base model" --
# confirm against the adapter-loading code.
BASE_MODEL_ADAPTER_ID = "__base_model__"


# Type variable bound to `Batch`; used in the annotations of
# `Model.batch_type`, `Model.generate_token` and `Model.warmup`.
B = TypeVar("B", bound=Batch)


class Model(ABC):
    """Abstract base class pairing a torch module with its tokenizer.

    Subclasses must provide `batch_type` and `generate_token`. The base class
    stores the runtime configuration (dtype, device, sharding rank, sliding
    window, speculation) and offers incremental token decoding helpers.
    """

    def __init__(
        self,
        model_id: str,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
        sliding_window: Optional[int] = None,
        speculate: Optional[int] = None,
        adapter_id: str = BASE_MODEL_ADAPTER_ID,
    ):
        """Store the model, tokenizer and runtime settings.

        Args:
            model_id: Identifier of the model; stored as-is.
            model: The torch module; it is switched to eval mode.
            tokenizer: Tokenizer used by `decode_token` and to collect the
                set of special token ids.
            requires_padding: Reported through `info`.
            dtype: Reported (as a string) through `info`.
            device: Device whose `type` is reported through `info`.
            rank: Stored as `self.rank`.
            world_size: Stored as `self.world_size`.
            sliding_window: Window size; the sentinel `-1` is normalised to
                `None`.
            speculate: Speculation setting; when `None` the value returned by
                `get_speculate()` is used instead.
            adapter_id: Stored as `self.static_adapter_id`.

        Raises:
            RuntimeError: If any model parameter still lives on the `meta`
                device (see `check_initialized`).
        """
        self.model_id = model_id
        self.model = model.eval()
        self.tokenizer = tokenizer

        # all_special_ids is not set correctly if the rust tokenizer is unpacked
        # TODO report this to transformers.
        other_special_ids = {
            token_id
            for token_id, token in tokenizer.added_tokens_decoder.items()
            if token.special
        }
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.all_special_ids.update(other_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size
        # -1 is treated as "no sliding window".
        self.sliding_window = sliding_window if sliding_window != -1 else None

        # Per-layer adapter weights, created lazily on first access by layer name.
        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
            LayerAdapterWeights
        )
        self.loaded_adapters = set()
        self.static_adapter_id = adapter_id

        if speculate is None:
            speculate = get_speculate()
        self.speculate = speculate

        # Whether the model's forward() accepts a `position_ids` argument.
        self.has_position_ids = (
            "position_ids" in inspect.signature(model.forward).parameters
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        """Build the `InfoResponse` describing this model's runtime settings.

        Raises:
            NotImplementedError: If the model requires padding and a sliding
                window is configured at the same time.
        """
        if self.requires_padding and self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
            window_size=self.sliding_window,
            speculate=self.speculate,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        """Return the concrete `Batch` subclass handled by this model."""
        raise NotImplementedError

    @abstractmethod
    def generate_token(
        self, batch: B
    ) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
        """Run one generation step on `batch`.

        Returns a tuple of the generations produced, an optional batch, and a
        pair of integers.
        NOTE(review): the meaning of the integer pair and of a `None` batch is
        defined by the subclasses -- confirm there.
        """
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        """Run a single `generate_token` step on `batch` and return `None`."""
        self.generate_token(batch)
        return None

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
        skip_special_tokens: bool = False,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers

        Decodes `all_input_ids[prefix_offset:]` and returns only the text that
        follows the already-emitted prefix `all_input_ids[prefix_offset:read_offset]`.

        Returns:
            A `(text, prefix_offset, read_offset)` tuple. When new text is
            available, `text` is the new suffix and the offsets advance to
            `(read_offset, len(all_input_ids))`. Otherwise `text` is empty and
            the offsets are returned unchanged.
        """

        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
        )
        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
        )

        # U+FFFD (the Unicode replacement character) at the end means it's a
        # potential unfinished byte sequence from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model.
        if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
            return new_text[len(prefix_text) :], read_offset, len(all_input_ids)

        # Nothing safe to emit yet: keep the offsets so the next call retries.
        return "", prefix_offset, read_offset

    def check_initialized(self):
        """Ensure no model parameter is left on the `meta` device.

        Raises:
            RuntimeError: Listing the names of all parameters still on `meta`.
        """
        meta_device = torch.device("meta")
        uninitialized_parameters = [
            name
            for name, param in self.model.named_parameters()
            if param.data.device == meta_device
        ]
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
+								import inspect
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								import torch
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								from abc import ABC, abstractmethod
-												feat: add ruff and resolve issue (#2262)

* feat: add ruff and resolve issue

* fix: update client exports and adjust after rebase

* fix: adjust syntax to avoid circular import

* fix: adjust client ruff settings

* fix: lint and refactor import check and avoid model enum as global names

* fix: improve fbgemm_gpu check and lints

* fix: update lints

* fix: prefer comparing model enum over str

* fix: adjust lints and ignore specific rules

* fix: avoid unneeded quantize check
											
										
										
											2024-07-26 08:29:09 -06:00
+								from typing import List, Tuple, Optional, TypeVar, Type, Dict
-												Enable multiple LoRa adapters (#2010)

* feat: first draft load multiple lora

* feat: load weights within layer and refactor lora pass

* fix: refactor and reduce lora math

* feat: baseline impl single request multi lora support

* feat: prefer lorax implementation and port loading logic

* fix: prefer adapter_data and refactors

* feat: perfer loraxs custom punica kernels and add mlp loras

* fix: adjust batch for bgmv

* fix: adjust adapter_segments logic when in batch

* fix: refactor and move changes to v3 proto

* fix: pass model_id for all flash causal lms

* fix: pass model_id for all causal and seq2seq lms

* fix: add model_id to model test

* feat: add lora support to mistral and refactors

* feat: prefer model id in request

* fix: include rust code for adapter id

* feat: bump launcher and add new lora docs

* feat: support base model generation and refactors

* fix: rename doc to retry ci build

* feat: support if vlm models

* fix: add adapter_data param and avoid missing layers

* fix: add adapter_data param to phi and neox

* fix: update all models forwards to include adapter_data

* fix: add model_id to IdeficsCausalLM

* Update lora.md

Fixed a typo

* Update lora.md

Fixing spam image

* fix: add lora kernel to dockerfile, support running without kernels and refactors

* fix: avoid dockerfile conflict

* fix: refactors and adjust flash llama lora logic

* fix: skip llama test due to CI issue (temp)

* fix: skip llama test CI (temp) 2

* fix: revert skips and prefer updated ci token for tests

* fix: refactors and helpful comments

* fix: add noop in TensorParallelAdapterRowLinear too

* fix: refactor and move shard_lora_weights logic

* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <datavistics@gmail.com>
											
										
										
											2024-06-25 12:46:27 -06:00
+								from collections import defaultdict
-												fix: refactor adapter weight loading and mapping (#2193)

* fix: refactor adapter weight loading and mapping

* feat: enable lora load from directory

* fix: adjust launcher for local lora adapters

* feat: improve weight loading and add tests

* fix: improve logging and rebase syntax issue

* fix: impove adapter merge comments and remove unused conditional

* fix: improve get_model_with_lora_adapters naming

* fix: comment typo
											
										
										
											2024-07-24 13:32:14 -06:00
+								from transformers import PreTrainedTokenizerBase
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												Fix typing in `Model.generate_token` (#733)

## What does this PR do?

This PR fixes a minor type annotation issue in the signature of
`Model.generate_token`.

All existing overrides of `Model.generate_token` return
`Tuple[List[Generation], Optional[B]]`:

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/causal_lm.py#L535-L537

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/flash_causal_lm.py#L802-L804

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/seq2seq_lm.py#L589-L591

I suspect that back in 017a2a8c when `GeneratedText` and `Generation`
were separated, the function signature was not updated.

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [x] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

CC @OlivierDehaene
											
										
										
											2023-07-31 06:35:14 -06:00
+								from text_generation_server.models.types import Batch, Generation
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								from text_generation_server.utils.speculate import get_speculate
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								from text_generation_server.pb.generate_pb2 import InfoResponse
-												Enable multiple LoRa adapters (#2010)

* feat: first draft load multiple lora

* feat: load weights within layer and refactor lora pass

* fix: refactor and reduce lora math

* feat: baseline impl single request multi lora support

* feat: prefer lorax implementation and port loading logic

* fix: prefer adapter_data and refactors

* feat: perfer loraxs custom punica kernels and add mlp loras

* fix: adjust batch for bgmv

* fix: adjust adapter_segments logic when in batch

* fix: refactor and move changes to v3 proto

* fix: pass model_id for all flash causal lms

* fix: pass model_id for all causal and seq2seq lms

* fix: add model_id to model test

* feat: add lora support to mistral and refactors

* feat: prefer model id in request

* fix: include rust code for adapter id

* feat: bump launcher and add new lora docs

* feat: support base model generation and refactors

* fix: rename doc to retry ci build

* feat: support if vlm models

* fix: add adapter_data param and avoid missing layers

* fix: add adapter_data param to phi and neox

* fix: update all models forwards to include adapter_data

* fix: add model_id to IdeficsCausalLM

* Update lora.md

Fixed a typo

* Update lora.md

Fixing spam image

* fix: add lora kernel to dockerfile, support running without kernels and refactors

* fix: avoid dockerfile conflict

* fix: refactors and adjust flash llama lora logic

* fix: skip llama test due to CI issue (temp)

* fix: skip llama test CI (temp) 2

* fix: revert skips and prefer updated ci token for tests

* fix: refactors and helpful comments

* fix: add noop in TensorParallelAdapterRowLinear too

* fix: refactor and move shard_lora_weights logic

* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <datavistics@gmail.com>
											
										
										
											2024-06-25 12:46:27 -06:00
+								from text_generation_server.adapters.weights import LayerAdapterWeights
 								BASE_MODEL_ADAPTER_ID = "__base_model__"
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								B = TypeVar("B", bound=Batch)
-												feat: add cuda memory fraction (#659)

Close #673
											
										
										
											2023-07-24 03:43:58 -06:00
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								class Model(ABC):
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								    def __init__(
 								        self,
-												Enable multiple LoRa adapters (#2010)

* feat: first draft load multiple lora

* feat: load weights within layer and refactor lora pass

* fix: refactor and reduce lora math

* feat: baseline impl single request multi lora support

* feat: prefer lorax implementation and port loading logic

* fix: prefer adapter_data and refactors

* feat: perfer loraxs custom punica kernels and add mlp loras

* fix: adjust batch for bgmv

* fix: adjust adapter_segments logic when in batch

* fix: refactor and move changes to v3 proto

* fix: pass model_id for all flash causal lms

* fix: pass model_id for all causal and seq2seq lms

* fix: add model_id to model test

* feat: add lora support to mistral and refactors

* feat: prefer model id in request

* fix: include rust code for adapter id

* feat: bump launcher and add new lora docs

* feat: support base model generation and refactors

* fix: rename doc to retry ci build

* feat: support if vlm models

* fix: add adapter_data param and avoid missing layers

* fix: add adapter_data param to phi and neox

* fix: update all models forwards to include adapter_data

* fix: add model_id to IdeficsCausalLM

* Update lora.md

Fixed a typo

* Update lora.md

Fixing spam image

* fix: add lora kernel to dockerfile, support running without kernels and refactors

* fix: avoid dockerfile conflict

* fix: refactors and adjust flash llama lora logic

* fix: skip llama test due to CI issue (temp)

* fix: skip llama test CI (temp) 2

* fix: revert skips and prefer updated ci token for tests

* fix: refactors and helpful comments

* fix: add noop in TensorParallelAdapterRowLinear too

* fix: refactor and move shard_lora_weights logic

* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <datavistics@gmail.com>
											
										
										
											2024-06-25 12:46:27 -06:00
+								        model_id: str,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        model: torch.nn.Module,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								        tokenizer: PreTrainedTokenizerBase,
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        requires_padding: bool,
 								        dtype: torch.dtype,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								        device: torch.device,
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 07:48:21 -06:00
+								        rank: int = 0,
 								        world_size: int = 1,
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 01:55:47 -06:00
+								        sliding_window: Optional[int] = None,
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								        speculate: Optional[int] = None,
-												Enable multiple LoRa adapters (#2010)

* feat: first draft load multiple lora

* feat: load weights within layer and refactor lora pass

* fix: refactor and reduce lora math

* feat: baseline impl single request multi lora support

* feat: prefer lorax implementation and port loading logic

* fix: prefer adapter_data and refactors

* feat: perfer loraxs custom punica kernels and add mlp loras

* fix: adjust batch for bgmv

* fix: adjust adapter_segments logic when in batch

* fix: refactor and move changes to v3 proto

* fix: pass model_id for all flash causal lms

* fix: pass model_id for all causal and seq2seq lms

* fix: add model_id to model test

* feat: add lora support to mistral and refactors

* feat: prefer model id in request

* fix: include rust code for adapter id

* feat: bump launcher and add new lora docs

* feat: support base model generation and refactors

* fix: rename doc to retry ci build

* feat: support if vlm models

* fix: add adapter_data param and avoid missing layers

* fix: add adapter_data param to phi and neox

* fix: update all models forwards to include adapter_data

* fix: add model_id to IdeficsCausalLM

* Update lora.md

Fixed a typo

* Update lora.md

Fixing spam image

* fix: add lora kernel to dockerfile, support running without kernels and refactors

* fix: avoid dockerfile conflict

* fix: refactors and adjust flash llama lora logic

* fix: skip llama test due to CI issue (temp)

* fix: skip llama test CI (temp) 2

* fix: revert skips and prefer updated ci token for tests

* fix: refactors and helpful comments

* fix: add noop in TensorParallelAdapterRowLinear too

* fix: refactor and move shard_lora_weights logic

* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <datavistics@gmail.com>
											
										
										
											2024-06-25 12:46:27 -06:00
+								        adapter_id: str = BASE_MODEL_ADAPTER_ID,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								    ):
-												Enable multiple LoRa adapters (#2010)

* feat: first draft load multiple lora

* feat: load weights within layer and refactor lora pass

* fix: refactor and reduce lora math

* feat: baseline impl single request multi lora support

* feat: prefer lorax implementation and port loading logic

* fix: prefer adapter_data and refactors

* feat: perfer loraxs custom punica kernels and add mlp loras

* fix: adjust batch for bgmv

* fix: adjust adapter_segments logic when in batch

* fix: refactor and move changes to v3 proto

* fix: pass model_id for all flash causal lms

* fix: pass model_id for all causal and seq2seq lms

* fix: add model_id to model test

* feat: add lora support to mistral and refactors

* feat: prefer model id in request

* fix: include rust code for adapter id

* feat: bump launcher and add new lora docs

* feat: support base model generation and refactors

* fix: rename doc to retry ci build

* feat: support if vlm models

* fix: add adapter_data param and avoid missing layers

* fix: add adapter_data param to phi and neox

* fix: update all models forwards to include adapter_data

* fix: add model_id to IdeficsCausalLM

* Update lora.md

Fixed a typo

* Update lora.md

Fixing spam image

* fix: add lora kernel to dockerfile, support running without kernels and refactors

* fix: avoid dockerfile conflict

* fix: refactors and adjust flash llama lora logic

* fix: skip llama test due to CI issue (temp)

* fix: skip llama test CI (temp) 2

* fix: revert skips and prefer updated ci token for tests

* fix: refactors and helpful comments

* fix: add noop in TensorParallelAdapterRowLinear too

* fix: refactor and move shard_lora_weights logic

* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <datavistics@gmail.com>
											
										
										
											2024-06-25 12:46:27 -06:00
+								        self.model_id = model_id
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        self.model = model.eval()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								        self.tokenizer = tokenizer
-												Use the generation config. (#1808)

# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2024-04-25 11:41:50 -06:00
 								        # all_special_ids is not set correctly if the rust tokenizer is unpacked
 								        # TODO report this to transformers.
 								        other_special_ids = {
 								            id for id, token in tokenizer.added_tokens_decoder.items() if token.special
 								        }
-												feat(server): add special token bool (#85)


											
										
										
											2023-02-24 07:55:57 -07:00
+								        self.all_special_ids = set(tokenizer.all_special_ids)
-												Use the generation config. (#1808)

# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2024-04-25 11:41:50 -06:00
+								        self.all_special_ids.update(other_special_ids)
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        self.requires_padding = requires_padding
 								        self.dtype = dtype
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								        self.device = device
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 07:48:21 -06:00
+								        self.rank = rank
 								        self.world_size = world_size
-												fix: fix logic if sliding window key is not present in config (#1352)


											
										
										
											2023-12-15 06:56:17 -07:00
+								        self.sliding_window = sliding_window if sliding_window != -1 else None
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
-												Enable multiple LoRa adapters (#2010)

* feat: first draft load multiple lora

* feat: load weights within layer and refactor lora pass

* fix: refactor and reduce lora math

* feat: baseline impl single request multi lora support

* feat: prefer lorax implementation and port loading logic

* fix: prefer adapter_data and refactors

* feat: perfer loraxs custom punica kernels and add mlp loras

* fix: adjust batch for bgmv

* fix: adjust adapter_segments logic when in batch

* fix: refactor and move changes to v3 proto

* fix: pass model_id for all flash causal lms

* fix: pass model_id for all causal and seq2seq lms

* fix: add model_id to model test

* feat: add lora support to mistral and refactors

* feat: prefer model id in request

* fix: include rust code for adapter id

* feat: bump launcher and add new lora docs

* feat: support base model generation and refactors

* fix: rename doc to retry ci build

* feat: support if vlm models

* fix: add adapter_data param and avoid missing layers

* fix: add adapter_data param to phi and neox

* fix: update all models forwards to include adapter_data

* fix: add model_id to IdeficsCausalLM

* Update lora.md

Fixed a typo

* Update lora.md

Fixing spam image

* fix: add lora kernel to dockerfile, support running without kernels and refactors

* fix: avoid dockerfile conflict

* fix: refactors and adjust flash llama lora logic

* fix: skip llama test due to CI issue (temp)

* fix: skip llama test CI (temp) 2

* fix: revert skips and prefer updated ci token for tests

* fix: refactors and helpful comments

* fix: add noop in TensorParallelAdapterRowLinear too

* fix: refactor and move shard_lora_weights logic

* fix: exit early if no adapter_data

---------

Co-authored-by: Derek <datavistics@gmail.com>
											
										
										
											2024-06-25 12:46:27 -06:00
+								        self.layer_to_adapter_weights: Dict[str, LayerAdapterWeights] = defaultdict(
 								            LayerAdapterWeights
 								        )
 								        self.loaded_adapters = set()
 								        self.static_adapter_id = adapter_id
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								        if speculate is None:
 								            speculate = get_speculate()
 								        self.speculate = speculate
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
+								        self.has_position_ids = (
 								            inspect.signature(model.forward).parameters.get("position_ids", None)
 								            is not None
 								        )
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting check_unitialized.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 03:32:25 -06:00
+								        self.check_initialized()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
@property
def info(self) -> InfoResponse:
    """Describe this model's serving configuration as an ``InfoResponse``.

    Returns:
        An ``InfoResponse`` filled from ``requires_padding``, the stringified
        ``dtype``, ``device.type``, ``sliding_window`` (as ``window_size``)
        and ``speculate``.

    Raises:
        NotImplementedError: If the model requires padding while a sliding
            window is configured.
    """
    if self.requires_padding:
        # Padding and sliding-window attention are rejected as a combination.
        if self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

    fields = dict(
        requires_padding=self.requires_padding,
        dtype=str(self.dtype),
        device_type=self.device.type,
        window_size=self.sliding_window,
        speculate=self.speculate,
    )
    return InfoResponse(**fields)
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
@property
@abstractmethod
def batch_type(self) -> Type[B]:
    """Return the batch class this model operates on.

    Abstract: this base implementation only raises ``NotImplementedError``.
    """
    raise NotImplementedError
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
@abstractmethod
def generate_token(
    self,
    batch: B,
) -> Tuple[List[Generation], Optional[B], Tuple[int, int]]:
    """Run one generation step on ``batch``.

    Abstract: this base implementation only raises ``NotImplementedError``.

    Returns:
        A 3-tuple of the produced ``Generation`` objects, an optional batch,
        and a pair of ints.
        NOTE(review): the optional batch is presumably the batch to continue
        with and the int pair presumably carries timings - confirm against
        the concrete implementations.
    """
    raise NotImplementedError
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 05:22:58 -07:00
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 01:31:25 -06:00
def warmup(self, batch: B) -> Optional[int]:
    """Exercise the model once by running ``generate_token`` on ``batch``.

    The result of ``generate_token`` is discarded; this base implementation
    always returns ``None``.
    """
    _ = self.generate_token(batch)
    return None
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 11:09:59 -06:00
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
def decode_token(
    self,
    all_input_ids: List[int],
    prefix_offset: int = 0,
    read_offset: int = 0,
    skip_special_tokens: bool = False,
) -> Tuple[str, int, int]:
    """Hack to hopefully support generate_stream for the maximum number of tokenizers.

    Decodes ``all_input_ids[prefix_offset:]`` and returns only the text that
    follows the decoding of ``all_input_ids[prefix_offset:read_offset]``.

    Args:
        all_input_ids: Every token id seen so far.
        prefix_offset: Start of the window of ids that gets decoded.
        read_offset: Ids before this index (within the window) form the
            prefix whose text is stripped from the result.
        skip_special_tokens: Forwarded to ``tokenizer.decode``.

    Returns:
        ``(text, prefix_offset, read_offset)``. When new text is available
        the offsets are ``(read_offset, len(all_input_ids))``; otherwise the
        text is empty and the incoming offsets are returned unchanged.
    """
    # The prefix text is necessary only to defeat cleanup algorithms in the decode
    # which decide to add a space or not depending on the surrounding ids.
    prefix_text = self.tokenizer.decode(
        all_input_ids[prefix_offset:read_offset],
        skip_special_tokens=skip_special_tokens,
    )
    new_text = self.tokenizer.decode(
        all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
    )

    # U+FFFD (the Unicode replacement character) at the end means it's a
    # potential unfinished byte sequence from byte fallback tokenization, so
    # nothing is emitted yet. If it's in the middle, it's probably a real
    # invalid id generated by the model.
    # Written as an escape so the literal cannot be mangled by re-encoding.
    if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
        return new_text[len(prefix_text) :], read_offset, len(all_input_ids)

    return "", prefix_offset, read_offset
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting `check_initialized` (the check for uninitialized parameters).

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 03:32:25 -06:00
 								    def check_initialized(self):
 								        """Fail if any parameter of ``self.model`` is still on the ``meta`` device.

 								        Raises:
 								            RuntimeError: Listing the names of every parameter whose data
 								                lives on ``torch.device("meta")``.
 								        """
 								        meta_device = torch.device("meta")
 								        still_on_meta = [
 								            name
 								            for name, param in self.model.named_parameters()
 								            if param.data.device == meta_device
 								        ]
 								        if not still_on_meta:
 								            return
 								        raise RuntimeError(
 								            f"found uninitialized parameters in model {self.__class__.__name__}: {still_on_meta}"
 								        )