# hf_text-generation-inference/server/text_generation_server/models/model.py

import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type
from transformers import PreTrainedTokenizerBase

from text_generation_server.models.types import Batch, GeneratedText
from text_generation_server.pb.generate_pb2 import InfoResponse

# Type variable constrained to Batch (or a subclass); used below to annotate
# the batch type a Model subclass declares and passes through generate_token.
B = TypeVar("B", bound=Batch)


class Model(ABC):
    """Abstract base class wrapping a torch module together with its tokenizer.

    The constructor stores the module (switched to eval mode), the tokenizer
    and the dtype/device/rank/world_size values it is given. Subclasses must
    provide ``batch_type`` and ``generate_token``.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
    ):
        """Store the model and its metadata, then verify the weights are loaded.

        Args:
            model: The torch module to serve; it is put in eval mode.
            tokenizer: Tokenizer used by ``decode_token``; its
                ``all_special_ids`` are cached as a set.
            requires_padding: Stored as-is and reported through ``info``.
            dtype: Stored as-is and reported (as a string) through ``info``.
            device: Stored as-is; its ``type`` is reported through ``info``.
            rank: Stored as-is (presumably this process's shard index --
                confirm against callers).
            world_size: Stored as-is (presumably the total number of shards --
                confirm against callers).

        Raises:
            RuntimeError: If any model parameter is still on the "meta" device
                (see ``check_initialized``).
        """
        if torch.cuda.is_available():
            # Set the per-process CUDA memory fraction to 1.0.
            torch.cuda.set_per_process_memory_fraction(1.0)

        self.model = model.eval()
        self.tokenizer = tokenizer
        # Kept as a set so membership tests are O(1).
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size

        # True when the module's forward() declares a `position_ids` parameter.
        self.has_position_ids = (
            "position_ids" in inspect.signature(model.forward).parameters
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        """Return an ``InfoResponse`` describing padding, dtype and device type."""
        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        """Return the ``Batch`` subclass this model works with (abstract)."""
        raise NotImplementedError

    @abstractmethod
    def generate_token(self, batch: B) -> Tuple[List[GeneratedText], Optional[B]]:
        """Run one generation step on ``batch`` (abstract).

        Per the annotation, implementations return a list of ``GeneratedText``
        and an optional batch.
        """
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        """Run a single ``generate_token`` call on ``batch`` and discard the result.

        This base implementation always returns ``None``.
        """
        self.generate_token(batch)
        return None

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers.

        Decodes ``all_input_ids[prefix_offset:]`` and returns only the text that
        extends beyond the decoding of ``all_input_ids[prefix_offset:read_offset]``.

        Args:
            all_input_ids: Every token id seen so far.
            prefix_offset: Start of the context window used for decoding.
            read_offset: Index up to which text has already been emitted.

        Returns:
            ``(new_text, new_prefix_offset, new_read_offset)``. When new text is
            emitted the offsets advance to ``(read_offset, len(all_input_ids))``.
            When nothing can be emitted yet, the text is ``""`` and the offsets
            are returned unchanged.
        """

        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False
        )
        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=False
        )

        # A trailing U+FFFD (replacement character) means it's a potential
        # unfinished byte sequence from byte fallback tokenization, so hold the
        # text back until more ids arrive.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model, and the text is emitted as-is.
        # The character is written as an escape so it survives re-encoding of
        # this source file.
        if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
            new_text = new_text[len(prefix_text) :]
            return new_text, read_offset, len(all_input_ids)
        else:
            return "", prefix_offset, read_offset

    def check_initialized(self):
        """Raise if any parameter of ``self.model`` is still on the "meta" device.

        Raises:
            RuntimeError: Listing the names of all such parameters, which are
                treated here as uninitialized.
        """
        meta_device = torch.device("meta")
        uninitialized_parameters = [
            n for n, p in self.model.named_parameters() if p.data.device == meta_device
        ]
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
+								import inspect
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								import torch
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								from abc import ABC, abstractmethod
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								from typing import List, Tuple, Optional, TypeVar, Type
-												fix(server): Minor refactorization using new_zeros (#24)

- Fix some type hints, in particular base tokenizer class
- Make use of `tensor.new_zero/empty` methods
- Simplify env var string parsing in launcher
											
										
										
											2023-01-17 01:10:22 -07:00
+								from transformers import PreTrainedTokenizerBase
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(clients): Python client (#103)


											
										
										
											2023-03-07 10:52:22 -07:00
+								from text_generation_server.models.types import Batch, GeneratedText
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								from text_generation_server.pb.generate_pb2 import InfoResponse
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								B = TypeVar("B", bound=Batch)
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								class Model(ABC):
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								    def __init__(
 								        self,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        model: torch.nn.Module,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								        tokenizer: PreTrainedTokenizerBase,
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        requires_padding: bool,
 								        dtype: torch.dtype,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								        device: torch.device,
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 07:48:21 -06:00
+								        rank: int = 0,
 								        world_size: int = 1,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								    ):
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 11:09:59 -06:00
+								        if torch.cuda.is_available():
 								            torch.cuda.set_per_process_memory_fraction(1.0)
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        self.model = model.eval()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								        self.tokenizer = tokenizer
-												feat(server): add special token bool (#85)


											
										
										
											2023-02-24 07:55:57 -07:00
+								        self.all_special_ids = set(tokenizer.all_special_ids)
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        self.requires_padding = requires_padding
 								        self.dtype = dtype
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								        self.device = device
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 07:48:21 -06:00
+								        self.rank = rank
 								        self.world_size = world_size
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
 								        self.has_position_ids = (
 								            inspect.signature(model.forward).parameters.get("position_ids", None)
 								            is not None
 								        )
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting check_unitialized.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 03:32:25 -06:00
+								        self.check_initialized()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								    @property
 								    def info(self) -> InfoResponse:
 								        return InfoResponse(
 								            requires_padding=self.requires_padding,
 								            dtype=str(self.dtype),
 								            device_type=self.device.type,
 								        )
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								    @property
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								    @abstractmethod
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								    def batch_type(self) -> Type[B]:
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								        raise NotImplementedError
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								    @abstractmethod
 								    def generate_token(self, batch: B) -> Tuple[List[GeneratedText], Optional[B]]:
 								        raise NotImplementedError
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 05:22:58 -07:00
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 01:31:25 -06:00
+								    def warmup(self, batch: B) -> Optional[int]:
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 11:09:59 -06:00
+								        self.generate_token(batch)
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 01:31:25 -06:00
+								        return None
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 11:09:59 -06:00
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
+								    def decode_token(
 								        self,
 								        all_input_ids: List[int],
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        prefix_offset: int = 0,
 								        read_offset: int = 0,
 								    ) -> Tuple[str, int, int]:
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 05:22:58 -07:00
+								        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        # The prefix text is necessary only to defeat cleanup algorithms in the decode
 								        # which decide to add a space or not depending on the surrounding ids.
 								        prefix_text = self.tokenizer.decode(
 								            all_input_ids[prefix_offset:read_offset], skip_special_tokens=False
 								        )
 								        new_text = self.tokenizer.decode(
 								            all_input_ids[prefix_offset:], skip_special_tokens=False
 								        )
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        if len(new_text) > len(prefix_text) and not new_text.endswith("<EFBFBD>"):
 								            # utf-8 char at the end means it's a potential unfinished byte sequence
 								            # from byte fallback tokenization.
 								            # If it's in the middle, it's probably a real invalid id generated
 								            # by the model
 								            new_text = new_text[len(prefix_text) :]
 								            return new_text, read_offset, len(all_input_ids)
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
+								        else:
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								            return "", prefix_offset, read_offset
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting check_unitialized.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 03:32:25 -06:00
 								    def check_initialized(self):
 								        uninitialized_parameters = []
 								        for n, p in self.model.named_parameters():
 								            if p.data.device == torch.device("meta"):
 								                uninitialized_parameters.append(n)
 								        if uninitialized_parameters:
 								            raise RuntimeError(
 								                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
 								            )