# hf_text-generation-inference/server/text_generation_server/models/model.py

import inspect
import torch

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional, TypeVar, Type
from transformers import PreTrainedTokenizerBase, PretrainedConfig

from text_generation_server.models.types import Batch, Generation
from text_generation_server.utils.speculate import get_speculate
from text_generation_server.pb.generate_pb2 import InfoResponse

B = TypeVar("B", bound=Batch)


class Model(ABC):
    """Abstract base class wrapping a torch module and its tokenizer.

    Subclasses must provide ``batch_type`` and ``generate_token``. The base
    class stores the runtime configuration, exposes it through ``info``,
    offers incremental token decoding (``decode_token``) and verifies at
    construction time that no parameter is left on the ``meta`` device.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        tokenizer: PreTrainedTokenizerBase,
        requires_padding: bool,
        dtype: torch.dtype,
        device: torch.device,
        rank: int = 0,
        world_size: int = 1,
        sliding_window: Optional[int] = None,
        speculate: Optional[int] = None,
    ):
        """Store the model, tokenizer and runtime settings.

        Args:
            model: Module to serve; it is switched to eval mode.
            tokenizer: Tokenizer used by ``decode_token``; its
                ``all_special_ids`` are cached as a set.
            requires_padding: Reported as-is through ``info``.
            dtype: Reported (stringified) through ``info``.
            device: Its ``type`` is reported through ``info``.
            rank: Stored on the instance (default ``0``).
            world_size: Stored on the instance (default ``1``).
            sliding_window: Optional window size reported through ``info``.
            speculate: Speculation setting; when ``None`` the value returned
                by ``get_speculate()`` is used instead.

        Raises:
            RuntimeError: If any model parameter is still on the ``meta``
                device (see ``check_initialized``).
        """
        self.model = model.eval()
        self.tokenizer = tokenizer
        # Cached as a set for O(1) membership tests.
        self.all_special_ids = set(tokenizer.all_special_ids)
        self.requires_padding = requires_padding
        self.dtype = dtype
        self.device = device
        self.rank = rank
        self.world_size = world_size
        self.sliding_window = sliding_window

        # Only an explicit ``None`` falls back to the global setting, so a
        # caller can still pass ``0`` deliberately.
        if speculate is None:
            speculate = get_speculate()
        self.speculate = speculate

        # True when the model's ``forward`` declares a ``position_ids`` parameter.
        self.has_position_ids = (
            "position_ids" in inspect.signature(model.forward).parameters
        )

        self.check_initialized()

    @property
    def info(self) -> InfoResponse:
        """Build the ``InfoResponse`` describing this model.

        Raises:
            NotImplementedError: If the model both requires padding and has
                a sliding window configured.
        """
        if self.requires_padding and self.sliding_window is not None:
            raise NotImplementedError("sliding_window is not implemented with padding")

        return InfoResponse(
            requires_padding=self.requires_padding,
            dtype=str(self.dtype),
            device_type=self.device.type,
            window_size=self.sliding_window,
            speculate=self.speculate,
        )

    @property
    @abstractmethod
    def batch_type(self) -> Type[B]:
        """Return the batch class used by this model (subclass-provided)."""
        raise NotImplementedError

    @abstractmethod
    def generate_token(self, batch: B) -> Tuple[List[Generation], Optional[B]]:
        """Run one generation step on ``batch`` (subclass-provided).

        Returns:
            A tuple of the produced generations and an optional batch.
        """
        raise NotImplementedError

    def warmup(self, batch: B) -> Optional[int]:
        """Run a single ``generate_token`` pass on ``batch``.

        The base implementation discards the result and returns ``None``.
        """
        self.generate_token(batch)
        return None

    def decode_token(
        self,
        all_input_ids: List[int],
        prefix_offset: int = 0,
        read_offset: int = 0,
        skip_special_tokens: bool = False,
    ) -> Tuple[str, int, int]:
        """Hack to hopefully support generate_stream for the maximum number of tokenizers.

        Decodes ``all_input_ids[prefix_offset:]`` and returns only the text
        that comes after the decoding of
        ``all_input_ids[prefix_offset:read_offset]``.

        Args:
            all_input_ids: Every token id seen so far.
            prefix_offset: Start of the window that is decoded.
            read_offset: Index up to which text was already emitted.
            skip_special_tokens: Forwarded to ``tokenizer.decode``.

        Returns:
            ``(new_text, new_prefix_offset, new_read_offset)``. When text is
            emitted the offsets advance to ``(read_offset,
            len(all_input_ids))``. Otherwise the text is ``""`` and the
            offsets are returned unchanged so the caller can retry once more
            ids are available.
        """
        # The prefix text is necessary only to defeat cleanup algorithms in the decode
        # which decide to add a space or not depending on the surrounding ids.
        prefix_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens,
        )
        new_text = self.tokenizer.decode(
            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
        )

        # U+FFFD (the Unicode replacement character) at the end means it's a
        # potential unfinished byte sequence from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model.
        # Written as an escape so the check survives any re-encoding of this file.
        if len(new_text) > len(prefix_text) and not new_text.endswith("\ufffd"):
            return new_text[len(prefix_text) :], read_offset, len(all_input_ids)

        return "", prefix_offset, read_offset

    def check_initialized(self):
        """Ensure every model parameter has been materialised.

        Raises:
            RuntimeError: Listing the names of all parameters whose data is
                still on the ``meta`` device.
        """
        meta_device = torch.device("meta")
        uninitialized_parameters = [
            n for n, p in self.model.named_parameters() if p.data.device == meta_device
        ]
        if uninitialized_parameters:
            raise RuntimeError(
                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
            )
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
+								import inspect
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								import torch
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								from abc import ABC, abstractmethod
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								from typing import List, Tuple, Optional, TypeVar, Type
-												feat(server): Add exllama GPTQ CUDA kernel support #553 (#666)

Just trying to get the integration tests to pass.


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------

Co-authored-by: Felix Marty <9808326+fxmarty@users.noreply.github.com>
											
										
										
											2023-07-21 02:59:00 -06:00
+								from transformers import PreTrainedTokenizerBase, PretrainedConfig
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												Fix typing in `Model.generate_token` (#733)

## What does this PR do?

This PR fixes a minor type annotation issue in the signature of
`Model.generate_token`.

All existing overrides of `Model.generate_token` return
`Tuple[List[Generation], Optional[B]]`:

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/causal_lm.py#L535-L537

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/flash_causal_lm.py#L802-L804

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/seq2seq_lm.py#L589-L591

I suspect that back in 017a2a8c when `GeneratedText` and `Generation`
were separated, the function signature was not updated.

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [x] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

CC @OlivierDehaene
											
										
										
											2023-07-31 06:35:14 -06:00
+								from text_generation_server.models.types import Batch, Generation
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								from text_generation_server.utils.speculate import get_speculate
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								from text_generation_server.pb.generate_pb2 import InfoResponse
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								B = TypeVar("B", bound=Batch)
-												feat: add cuda memory fraction (#659)

Close #673
											
										
										
											2023-07-24 03:43:58 -06:00
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								class Model(ABC):
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								    def __init__(
 								        self,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        model: torch.nn.Module,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								        tokenizer: PreTrainedTokenizerBase,
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        requires_padding: bool,
 								        dtype: torch.dtype,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								        device: torch.device,
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 07:48:21 -06:00
+								        rank: int = 0,
 								        world_size: int = 1,
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 01:55:47 -06:00
+								        sliding_window: Optional[int] = None,
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								        speculate: Optional[int] = None,
-												feat(server): optimize decode for sane tokenizers (#170)


											
										
										
											2023-04-12 04:03:10 -06:00
+								    ):
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        self.model = model.eval()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								        self.tokenizer = tokenizer
-												feat(server): add special token bool (#85)


											
										
										
											2023-02-24 07:55:57 -07:00
+								        self.all_special_ids = set(tokenizer.all_special_ids)
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        self.requires_padding = requires_padding
 								        self.dtype = dtype
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
+								        self.device = device
-												feat(server): shard token decode (#303)


											
										
										
											2023-05-10 07:48:21 -06:00
+								        self.rank = rank
 								        self.world_size = world_size
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 01:55:47 -06:00
+								        self.sliding_window = sliding_window
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								        if speculate is None:
 								            speculate = get_speculate()
 								        self.speculate = speculate
-												fix(server): fix has_position_ids (#395)

Fix #389
											
										
										
											2023-06-01 03:41:35 -06:00
+								        self.has_position_ids = (
 								            inspect.signature(model.forward).parameters.get("position_ids", None)
 								            is not None
 								        )
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting check_unitialized.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 03:32:25 -06:00
+								        self.check_initialized()
-												feat(server): Support generic AutoModelForCausalLM

											
										
										
											2022-11-04 07:22:47 -06:00
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								    @property
 								    def info(self) -> InfoResponse:
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 01:55:47 -06:00
+								        if self.requires_padding and self.sliding_window is not None:
 								            raise NotImplementedError("sliding_window is not implemented with padding")
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        return InfoResponse(
 								            requires_padding=self.requires_padding,
 								            dtype=str(self.dtype),
 								            device_type=self.device.type,
-												feat: add mistral model (#1071)


											
										
										
											2023-09-28 01:55:47 -06:00
+								            window_size=self.sliding_window,
-												Speculative (#1308)


											
										
										
											2023-12-11 04:46:30 -07:00
+								            speculate=self.speculate
-												feat(router): add device and dtype info (#215)


											
										
										
											2023-04-21 07:36:29 -06:00
+								        )
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								    @property
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								    @abstractmethod
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								    def batch_type(self) -> Type[B]:
-												fix(models): Revert buggy support for AutoModel

											
										
										
											2022-11-03 09:07:54 -06:00
+								        raise NotImplementedError
-												feat(server): Support all AutoModelForCausalLM on a best effort basis

											
										
										
											2022-10-28 11:24:00 -06:00
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								    @abstractmethod
-												Fix typing in `Model.generate_token` (#733)

## What does this PR do?

This PR fixes a minor type annotation issue in the signature of
`Model.generate_token`.

All existing overrides of `Model.generate_token` return
`Tuple[List[Generation], Optional[B]]`:

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/causal_lm.py#L535-L537

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/flash_causal_lm.py#L802-L804

https://github.com/huggingface/text-generation-inference/blob/3ef5ffbc6400370ff2e1546550a6bad3ac61b079/server/text_generation_server/models/seq2seq_lm.py#L589-L591

I suspect that back in 017a2a8c when `GeneratedText` and `Generation`
were separated, the function signature was not updated.

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [x] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

CC @OlivierDehaene
											
										
										
											2023-07-31 06:35:14 -06:00
+								    def generate_token(self, batch: B) -> Tuple[List[Generation], Optional[B]]:
-												feat(server): Support AutoModelForSeq2SeqLM

											
										
										
											2022-11-04 11:03:04 -06:00
+								        raise NotImplementedError
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 05:22:58 -07:00
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 01:31:25 -06:00
+								    def warmup(self, batch: B) -> Optional[int]:
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 11:09:59 -06:00
+								        self.generate_token(batch)
-												feat(server): auto max_batch_total_tokens for flash att models (#630)


											
										
										
											2023-07-19 01:31:25 -06:00
+								        return None
-												feat(server): add paged attention to flash models (#516)

Closes #478
											
										
										
											2023-06-30 11:09:59 -06:00
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
+								    def decode_token(
 								        self,
 								        all_input_ids: List[int],
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        prefix_offset: int = 0,
 								        read_offset: int = 0,
-												Remove the stripping of the prefix space (and any other mangling that tokenizers might do). (#1065)

Superseed #1024


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------

Co-authored-by: bangoz <ch_xie@pku.edu.cn>
											
										
										
											2023-09-27 04:13:45 -06:00
+								        skip_special_tokens: bool = False,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								    ) -> Tuple[str, int, int]:
-												fix(server): fix generate_stream by forcing tokens to be decoded correctly (#100)


											
										
										
											2023-03-06 05:22:58 -07:00
+								        """Hack to hopefully support generate_stream for the maximum number of tokenizers"""
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        # The prefix text is necessary only to defeat cleanup algorithms in the decode
 								        # which decide to add a space or not depending on the surrounding ids.
 								        prefix_text = self.tokenizer.decode(
-												feat: format code (#1070)


											
										
										
											2023-09-27 04:22:09 -06:00
+								            all_input_ids[prefix_offset:read_offset],
 								            skip_special_tokens=skip_special_tokens,
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        )
 								        new_text = self.tokenizer.decode(
-												Remove the stripping of the prefix space (and any other mangling that tokenizers might do). (#1065)

Superseed #1024


# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->

---------

Co-authored-by: bangoz <ch_xie@pku.edu.cn>
											
										
										
											2023-09-27 04:13:45 -06:00
+								            all_input_ids[prefix_offset:], skip_special_tokens=skip_special_tokens
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        )
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								        if len(new_text) > len(prefix_text) and not new_text.endswith("<EFBFBD>"):
 								            # utf-8 char at the end means it's a potential unfinished byte sequence
 								            # from byte fallback tokenization.
 								            # If it's in the middle, it's probably a real invalid id generated
 								            # by the model
 								            new_text = new_text[len(prefix_text) :]
 								            return new_text, read_offset, len(all_input_ids)
-												feat(server): add flash attention llama (#144)


											
										
										
											2023-04-11 08:38:22 -06:00
+								        else:
-												fix(server): fix decode token (#334)

Fixes #333

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
											
										
										
											2023-05-16 15:23:27 -06:00
+								            return "", prefix_offset, read_offset
-												Lifting check_unitialized. (#325)

# What does this PR do?

Lifting check_unitialized.

<!--
Congratulations! You've made it this far! You're not quite done yet
though.

Once merged, your PR is going to appear in the release notes with the
title you set, so make sure it's a great title that fully reflects the
extent of your awesome contribution.

Then, please replace this with a description of the change and which
issue is fixed (if applicable). Please also include relevant motivation
and context. List any dependencies (if any) that are required for this
change.

Once you're done, someone will review your PR shortly (see the section
"Who can review?" below to tag some potential reviewers). They may
suggest changes to make the code even better. If no one reviewed your PR
after a week has passed, don't hesitate to post a new comment
@-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the
other checks if that's the case).
- [ ] Did you read the [contributor
guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
[forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes?
Here are the
[documentation
guidelines](https://github.com/huggingface/transformers/tree/main/docs),
and
[here are tips on formatting
docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have
passed. Feel free to tag
members/contributors who may be interested in your PR.

<!-- Your PR will be replied to more quickly if you can figure out the
right person to tag with @


@OlivierDehaene OR @Narsil

 -->
											
										
										
											2023-05-15 03:32:25 -06:00
 								    def check_initialized(self):
 								        uninitialized_parameters = []
 								        for n, p in self.model.named_parameters():
 								            if p.data.device == torch.device("meta"):
 								                uninitialized_parameters.append(n)
 								        if uninitialized_parameters:
 								            raise RuntimeError(
 								                f"found uninitialized parameters in model {self.__class__.__name__}: {uninitialized_parameters}"
 								            )