hf_text-generation-inference/server/text_generation_server/models/galactica.py

import re
import torch
import torch.distributed


from transformers import (
    PreTrainedTokenizerBase,
)
from text_generation_server.models.causal_lm import CausalLMBatch
from text_generation_server.pb import generate_pb2
from text_generation_server.utils import (
    NextTokenChooser,
    StoppingCriteria,
)
from text_generation_server.utils.chunks import concat_text_chunks

# CREDIT: Papers with code => https://github.com/paperswithcode/galai/blob/main/galai/utils.py

# we split individual characters inside special tokens like [START_DNA]
# The pattern matches one delimited span, e.g. [START_DNA]...[END_DNA].
# Capture groups: 1 = opening tag, 2 = sequence kind (DNA, SMILES, I_SMILES or
# AMINO), 3 = the sequence body, 4 = closing tag. The back-reference \2 forces
# the closing tag to name the same kind as the opening tag. The body is matched
# lazily, so it ends at the first matching closing tag, and because re.DOTALL is
# not set the body never spans a newline.
CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA|SMILES|I_SMILES|AMINO)])(.*?)(\[END_\2])")

# token added to implement a custom sequence tokenization. This token is added at
# corpus cleaning step and removed in pretokenization. The digits are added to increase the chance
# that they do not occur in the corpus. The digits are escaped so that the token does not appear
# literally in the source code in case we ever include it in the training data.
# (Each `{1}` below is an f-string placeholder that interpolates the digit 1.)
SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"


def _insert_split_marker(m: re.Match):
    """
    Rebuild one matched custom sequence with the split marker placed before
    every character of its body and once more before the closing tag.
    Intended as the replacement callback for a regex substitution over
    special tokens such as [START_DNA].
    Parameters
    ----------
    m : re.Match
        Match with four groups: opening tag, sequence kind (unused here),
        sequence body, closing tag
    Returns
    ----------
    str - the matched text with the split token added
    """
    opening_tag, _kind, body, closing_tag = m.groups()
    # Prefix each character (of any kind, newlines included) with the marker.
    marked_body = "".join(f"{SPLIT_MARKER}{char}" for char in body)
    return opening_tag + marked_body + SPLIT_MARKER + closing_tag


def escape_custom_split_sequence(text):
    """
    Insert the split marker into every [START_*]...[END_*] custom sequence
    found in the text (GALILEO's tokenization scheme); text outside such
    sequences is returned unchanged.
    Parameters
    ----------
    text : str
        Input text that may contain custom sequences
    Returns
    ----------
    str - the text with the split token added inside each custom sequence
    """
    escaped_text = re.sub(CUSTOM_SEQ_RE, _insert_split_marker, text)
    return escaped_text


# END CREDIT


class GalacticaCausalLMBatch(CausalLMBatch):
    """CausalLMBatch whose inputs are escaped with the custom split marker
    before tokenization."""

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "GalacticaCausalLMBatch":
        """Build a batch from its protobuf description.

        Parameters
        ----------
        pb : generate_pb2.Batch
            Batch message; its ``requests``, ``id`` and ``size`` fields are read
        tokenizer : PreTrainedTokenizerBase
            Tokenizer used to encode the escaped request texts
        dtype : torch.dtype
            Not used by this method; kept for signature compatibility
        device : torch.device
            Device the tokenized tensors and ``top_n_tokens_tensor`` are placed on
        Returns
        ----------
        GalacticaCausalLMBatch - the batch, with ``past_key_values`` set to None
        """
        inputs = []
        next_token_choosers = []
        stopping_criterias = []
        top_n_tokens = []
        requests_idx_mapping = {}

        # Parse batch
        max_truncation = 0
        padding_right_offset = 0
        max_decode_tokens = 0
        for i, r in enumerate(pb.requests):
            requests_idx_mapping[r.id] = i
            # Add escape_custom_split_sequence to the CausalLMBatch logic
            inputs.append(
                escape_custom_split_sequence(concat_text_chunks(r.input_chunks.chunks))
            )
            next_token_choosers.append(
                NextTokenChooser.from_pb(r.parameters, device, tokenizer)
            )
            stopping_criteria = StoppingCriteria.from_pb(
                r.stopping_parameters, tokenizer
            )
            stopping_criterias.append(stopping_criteria)
            top_n_tokens.append(r.top_n_tokens)
            max_truncation = max(max_truncation, r.truncate)
            max_decode_tokens += stopping_criteria.max_new_tokens
            padding_right_offset = max(
                padding_right_offset, stopping_criteria.max_new_tokens
            )

        tokenized_inputs = tokenizer(
            inputs,
            return_tensors="pt",
            padding=True,
            return_token_type_ids=False,
            truncation=True,
            max_length=max_truncation,
        ).to(device)

        input_ids = tokenized_inputs["input_ids"]
        tokenizer_attention_mask = tokenized_inputs["attention_mask"]

        # Every request shares the same padded width, so the offsets are
        # identical for all of them; compute the width once.
        input_len = input_ids.shape[1]
        prefix_offsets = [0] * len(inputs)
        read_offsets = [input_len] * len(inputs)

        input_lengths = tokenizer_attention_mask.sum(1)
        # Convert to a plain int once so the derived sizes below (and
        # max_tokens) are Python ints rather than 0-dim tensors.
        max_input_length = input_lengths.max().item()

        # Allocate maximum attention_mask
        attention_mask = input_ids.new_zeros(
            (pb.size, max_input_length + padding_right_offset)
        )
        # Copy tokenizer attention_mask into fully allocated attention_mask
        attention_mask[:, :max_input_length] = tokenizer_attention_mask

        # 0-based positions over the attended tokens; masked (padding)
        # positions are overwritten with 1.
        position_ids = tokenizer_attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(tokenizer_attention_mask == 0, 1)
        # One (seq_len, 1) column tensor per request.
        all_input_ids = input_ids.T.split(1, dim=1)
        top_n_tokens_tensor = torch.tensor(
            top_n_tokens, device=device, dtype=torch.int64
        )

        max_tokens = len(inputs) * max_input_length + max_decode_tokens

        return cls(
            batch_id=pb.id,
            requests=pb.requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=None,
            all_input_ids=list(all_input_ids),
            input_lengths=input_lengths.tolist(),
            prefix_offsets=prefix_offsets,
            read_offsets=read_offsets,
            next_token_choosers=next_token_choosers,
            stopping_criterias=stopping_criterias,
            top_n_tokens=top_n_tokens,
            top_n_tokens_tensor=top_n_tokens_tensor,
            max_input_length=max_input_length,
            padding_right_offset=padding_right_offset,
            max_tokens=max_tokens,
        )
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`import re`
			`import torch`
			`import torch.distributed`


feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`from transformers import (`
			`PreTrainedTokenizerBase,`
			`)`
feat(clients): Python client (#103) 2023-03-07 10:52:22 -07:00			`from text_generation_server.models.causal_lm import CausalLMBatch`
feat(server): support OPT models (#55) OPT models do not all have a `tokenizer.json` file on the hub at the moment. Can't merge for now. 2023-04-11 11:16:41 -06:00			`from text_generation_server.pb import generate_pb2`
feat(clients): Python client (#103) 2023-03-07 10:52:22 -07:00			`from text_generation_server.utils import (`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`NextTokenChooser,`
			`StoppingCriteria,`
			`)`
server: use chunked inputs The router will now send the input as chunks besides as a single string. This change modifies the server to process chunked input rather than strings. This also allows us to remove the image extraction code from the server. 2024-05-31 05:51:42 -06:00			`from text_generation_server.utils.chunks import concat_text_chunks`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00
			`# CREDIT: Papers with code => https://github.com/paperswithcode/galai/blob/main/galai/utils.py`

			`# we split individual characters inside special tokens like [START_DNA]`
			`CUSTOM_SEQ_RE = re.compile(r"(\[START_(DNA\|SMILES\|I_SMILES\|AMINO)])(.*?)(\[END_\2])")`

			`# token added to implement a custom sequence tokenization. This token is added at`
			`# corpus cleaning step and removed in pretokenization. The digits are added to increase the chance`
			`# that they do not occur in the corpus. The digits are escaped so that the token does not appear`
			`# literally in the source code in case we ever include it in the training data.`
			`SPLIT_MARKER = f"SPL{1}T-TH{1}S-Pl3A5E"`


			`def _insert_split_marker(m: re.Match):`
			`"""`
			`Applies split marker based on a regex match of special tokens such as`
			`[START_DNA].`
			`Parameters`
			`----------`
			`n : str`
			`Input text to split`
			`Returns`
			`----------`
			`str - the text with the split token added`
			`"""`
			`start_token, _, sequence, end_token = m.groups()`
			`sequence = re.sub(r"(.)", rf"{SPLIT_MARKER}\1", sequence, flags=re.DOTALL)`
			`return f"{start_token}{sequence}{SPLIT_MARKER}{end_token}"`


			`def escape_custom_split_sequence(text):`
			`"""`
			`Applies custom splitting to the text for GALILEO's tokenization`
			`Parameters`
			`----------`
			`text : str`
			`Input text to split`
			`Returns`
			`----------`
			`str - the text with the split token added`
			`"""`
			`return CUSTOM_SEQ_RE.sub(_insert_split_marker, text)`


			`# END CREDIT`


			`class GalacticaCausalLMBatch(CausalLMBatch):`
			`@classmethod`
			`def from_pb(`
feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`cls,`
			`pb: generate_pb2.Batch,`
			`tokenizer: PreTrainedTokenizerBase,`
feat(server): support vectorized warpers in flash causal lm (#317) Co-authored-by: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com> 2023-05-26 04:30:27 -06:00			`dtype: torch.dtype,`
feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`device: torch.device,`
feat(server): Add model tests (#6) 2022-12-08 10:49:33 -07:00			`) -> "GalacticaCausalLMBatch":`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`inputs = []`
			`next_token_choosers = []`
			`stopping_criterias = []`
fix(server): fix decode token (#334) Fixes #333 --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2023-05-16 15:23:27 -06:00			`prefix_offsets = []`
Fix missing arguments in Galactica's from_pb (#1022) # What does this PR do? Fixes #1004 <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> 2023-09-21 00:15:59 -06:00			`top_n_tokens = []`
fix(server): fix decode token (#334) Fixes #333 --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2023-05-16 15:23:27 -06:00			`read_offsets = []`
feat(router): drop requests when client closes the channel (#202) 2023-04-20 03:07:40 -06:00			`requests_idx_mapping = {}`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00
			`# Parse batch`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`max_truncation = 0`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`padding_right_offset = 0`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`max_decode_tokens = 0`
feat(router): drop requests when client closes the channel (#202) 2023-04-20 03:07:40 -06:00			`for i, r in enumerate(pb.requests):`
			`requests_idx_mapping[r.id] = i`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`# Add escape_custom_split_sequence to the CausalLMBatch logic`
server: use chunked inputs The router will now send the input as chunks besides as a single string. This change modifies the server to process chunked input rather than strings. This also allows us to remove the image extraction code from the server. 2024-05-31 05:51:42 -06:00			`inputs.append(`
			`escape_custom_split_sequence(concat_text_chunks(r.input_chunks.chunks))`
			`)`
chore: add pre-commit (#1569) 2024-02-16 03:58:58 -07:00			`next_token_choosers.append(`
			`NextTokenChooser.from_pb(r.parameters, device, tokenizer)`
			`)`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`stopping_criteria = StoppingCriteria.from_pb(`
			`r.stopping_parameters, tokenizer`
			`)`
			`stopping_criterias.append(stopping_criteria)`
Fix missing arguments in Galactica's from_pb (#1022) # What does this PR do? Fixes #1004 <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> 2023-09-21 00:15:59 -06:00			`top_n_tokens.append(r.top_n_tokens)`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`max_truncation = max(max_truncation, r.truncate)`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`max_decode_tokens += stopping_criteria.max_new_tokens`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`padding_right_offset = max(`
			`padding_right_offset, stopping_criteria.max_new_tokens`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`)`

			`tokenized_inputs = tokenizer(`
feat(server): Support SantaCoder (#26) 2023-01-20 04:24:39 -07:00			`inputs,`
			`return_tensors="pt",`
			`padding=True,`
			`return_token_type_ids=False,`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`truncation=True,`
			`max_length=max_truncation,`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`).to(device)`
fix(server): fix decode token (#334) Fixes #333 --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2023-05-16 15:23:27 -06:00			`for _ in pb.requests:`
			`input_len = tokenized_inputs["input_ids"].shape[1]`
			`prefix_offsets.append(0)`
			`read_offsets.append(input_len)`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00
			`input_lengths = tokenized_inputs["attention_mask"].sum(1)`
			`max_input_length = input_lengths.max()`

fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`input_ids = tokenized_inputs["input_ids"]`
			`# Allocate maximum attention_mask`
			`attention_mask = input_ids.new_zeros(`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`(pb.size, max_input_length + padding_right_offset)`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`)`
			`# Copy tokenizer attention_mask into fully allocated attention_mask`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`attention_mask[:, :max_input_length] = tokenized_inputs["attention_mask"]`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00
fix(server): Fix position ids (#28) 2023-01-20 07:35:22 -07:00			`position_ids = tokenized_inputs["attention_mask"].long().cumsum(-1) - 1`
			`position_ids.masked_fill_(tokenized_inputs["attention_mask"] == 0, 1)`
feat(router): drop requests when client closes the channel (#202) 2023-04-20 03:07:40 -06:00			`all_input_ids = tokenized_inputs["input_ids"].T.split(1, dim=1)`
Fix missing arguments in Galactica's from_pb (#1022) # What does this PR do? Fixes #1004 <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> 2023-09-21 00:15:59 -06:00			`top_n_tokens_tensor = torch.tensor(`
			`top_n_tokens, device=device, dtype=torch.int64`
			`)`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`max_tokens = len(inputs) * max_input_length + max_decode_tokens`

feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`return cls(`
			`batch_id=pb.id,`
			`requests=pb.requests,`
feat(router): drop requests when client closes the channel (#202) 2023-04-20 03:07:40 -06:00			`requests_idx_mapping=requests_idx_mapping,`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`input_ids=input_ids,`
			`attention_mask=attention_mask,`
fix(server): Fix position ids (#28) 2023-01-20 07:35:22 -07:00			`position_ids=position_ids,`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`past_key_values=None,`
feat(router): drop requests when client closes the channel (#202) 2023-04-20 03:07:40 -06:00			`all_input_ids=list(all_input_ids),`
			`input_lengths=input_lengths.tolist(),`
fix(server): fix decode token (#334) Fixes #333 --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> 2023-05-16 15:23:27 -06:00			`prefix_offsets=prefix_offsets,`
			`read_offsets=read_offsets,`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`next_token_choosers=next_token_choosers,`
			`stopping_criterias=stopping_criterias,`
Fix missing arguments in Galactica's from_pb (#1022) # What does this PR do? Fixes #1004 <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> 2023-09-21 00:15:59 -06:00			`top_n_tokens=top_n_tokens,`
			`top_n_tokens_tensor=top_n_tokens_tensor,`
feat(router): drop requests when client closes the channel (#202) 2023-04-20 03:07:40 -06:00			`max_input_length=max_input_length.item(),`
fix(server): fix galactica batch (#106) closes #105 2023-03-07 12:05:21 -07:00			`padding_right_offset=padding_right_offset,`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`max_tokens=max_tokens,`
feat(server): Support Galactica (#4) 2022-12-01 11:31:54 -07:00			`)`