feat(server): Improved doc

This commit is contained in:
OlivierDehaene 2022-11-07 12:53:56 +01:00
parent cea6051eff
commit 4236e41b0d
9 changed files with 195 additions and 101 deletions

View File

@ -28,6 +28,7 @@ ENV LANG=C.UTF-8 \
MODEL_NAME=bigscience/bloom \
QUANTIZE=false \
NUM_GPUS=8 \
SAFETENSORS_FAST_GPU=1 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
NCCL_ASYNC_ERROR_HANDLING=1 \
CUDA_HOME=/usr/local/cuda \
@ -55,12 +56,6 @@ RUN cd server && make install-torch
# Install specific version of transformers
RUN cd server && make install-transformers
# Install specific version of safetensors
# FIXME: This is a temporary fix while we wait for a new release
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN cd server && make install-safetensors
# Install server
COPY proto proto
COPY server server

View File

@ -6,7 +6,8 @@
</div>
A Rust and gRPC server for text generation inference.
A Rust and gRPC server for text generation inference. Used in production at [HuggingFace](https://huggingface.co)
to power BLOOM, BLOOMZ and MT0-XXL api-inference widgets.
## Features
@ -15,11 +16,11 @@ A Rust and gRPC server for text generation inference.
- [Safetensors](https://github.com/huggingface/safetensors) weight loading
- 45ms per token generation for BLOOM with 8xA100 80GB
## Officialy supported models
## Officially supported models
- BLOOM
- BLOOMZ
- BLOOM-560m
- [BLOOM](https://huggingface.co/bigscience/bloom)
- [BLOOMZ](https://huggingface.co/bigscience/bloomz)
- [MT0-XXL](https://huggingface.co/bigscience/mt0-xxl)
Other models are supported on a best effort basis using:
@ -90,5 +91,4 @@ make router-dev
## TODO:
- [ ] Add tests for the `server/model` logic
- [ ] Backport custom CUDA kernels to Transformers
- [ ] Install safetensors with pip
- [ ] Backport custom CUDA kernels to Transformers

View File

@ -295,6 +295,10 @@ fn shard_manager(
"MASTER_PORT".parse().unwrap(),
master_port.to_string().parse().unwrap(),
),
(
"SAFETENSORS_FAST_GPU".parse().unwrap(),
"1".to_string().parse().unwrap(),
),
];
// If the HUGGINGFACE_HUB_CACHE env var is set, pass it to the shard

View File

@ -16,24 +16,13 @@ install-transformers:
mv transformers-7302a24535e8dc5637ea5b4e4572fc971d404098 transformers
cd transformers && python setup.py install
install-safetensors:
# Install specific version of safetensors
pip install setuptools_rust
rm safetensors || true
rm safetensors-634deccbcbad5eaf417935281f8b3be7ebca69c5 || true
curl -L -O https://github.com/huggingface/safetensors/archive/634deccbcbad5eaf417935281f8b3be7ebca69c5.zip
unzip 634deccbcbad5eaf417935281f8b3be7ebca69c5.zip
rm 634deccbcbad5eaf417935281f8b3be7ebca69c5.zip
mv safetensors-634deccbcbad5eaf417935281f8b3be7ebca69c5 safetensors
cd safetensors/bindings/python && python setup.py develop
install-torch:
# Install specific version of torch
pip install torch --extra-index-url https://download.pytorch.org/whl/cu116 --no-cache-dir
install: gen-server install-torch install-transformers install-safetensors
install: gen-server install-torch install-transformers
pip install pip --upgrade
pip install -e . --no-cache-dir
run-dev:
python -m torch.distributed.run --nproc_per_node=2 text_generation/cli.py serve bigscience/bloom-560m --sharded
SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation/cli.py serve bigscience/bloom-560m --sharded

47
server/poetry.lock generated
View File

@ -145,6 +145,18 @@ category = "main"
optional = false
python-versions = ">=3.6"
[[package]]
name = "safetensors"
version = "0.2.4"
description = "Fast and Safe Tensor serialization"
category = "main"
optional = false
python-versions = "*"
[package.extras]
dev = ["black (==22.3)", "flake8 (>=3.8.3)", "huggingface-hub", "isort (>=5.5.4)", "numpy", "pytest", "setuptools-rust"]
testing = ["black (==22.3)", "flake8 (>=3.8.3)", "huggingface-hub", "isort (>=5.5.4)", "numpy", "pytest", "setuptools-rust"]
[[package]]
name = "setuptools"
version = "65.5.0"
@ -208,7 +220,7 @@ bnb = ["bitsandbytes"]
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "224b1e379d6105fe911bff4563946a90dfa6ff5918cf2e7be59f8d4f7c5cd7cf"
content-hash = "3266187ef14fe8f9e29b3b6530d07781ea952aa670c0fe0de34be43efa231a67"
[metadata.files]
accelerate = [
@ -459,6 +471,39 @@ PyYAML = [
{file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
{file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
]
safetensors = [
{file = "safetensors-0.2.4-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:79c4a7610d7699c64d8531c43f758ded4990ebaa7b0887c2078640e6de44e726"},
{file = "safetensors-0.2.4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ef425a4ddd29612fe733a6eeca6ad8f3ee3939f530a032114974aac4c4667b89"},
{file = "safetensors-0.2.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77758f8ba4de6e20bf394dd964854a926dee2efee82eaa95e6c0893e2a7d960c"},
{file = "safetensors-0.2.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fb956e9090cce515649f00b491b5ddc0f9c3d989139016a8d69f9dcf57e8d3d9"},
{file = "safetensors-0.2.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3e31b02d27249bd519f05ec9d189097c59fc6851c59daa1a86ef347659e33ac3"},
{file = "safetensors-0.2.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c2fead03a1497042efea4358574f3d7acf501b0c82e54d605f393f2b4e2aafe"},
{file = "safetensors-0.2.4-cp310-cp310-win32.whl", hash = "sha256:dce6ed3c7d13aafa574737eb3309c928adcb6781e879b41f0861be83b439cf3e"},
{file = "safetensors-0.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:1dfe727325a1342767c6725dc2cc1f00463eb40a1f5df37c338d8e03957e27ce"},
{file = "safetensors-0.2.4-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:c066bc7b90a582a01ec468fef61a7581b5c726bf12c50491cb6ea5db215ea5e0"},
{file = "safetensors-0.2.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca6ed53dad5d7d0e67eb676528ff2ad345cac3a34010e4dc1e3736972de294a5"},
{file = "safetensors-0.2.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ada03b44acbb036cfabe7066a8df4ad9b1ac05bb585a6b6c0f285f08e016381d"},
{file = "safetensors-0.2.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58a0902708daa7ec2b2293b46e85df61f4fa359ddfe648e7ac025a79e6f59627"},
{file = "safetensors-0.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e0a4e38f7cbb4bfc513588e52f349b906c941e74fbbe192f2b19fc34221d448"},
{file = "safetensors-0.2.4-cp37-cp37m-win32.whl", hash = "sha256:4f8695b77dd847203258f035f8468f8b701c90621cb6b457e109f8d89c27f16c"},
{file = "safetensors-0.2.4-cp37-cp37m-win_amd64.whl", hash = "sha256:16b08f33c753c7da64b3999beea7c30d58204a0820961e33881d05a331e3f5c0"},
{file = "safetensors-0.2.4-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:a381606804f23db9eede51135f5fbd1f75dda02100415ee150fd39eb1cd6be4c"},
{file = "safetensors-0.2.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7aceae84d0c7233d83923029aaf8d184848561e0211ec98c5317327b3db025d6"},
{file = "safetensors-0.2.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da48fc929485cbd9ee22621e388764a7cef27b0205e73aee2ad75aadd7d67662"},
{file = "safetensors-0.2.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2619b88f934c4de6b59de90c9dc00eae2d0e30f254a1daebd6eb232ac1f9a7a7"},
{file = "safetensors-0.2.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1f78b987ae1f6b71da8ea110164e4cab2ee31b53835d2a66279df89c5d73f0e"},
{file = "safetensors-0.2.4-cp38-cp38-win32.whl", hash = "sha256:34b3e60b5130fb0fe07114705e51d30aa2c7eae4c1d1e77d6f260fa4ade70ede"},
{file = "safetensors-0.2.4-cp38-cp38-win_amd64.whl", hash = "sha256:debaa4fa98a7af44ba6dcb6945efee77b8480284c2cb05918ab97cf511c40826"},
{file = "safetensors-0.2.4-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:90baaafc0c872a736124b341db54b0bdd61765cbf3a61418371066a37905b18d"},
{file = "safetensors-0.2.4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:b4bf7e23191d6a3ff00de141512869fc776e8ff159c872cb44af018cb04d45eb"},
{file = "safetensors-0.2.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf11a3aba8796e548ceb0a65f34dcd334dcf0c4c891dccabe18a8b53918ae8ab"},
{file = "safetensors-0.2.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:95c31935ea71d63a38c546654136d7f0dbf1e7aeb6564dbc2201bc1fe9b34e4c"},
{file = "safetensors-0.2.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef31776e2e081d6f075408eed34a0fbd524cbd19e50268bef02c238b209213b7"},
{file = "safetensors-0.2.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d06bb1d68148f6d6934352124d8cbfcf0db092f969db7187e348bd5cbf183db5"},
{file = "safetensors-0.2.4-cp39-cp39-win32.whl", hash = "sha256:5d546152b9a5bd58eae97c2ddefba394404d37ddedec305f7639c9b6054513e5"},
{file = "safetensors-0.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:553ecfd895d379c1e03a7c9241f7343b3af66573436969ed7eb95df81dfbe9af"},
{file = "safetensors-0.2.4.tar.gz", hash = "sha256:35c0719a898f1f1292464f4cd9370bb6c2698032f1db4d677489f078b66b5a75"},
]
setuptools = [
{file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"},
{file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"},

View File

@ -15,6 +15,7 @@ typer = "^0.6.1"
grpcio-reflection = "^1.49.1"
accelerate = "^0.12.0"
bitsandbytes = "^0.35.1"
safetensors = "^0.2.4"
[tool.poetry.extras]
bnb = ["bitsandbytes"]

View File

@ -9,17 +9,13 @@ __all__ = ["Model", "BLOOMSharded", "CausalLM", "Seq2SeqLM"]
def get_model(model_name: str, sharded: bool, quantize: bool) -> Model:
if model_name.startswith("bigscience/bloom"):
if sharded:
return BLOOMSharded(model_name, quantize)
return BLOOMSharded(model_name, quantize=quantize)
else:
if quantize:
raise ValueError("quantization is not supported for non-sharded BLOOM")
return CausalLM(model_name)
return CausalLM(model_name, quantize=quantize)
else:
if sharded:
raise ValueError("sharded is not supported for AutoModel")
if quantize:
raise ValueError("quantize is not supported for AutoModel")
try:
return CausalLM(model_name)
return CausalLM(model_name, quantize=quantize)
except Exception as e:
return Seq2SeqLM(model_name)
return Seq2SeqLM(model_name, quantize=quantize)

View File

@ -2,7 +2,7 @@ import torch
from dataclasses import dataclass
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Optional, Tuple, List, Dict, Type
from typing import Optional, Tuple, List, Type
from text_generation.models import Model
from text_generation.models.types import GeneratedText
@ -14,11 +14,23 @@ from text_generation.utils import NextTokenChooser, StoppingCriteria
class CausalLMBatch:
batch_id: int
requests: List[generate_pb2.Request]
all_input_lengths: List[int]
input_ids: Dict[str, torch.Tensor]
# Decoder values
input_ids: torch.Tensor
attention_mask: torch.Tensor
past_key_values: Optional[List[Tuple]]
# All tokens
all_input_ids: List[torch.Tensor]
# Lengths of all generations present in the batch
input_lengths: List[int]
# Generation helpers
next_token_choosers: List[NextTokenChooser]
stopping_criterias: List[StoppingCriteria]
# Metadata used for padding
size: int
max_sequence_length: int
@ -36,12 +48,12 @@ class CausalLMBatch:
inputs = []
next_token_choosers = []
stopping_criterias = []
all_input_lengths = []
input_lengths = []
# Parse batch
for r in pb.requests:
inputs.append(r.inputs)
all_input_lengths.append(r.input_length)
input_lengths.append(r.input_length)
next_token_choosers.append(
NextTokenChooser(
temperature=r.parameters.temperature,
@ -56,21 +68,23 @@ class CausalLMBatch:
)
)
input_ids = tokenizer(
tokenized_inputs = tokenizer(
inputs, return_tensors="pt", padding=True, pad_to_multiple_of=8
).to(device)
all_input_ids = input_ids["input_ids"].unsqueeze(-1)
all_input_ids = tokenized_inputs["input_ids"].unsqueeze(-1)
return cls(
batch_id=pb.id,
requests=pb.requests,
all_input_lengths=all_input_lengths,
input_ids=input_ids,
input_ids=tokenized_inputs["input_ids"],
attention_mask=tokenized_inputs["attention_mask"],
past_key_values=None,
all_input_ids=all_input_ids,
input_lengths=input_lengths,
next_token_choosers=next_token_choosers,
stopping_criterias=stopping_criterias,
size=pb.size,
max_sequence_length=max(all_input_lengths),
max_sequence_length=max(input_lengths),
)
@classmethod
@ -80,19 +94,23 @@ class CausalLMBatch:
max_sequence_length = max(batch.max_sequence_length for batch in batches)
# Batch attributes
input_ids = {"input_ids": None, "attention_mask": None, "past_key_values": []}
requests = []
all_input_lengths = []
input_lengths = []
all_input_ids = []
next_token_choosers = []
stopping_criterias = []
# Batch tensors
input_ids = None
attention_mask = None
past_key_values = []
# Used for slicing correctly inside the tensors
# Equivalent to a cumsum on batch sizes
start_index = 0
for i, batch in enumerate(batches):
requests.extend(batch.requests)
all_input_lengths.extend(batch.all_input_lengths)
input_lengths.extend(batch.input_lengths)
all_input_ids.extend(batch.all_input_ids)
next_token_choosers.extend(batch.next_token_choosers)
stopping_criterias.extend(batch.stopping_criterias)
@ -101,32 +119,35 @@ class CausalLMBatch:
end_index = start_index + batch.size
# We only concatenate batches that did at least one step
if batch.input_ids["input_ids"].shape[1] > 1:
if batch.input_ids.shape[1] > 1:
raise ValueError("Batch input_ids should be of shape (batch_size, 1)")
# Initialize tensors
if i == 0:
input_ids["input_ids"] = torch.empty(
(total_batch_size, 1),
dtype=batch.input_ids["input_ids"].dtype,
device=batch.input_ids["input_ids"].device,
)
input_ids["attention_mask"] = torch.zeros(
(total_batch_size, max_sequence_length),
dtype=batch.input_ids["attention_mask"].dtype,
device=batch.input_ids["attention_mask"].device,
)
# input_ids["input_ids"] is always of shape [batch_size, 1]
# Create empty tensor
# input_ids is always of shape [batch_size, 1]
# We do not need to pad it
input_ids["input_ids"][start_index:end_index] = batch.input_ids["input_ids"]
if input_ids is None:
input_ids = torch.empty(
(total_batch_size, 1),
dtype=batch.input_ids.dtype,
device=batch.input_ids.device,
)
# Copy to correct indices
input_ids[start_index:end_index] = batch.input_ids
# Create padded tensor
if attention_mask is None:
attention_mask = torch.zeros(
(total_batch_size, max_sequence_length),
dtype=batch.attention_mask.dtype,
device=batch.attention_mask.device,
)
# We need to slice the attention mask to remove padding from previous steps
input_ids["attention_mask"][
attention_mask[
start_index:end_index, -batch.max_sequence_length :
] = batch.input_ids["attention_mask"][:, -batch.max_sequence_length :]
] = batch.attention_mask[:, -batch.max_sequence_length :]
for j, past in enumerate(batch.input_ids["past_key_values"]):
for j, past in enumerate(batch.past_key_values):
# Shenanigans to get dimensions because BLOOM outputs a past with a different shape
# BLOOM: [batch_size * num_heads, ...] vs [batch_size, num_heads, ...]
head_dim, padded_sequence_length = past[0].shape[-2:]
@ -137,8 +158,8 @@ class CausalLMBatch:
)
# This will run only once per layer
if j == len(input_ids["past_key_values"]):
input_ids["past_key_values"].append([])
if j == len(past_key_values):
past_key_values.append([])
# Decoder past
for k, t in enumerate(past):
@ -172,21 +193,21 @@ class CausalLMBatch:
# Initialize tensors
# This will run only once per layer and per past tensor
if k == len(input_ids["past_key_values"][j]):
input_ids["past_key_values"][j].append(
if k == len(past_key_values[j]):
past_key_values[j].append(
torch.zeros(padded_t_shape, dtype=t.dtype, device=t.device)
)
# We slice the past keys and values to remove the padding from previous batches
if not head_dim_last:
input_ids["past_key_values"][j][k][
past_key_values[j][k][
start_index:end_index,
:,
:,
-(batch.max_sequence_length - 1) :,
] = t[:, :, :, -(batch.max_sequence_length - 1) :]
else:
input_ids["past_key_values"][j][k][
past_key_values[j][k][
start_index:end_index,
:,
-(batch.max_sequence_length - 1) :,
@ -198,9 +219,11 @@ class CausalLMBatch:
return cls(
batch_id=batches[0].batch_id,
requests=requests,
all_input_lengths=all_input_lengths,
input_ids=input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
all_input_ids=all_input_ids,
input_lengths=input_lengths,
next_token_choosers=next_token_choosers,
stopping_criterias=stopping_criterias,
size=total_batch_size,
@ -209,7 +232,7 @@ class CausalLMBatch:
class CausalLM(Model):
def __init__(self, model_name: str):
def __init__(self, model_name: str, quantize=False):
if torch.cuda.is_available():
device = torch.device("cuda")
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
@ -223,6 +246,7 @@ class CausalLM(Model):
model_name,
torch_dtype=dtype,
device_map="auto" if torch.cuda.is_available() else None,
load_in_8bit=quantize,
).eval()
super(CausalLM, self).__init__(
@ -255,16 +279,19 @@ class CausalLM(Model):
torch.no_grad if self.device.type == "cpu" else torch.inference_mode
)
with context_manager():
logits, past = self.forward(**batch.input_ids)
logits, past = self.forward(
batch.input_ids, batch.attention_mask, batch.past_key_values
)
# List of indices to cache
next_batch_keep_indices = []
# New input_ids for next forward
# New values for next forward
next_batch_input_lengths = []
next_batch_input_ids = []
next_batch_all_input_ids = []
next_all_input_lengths = []
# Metadata
next_batch_size = 0
next_batch_max_sequence_length = 0
@ -274,7 +301,7 @@ class CausalLM(Model):
# Zipped iterator
iterator = zip(
batch.requests,
batch.all_input_lengths,
batch.input_lengths,
logits,
batch.next_token_choosers,
batch.stopping_criterias,
@ -313,7 +340,7 @@ class CausalLM(Model):
next_batch_all_input_ids.append(all_tokens)
next_batch_size += 1
new_input_length = input_length + 1
next_all_input_lengths.append(new_input_length)
next_batch_input_lengths.append(new_input_length)
next_batch_max_sequence_length = max(
next_batch_max_sequence_length, new_input_length
)
@ -322,15 +349,14 @@ class CausalLM(Model):
if not next_batch_keep_indices:
return generated_texts, None
# If we finished at least one generation
next_batch_input_ids = {"input_ids": torch.cat(next_batch_input_ids, dim=0)}
next_batch_input_ids = torch.cat(next_batch_input_ids, dim=0)
# If we finished at least one generation, we need to evict the indices of the generations that finished
# from the values of the next batch
if generated_texts:
# Apply indices to attention mask, past key values and other items that need to be cached
next_batch_input_ids["attention_mask"] = batch.input_ids["attention_mask"][
next_batch_keep_indices
]
next_batch_attention_mask = batch.attention_mask[next_batch_keep_indices]
# Force past to be of dim [batch_size, num_heads, ...] for easy indexing
next_batch_input_ids["past_key_values"] = [
next_batch_past_key_values = [
[
t.view(-1, self.num_heads, *t.shape[-2:])[next_batch_keep_indices]
for t in layer
@ -345,16 +371,16 @@ class CausalLM(Model):
batch.stopping_criterias[i] for i in next_batch_keep_indices
]
else:
next_batch_input_ids["attention_mask"] = batch.input_ids["attention_mask"]
next_batch_input_ids["past_key_values"] = past
next_batch_attention_mask = batch.attention_mask
next_batch_past_key_values = past
next_batch_requests = batch.requests
next_batch_next_token_choosers = batch.next_token_choosers
next_batch_stopping_criterias = batch.stopping_criterias
# Update attention_mask with padding as we added a new token to input_ids
next_batch_input_ids["attention_mask"] = torch.cat(
next_batch_attention_mask = torch.cat(
[
next_batch_input_ids["attention_mask"],
next_batch_attention_mask,
torch.ones((next_batch_size, 1)).to(self.device),
],
dim=1,
@ -363,9 +389,11 @@ class CausalLM(Model):
next_batch = CausalLMBatch(
batch_id=batch.batch_id,
requests=next_batch_requests,
all_input_lengths=next_all_input_lengths,
input_ids=next_batch_input_ids,
attention_mask=next_batch_attention_mask,
past_key_values=next_batch_past_key_values,
all_input_ids=next_batch_all_input_ids,
input_lengths=next_batch_input_lengths,
next_token_choosers=next_batch_next_token_choosers,
stopping_criterias=next_batch_stopping_criterias,
size=next_batch_size,

View File

@ -15,26 +15,33 @@ class Seq2SeqLMBatch:
batch_id: int
requests: List[generate_pb2.Request]
# Encoder values
input_ids: torch.Tensor
attention_mask: torch.Tensor
# Decoder values
decoder_input_ids: torch.Tensor
decoder_attention_mask: Optional[torch.Tensor]
encoder_last_hidden_state: Optional[torch.Tensor]
# Seq2SeqLM keeps track of both encoder and decoder attention keys and values
past_key_values: Optional[List[Tuple]]
# Lengths of all generations present in the batch
input_lengths: List[int]
decoder_input_lengths: List[int]
# Generation helpers
next_token_choosers: List[NextTokenChooser]
stopping_criterias: List[StoppingCriteria]
# Metadata used for padding
size: int
max_input_length: int
max_decoder_input_length: int
def to_pb(self):
"""Convert a Seq2SeqLMBatch to a text_generation.v1.Batch protobuf"""
return generate_pb2.Batch(
id=self.batch_id,
requests=self.requests,
@ -45,6 +52,7 @@ class Seq2SeqLMBatch:
def from_pb(
cls, pb: generate_pb2.Batch, tokenizer: AutoTokenizer, device: torch.device
) -> "Seq2SeqLMBatch":
"""Convert a text_generation.v1.Batch protobuf to a Seq2SeqLMBatch"""
inputs = []
next_token_choosers = []
stopping_criterias = []
@ -57,6 +65,7 @@ class Seq2SeqLMBatch:
for r in pb.requests:
inputs.append(r.inputs)
input_lengths.append(r.input_length)
# Decoder sequence only contains the bos_token
decoder_input_ids.append(tokenizer.bos_token_id)
decoder_input_lengths.append(1)
next_token_choosers.append(
@ -73,9 +82,11 @@ class Seq2SeqLMBatch:
)
)
# Tokenize batch
tokenized_inputs = tokenizer(
inputs, return_tensors="pt", padding=True, pad_to_multiple_of=8
).to(device)
# Convert decoder_input_ids to torch tensor of size [batch_size, 1]
decoder_input_ids = torch.tensor(decoder_input_ids).to(device).unsqueeze(-1)
return cls(
@ -98,6 +109,8 @@ class Seq2SeqLMBatch:
@classmethod
def concatenate(cls, batches: List["Seq2SeqLMBatch"]) -> "Seq2SeqLMBatch":
"""Concatenate multiple batches together by padding internal torch tensors"""
# Used for padding
total_batch_size = sum(batch.size for batch in batches)
max_input_length = max(batch.max_input_length for batch in batches)
@ -112,6 +125,7 @@ class Seq2SeqLMBatch:
next_token_choosers = []
stopping_criterias = []
# Batch tensors
input_ids = None
attention_mask = None
decoder_input_ids = None
@ -122,7 +136,9 @@ class Seq2SeqLMBatch:
# Used for slicing correctly inside the tensors
# Equivalent to a cumsum on batch sizes
start_index = 0
for i, batch in enumerate(batches):
# Extend all list attributes
requests.extend(batch.requests)
input_lengths.extend(batch.input_lengths)
decoder_input_lengths.extend(batch.decoder_input_lengths)
@ -136,51 +152,62 @@ class Seq2SeqLMBatch:
if batch.encoder_last_hidden_state is None:
raise ValueError("Batch encoder_last_hidden_state cannot be None")
# Create padded tensor
if input_ids is None:
input_ids = torch.zeros(
(total_batch_size, max_input_length),
dtype=batch.input_ids.dtype,
device=batch.input_ids.device,
)
# Copy to correct indices
input_ids[
start_index:end_index, -batch.max_input_length :
] = batch.input_ids[:, -batch.max_input_length :]
# Create padded tensor
if attention_mask is None:
attention_mask = torch.zeros(
(total_batch_size, max_input_length),
dtype=batch.attention_mask.dtype,
device=batch.attention_mask.device,
)
# Copy to correct indices
attention_mask[
start_index:end_index, -batch.max_input_length :
] = batch.attention_mask[:, -batch.max_input_length :]
# Create padded tensor
if decoder_input_ids is None:
decoder_input_ids = torch.zeros(
(total_batch_size, max_decoder_input_length),
dtype=batch.decoder_input_ids.dtype,
device=batch.decoder_input_ids.device,
)
# Copy to correct indices
decoder_input_ids[
start_index:end_index, -batch.max_decoder_input_length :
] = batch.decoder_input_ids[:, -batch.max_decoder_input_length :]
# Create padded tensor
if decoder_attention_mask is None:
decoder_attention_mask = torch.zeros(
(total_batch_size, max_decoder_input_length),
dtype=batch.attention_mask.dtype,
device=batch.attention_mask.device,
dtype=batch.attention_mask.dtype, # As decoder_attention_mask might not exist,
device=batch.attention_mask.device, # we use `batch.attention_mask` for dtype and device here
)
# If the decoder mask does not exist yet, all generations started at the same time and we never concatenated
# this batch. All generations are of length `batch.max_decoder_input_length`.
if batch.decoder_attention_mask is None:
decoder_attention_mask[
start_index:end_index, -batch.max_decoder_input_length :
] = 1
# If it exists, we need to index
else:
decoder_attention_mask[
start_index:end_index, -batch.max_decoder_input_length :
] = batch.decoder_attention_mask[:, -batch.max_decoder_input_length :]
# Create padded tensor
if encoder_last_hidden_state is None:
encoder_last_hidden_state = torch.zeros(
(
@ -192,10 +219,12 @@ class Seq2SeqLMBatch:
device=batch.encoder_last_hidden_state.device,
)
# Copy to correct indices
encoder_last_hidden_state[
start_index:end_index, -batch.max_decoder_input_length :, :
] = batch.encoder_last_hidden_state[:, -batch.max_decoder_input_length :, :]
# Iterate over attention layers
for j, past in enumerate(batch.past_key_values):
_, num_heads, _, head_dim = past[0].shape
@ -271,7 +300,7 @@ class Seq2SeqLMBatch:
class Seq2SeqLM(Model):
def __init__(self, model_name: str):
def __init__(self, model_name: str, quantize=False):
if torch.cuda.is_available():
device = torch.device("cuda")
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
@ -283,6 +312,7 @@ class Seq2SeqLM(Model):
model_name,
torch_dtype=dtype,
device_map="auto" if torch.cuda.is_available() else None,
load_in_8bit=quantize,
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
tokenizer.bos_token_id = self.model.config.decoder_start_token_id
@ -314,14 +344,17 @@ class Seq2SeqLM(Model):
if past_key_values is not None:
decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1)
# Wrap `encoder_last_hidden_state` because for some reason, Transformers does an `encoder_last_hidden_state[0]`
# internally...
if encoder_last_hidden_state is not None:
encoder_last_hidden_state = [encoder_last_hidden_state]
outputs = self.model.forward(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
encoder_outputs=[encoder_last_hidden_state]
if encoder_last_hidden_state is not None
else None,
encoder_outputs=encoder_last_hidden_state,
past_key_values=past_key_values,
use_cache=True,
)
@ -351,11 +384,12 @@ class Seq2SeqLM(Model):
# List of indices to cache
next_batch_keep_indices = []
# New input_ids for next forward
# New values for next forward
next_batch_input_lengths = []
next_batch_decoder_input_ids = []
next_batch_decoder_input_lengths = []
# Metadata
next_batch_size = 0
next_batch_max_input_length = 0
next_batch_max_decoder_input_length = 0
@ -395,7 +429,7 @@ class Seq2SeqLM(Model):
# Evaluate stopping criteria
if stopping_criteria(decoder_tokens):
# Decode all tokens
# Decode tokens
output = self.tokenizer.decode(decoder_tokens, skip_special_tokens=True)
# Add to the list of finished generations with the original request
generated_texts.append(
@ -420,9 +454,11 @@ class Seq2SeqLM(Model):
if not next_batch_keep_indices:
return generated_texts, None
# If we finished at least one generation
next_batch_decoder_input_ids = torch.cat(next_batch_decoder_input_ids)
# If we finished at least one generation, we need to evict the indices of the generations that finished
# from the values of the next batch
if generated_texts:
# Apply indices to attention mask, past key values and other items that need to be cached
next_batch_input_ids = batch.input_ids[next_batch_keep_indices]
next_batch_attention_mask = batch.attention_mask[next_batch_keep_indices]
@ -458,7 +494,7 @@ class Seq2SeqLM(Model):
next_batch_next_token_choosers = batch.next_token_choosers
next_batch_stopping_criterias = batch.stopping_criterias
# Update attention_mask with padding as we added a new token to input_ids
# Update decoder_attention_mask with padding as we added a new token to decoder_input_ids
if next_batch_decoder_attention_mask is not None:
next_batch_decoder_attention_mask = torch.cat(
[