hf_text-generation-inference/server/text_generation_server/models/flash_neox.py

70 lines
2.2 KiB
Python
Raw Normal View History

2023-03-24 07:02:14 -06:00
import torch
import torch.distributed

from opentelemetry import trace
from transformers import AutoTokenizer, AutoConfig
from typing import Optional

from text_generation_server.models import FlashCausalLM
from text_generation_server.models.custom_modeling.flash_neox_modeling import (
    FlashGPTNeoXForCausalLM,
)
from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
)

# Module-level OpenTelemetry tracer, named after this module.
tracer = trace.get_tracer(__name__)


class FlashNeoXSharded(FlashCausalLM):
    """GPT-NeoX causal LM built on ``FlashGPTNeoXForCausalLM``.

    The constructor joins the torch.distributed process group, loads the
    tokenizer, config and ``.safetensors`` weights for ``model_id``, builds
    the model on this rank's CUDA device and hands everything to
    ``FlashCausalLM``.
    """

    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
        trust_remote_code: bool = False,
    ) -> None:
        """Load tokenizer, config and weights, then initialise the base class.

        Args:
            model_id: Model identifier forwarded to ``AutoTokenizer``,
                ``AutoConfig`` and ``weight_files``.
            revision: Optional revision forwarded to the same loaders.
            quantize: Value stored on ``config.quantize`` before the model
                is constructed.
            dtype: Weight dtype; ``torch.float16`` is used when ``None``.
            trust_remote_code: Forwarded to the tokenizer and config loaders.

        Raises:
            NotImplementedError: If CUDA is not available.
        """
        self.process_group, rank, world_size = initialize_torch_distributed()

        # Guard clause: this implementation has no CPU path.
        if not torch.cuda.is_available():
            raise NotImplementedError("FlashNeoX is only available on GPU")

        # One CUDA device per rank; default to half precision.
        device = torch.device(f"cuda:{rank}")
        if dtype is None:
            dtype = torch.float16

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )

        config = AutoConfig.from_pretrained(
            model_id, revision=revision, trust_remote_code=trust_remote_code
        )
        # The quantization choice travels to the model through the config.
        config.quantize = quantize

        # NOTE(review): presumably synchronises ranks before the weight files
        # are resolved and read -- confirm against the launcher's expectations.
        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(
            model_id, revision=revision, extension=".safetensors"
        )
        weights = Weights(
            filenames,
            device=device,
            dtype=dtype,
            process_group=self.process_group,
        )

        model = FlashGPTNeoXForCausalLM(config, weights)

        # Second barrier: wait for every rank in the group to reach this point
        # before initialising the base class.
        torch.distributed.barrier(group=self.process_group)
        super().__init__(
            model=model.to(device),
            tokenizer=tokenizer,
            num_layers=len(model.gpt_neox.layers),
            num_kv_heads=model.gpt_neox.num_heads,
            head_size=model.gpt_neox.head_size,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )