first tests

parent 78063c0569
commit 60ed7b535c

@@ -0,0 +1,36 @@
inference:
  greedy: False # Whether to use greedy decoding instead of sampling
  top_k: 0 # The number of highest-probability vocabulary tokens to keep for top-k filtering
  top_p: 0.9 # If set to a float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation
  temperature: 1.0 # sampling temperature
  add_BOS: True # add the BOS token at the beginning of the prompt
  tokens_to_generate: 30 # The maximum number of tokens to generate
  all_probs: False # whether to return the log probabilities for every token in the vocabulary
  repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
  min_tokens_to_generate: 0 # The minimum length of the sequence to be generated
  compute_logprob: False # whether to compute the log probability of the input text instead of generating new tokens (a special inference mode); default False

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  logger: False # logger provided by exp_manager
  precision: 16 # 16, 32, or bf16

tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models
gpt_model_file: null # GPT .nemo file path
checkpoint_dir: null # checkpoint directory, used to load the PTL checkpoint generated during GPT training
checkpoint_name: null # PTL checkpoint file name, only used for PTL checkpoint loading
hparams_file: null # model configuration file, only used for PTL checkpoint loading
prompts: # prompts for GPT inference
  - "Q: How are you?"
  - "Q: How big is the universe?"
server: False # whether to launch the API server
port: 5555 # the port number for the inference server
web_server: False # whether to launch the web inference server
share: False # whether to create a public URL for the web server
username: test # user name for the web client
password: test2 # password for the web client
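The defaults above can also be loaded and overridden programmatically with OmegaConf, mirroring what the Hydra command-line overrides in the script below do. A minimal sketch, assuming the file is saved as `conf/megatron_gpt_inference.yaml` (the path is an assumption):

```python
# Minimal sketch: load the config above and apply dot-list overrides,
# the same way the Hydra CLI overrides in the eval script do.
# Assumes the config is saved as conf/megatron_gpt_inference.yaml.
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/megatron_gpt_inference.yaml")
overrides = OmegaConf.from_dotlist(
    ["inference.greedy=True", "inference.tokens_to_generate=64"]
)
cfg = OmegaConf.merge(cfg, overrides)
print(OmegaConf.to_yaml(cfg.inference))
```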
@@ -0,0 +1,528 @@
# Copyright (c) 2021, NVIDIA CORPORATION and Hugging Face authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import threading

import torch
import torch.distributed

from typing import List, Optional, Tuple

from accelerate import init_empty_weights
from safetensors import safe_open
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
)
from transformers.models.gpt_neox.parallel_layers import (
    TensorParallelColumnLinear,
    TensorParallelEmbedding,
    TensorParallelRowLinear,
)

from text_generation.models import CausalLM
from text_generation.models.gpt_neox import GPTNeox  # base class for GPTNeoxSharded below (assumed import path)
from text_generation.utils import (
    initialize_torch_distributed,
    weight_files,
)

from omegaconf import OmegaConf, open_dict
from pytorch_lightning.trainer.trainer import Trainer
from torch.utils.data import DataLoader, Dataset

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.modules.common.megatron_web_server import get_demo
from nemo.collections.nlp.modules.common.text_generation_server import MegatronServer
from nemo.collections.nlp.modules.common.text_generation_utils import generate
from nemo.collections.nlp.modules.common.transformer.text_generation import LengthParam, SamplingParam
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
from nemo.core.config import hydra_runner
from nemo.utils.app_state import AppState
from nemo.utils.model_utils import inject_model_parallel_rank

try:
    from apex.transformer import parallel_state

    HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
    HAVE_APEX = False

# Optional: bitsandbytes is only required for the int8 `quantize` path in
# GPTNeoxSharded.load_weights below.
try:
    import bitsandbytes as bnb
    from bitsandbytes.nn import Int8Params

    HAS_BITS_AND_BYTES = True
except (ImportError, ModuleNotFoundError):
    HAS_BITS_AND_BYTES = False

"""
This is the script to run GPT text generation.

Usage:
    Assume the model has TP=1, PP=1 in the following use cases.
    a. run greedy inference from a nemo file:
        python megatron_gpt_eval.py \
            gpt_model_file=PATH_TO_MODEL \
            inference.greedy=True \
            inference.add_BOS=True \
            trainer.devices=1 \
            trainer.num_nodes=1 \
            tensor_model_parallel_size=1 \
            pipeline_model_parallel_size=1 \
            prompts=[prompt1,prompt2]

    b. run greedy inference from a PTL checkpoint file:
        python megatron_gpt_eval.py \
            checkpoint_dir=PATH_TO_CHECKPOINT_FILE \
            checkpoint_name=CHECKPOINT_FILE_NAME \
            hparams_file=HPARAMS_FILE \
            inference.greedy=True \
            inference.add_BOS=True \
            trainer.devices=1 \
            trainer.num_nodes=1 \
            tensor_model_parallel_size=1 \
            pipeline_model_parallel_size=1 \
            prompts=[prompt1,prompt2]

    c. run top_p inference from a nemo file:
        python megatron_gpt_eval.py \
            gpt_model_file=PATH_TO_MODEL \
            inference.greedy=False \
            inference.top_k=0 \
            inference.top_p=0.9 \
            inference.repetition_penalty=1.2 \
            inference.add_BOS=True \
            trainer.devices=1 \
            trainer.num_nodes=1 \
            tensor_model_parallel_size=1 \
            pipeline_model_parallel_size=1 \
            prompts=[prompt1,prompt2]

    d. if you don't need to generate tokens and only need the model to compute logprobs:
        python megatron_gpt_eval.py \
            gpt_model_file=PATH_TO_MODEL \
            inference.compute_logprob=True \
            trainer.devices=1 \
            trainer.num_nodes=1 \
            tensor_model_parallel_size=1 \
            pipeline_model_parallel_size=1 \
            prompts=[text to get logprob]

    e. launch the inference server:
        python megatron_gpt_eval.py \
            gpt_model_file=PATH_TO_MODEL \
            trainer.devices=1 \
            trainer.num_nodes=1 \
            tensor_model_parallel_size=1 \
            pipeline_model_parallel_size=1 \
            server=True

        To send a request to the server, here is an example client:
        ```python
        import json
        import requests

        batch_size = 8
        port_num = 5555
        headers = {"Content-Type": "application/json"}


        def request_data(data):
            resp = requests.put('http://localhost:{}/generate'.format(port_num),
                                data=json.dumps(data),
                                headers=headers)
            sentences = resp.json()['sentences']
            return sentences


        data = {
            "sentences": [""] * batch_size,
            "tokens_to_generate": 300,
            "temperature": 1.0,
            "add_BOS": True,
            "top_k": 0,
            "top_p": 0.9,
            "greedy": False,
            "all_probs": False,
            "repetition_penalty": 1.2,
            "min_tokens_to_generate": 2,
        }

        sentences = request_data(data)
        ```
"""

|
||||
if not torch.cuda.is_available():
|
||||
raise EnvironmentError("GPU is needed for the inference")
|
||||
|
||||
|
||||
class RequestDataSet(Dataset):
|
||||
def __init__(self, sentences):
|
||||
super().__init__()
|
||||
self.sentences = sentences
|
||||
|
||||
def __len__(self,):
|
||||
return len(self.sentences)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
return self.sentences[idx]
|
||||
|
||||
|
||||
@hydra_runner(config_path="conf", config_name="megatron_gpt_inference")
def main(cfg) -> None:

    # trainer required for restoring model parallel models
    trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
    assert (
        cfg.trainer.devices * cfg.trainer.num_nodes
        == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
    ), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

    if cfg.gpt_model_file:
        save_restore_connector = NLPSaveRestoreConnector()
        if os.path.isdir(cfg.gpt_model_file):
            save_restore_connector.model_extracted_dir = cfg.gpt_model_file

        pretrained_cfg = MegatronGPTModel.restore_from(
            restore_path=cfg.gpt_model_file,
            trainer=trainer,
            return_config=True,
            save_restore_connector=save_restore_connector,
        )
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.sequence_parallel = False
            pretrained_cfg.activations_checkpoint_granularity = None
            pretrained_cfg.activations_checkpoint_method = None
        model = MegatronGPTModel.restore_from(
            restore_path=cfg.gpt_model_file,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=save_restore_connector,
        )
    elif cfg.checkpoint_dir:
        app_state = AppState()
        if cfg.tensor_model_parallel_size > 1 or cfg.pipeline_model_parallel_size > 1:
            app_state.model_parallel_size = cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
            app_state.tensor_model_parallel_size = cfg.tensor_model_parallel_size
            app_state.pipeline_model_parallel_size = cfg.pipeline_model_parallel_size
            (
                app_state.tensor_model_parallel_rank,
                app_state.pipeline_model_parallel_rank,
                app_state.model_parallel_size,
                app_state.data_parallel_size,
                app_state.pipeline_model_parallel_split_rank,
                app_state.virtual_pipeline_model_parallel_rank,
            ) = fake_initialize_model_parallel(
                world_size=app_state.model_parallel_size,
                rank=trainer.global_rank,
                tensor_model_parallel_size_=cfg.tensor_model_parallel_size,
                pipeline_model_parallel_size_=cfg.pipeline_model_parallel_size,
                pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
            )
        checkpoint_path = inject_model_parallel_rank(os.path.join(cfg.checkpoint_dir, cfg.checkpoint_name))
        model = MegatronGPTModel.load_from_checkpoint(checkpoint_path, hparams_file=cfg.hparams_file, trainer=trainer)
    else:
        raise ValueError("need at least a nemo file or checkpoint dir")

    model.freeze()

    # Have to turn off activations_checkpoint_method for inference
    try:
        model.model.language_model.encoder.activations_checkpoint_method = None
    except AttributeError:
        pass

    length_params: LengthParam = {
        "max_length": cfg.inference.tokens_to_generate,
        "min_length": cfg.inference.min_tokens_to_generate,
    }

    sampling_params: SamplingParam = {
        "use_greedy": cfg.inference.greedy,
        "temperature": cfg.inference.temperature,
        "top_k": cfg.inference.top_k,
        "top_p": cfg.inference.top_p,
        "repetition_penalty": cfg.inference.repetition_penalty,
        "add_BOS": cfg.inference.add_BOS,
        "all_probs": cfg.inference.all_probs,
        "compute_logprob": cfg.inference.compute_logprob,
    }

    # First method of running text generation: call model.generate
    response = model.generate(
        inputs=OmegaConf.to_container(cfg.prompts), length_params=length_params, sampling_params=sampling_params
    )

    print("***************************")
    print(response)
    print("***************************")

    # Second method of running text generation: call trainer.predict
    ds = RequestDataSet(OmegaConf.to_container(cfg.prompts))
    request_dl = DataLoader(dataset=ds, batch_size=2)
    config = OmegaConf.to_container(cfg.inference)
    model.set_inference_config(config)
    response = trainer.predict(model, request_dl)

    print("***************************")
    print(response)
    print("***************************")

    # Third method of running text generation: use the inference server
    if cfg.server:
        if parallel_state.is_pipeline_first_stage() and parallel_state.get_tensor_model_parallel_rank() == 0:
            if cfg.web_server:
                thread = threading.Thread(target=get_demo, daemon=True, args=(cfg.share, cfg.username, cfg.password))
                thread.start()
            server = MegatronServer(model.cuda())
            server.run("0.0.0.0", port=cfg.port)

        # Non-zero ranks block here, waiting for rank 0 to broadcast a generation request.
        while True:
            choice = torch.cuda.LongTensor(1)
            torch.distributed.broadcast(choice, 0)
            if choice[0].item() == 0:
                generate(model.cuda())


class MegatronNemo(CausalLM):
    def __init__(
        self, model_id: str, revision: Optional[str] = None, quantize: bool = False
    ):
        # NOTE: `trainer` is not defined in this scope; a pytorch_lightning Trainer
        # has to be constructed (or passed in) before restore_from can run.
        save_restore_connector = NLPSaveRestoreConnector()
        if os.path.isdir(model_id):
            save_restore_connector.model_extracted_dir = model_id

        pretrained_cfg = MegatronGPTModel.restore_from(
            restore_path=model_id,
            trainer=trainer,
            return_config=True,
            save_restore_connector=save_restore_connector,
        )
        OmegaConf.set_struct(pretrained_cfg, True)
        with open_dict(pretrained_cfg):
            pretrained_cfg.sequence_parallel = False
            pretrained_cfg.activations_checkpoint_granularity = None
            pretrained_cfg.activations_checkpoint_method = None
        self.model = MegatronGPTModel.restore_from(
            restore_path=model_id,
            trainer=trainer,
            override_config_path=pretrained_cfg,
            save_restore_connector=save_restore_connector,
        )

    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
    ) -> Tuple[torch.Tensor, List[Tuple[torch.Tensor, torch.Tensor]]]:
        """Override forward to ignore position_ids."""

        # Model Forward
        outputs = self.model.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
        )
        return outputs.logits, outputs.past_key_values


class GPTNeoxSharded(GPTNeox):
    """Tensor-parallel GPT-NeoX: loads safetensors weights sharded across ranks."""

    def __init__(
        self, model_id: str, revision: Optional[str] = None, quantize: bool = False
    ):
        self.process_group, self.rank, self.world_size = initialize_torch_distributed()
        self.master = self.rank == 0
        if torch.cuda.is_available():
            device = torch.device(f"cuda:{self.rank}")
            dtype = torch.bfloat16
        else:
            device = torch.device("cpu")
            dtype = torch.float32

        tokenizer = AutoTokenizer.from_pretrained(
            model_id, revision=revision, padding_side="left"
        )
        tokenizer.pad_token = tokenizer.eos_token

        config = AutoConfig.from_pretrained(
            model_id, revision=revision, tp_parallel=True
        )

        torch.distributed.barrier(group=self.process_group)
        filenames = weight_files(model_id, revision=revision, extension=".safetensors")

        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)

        torch.distributed.barrier(group=self.process_group)
        self.load_weights(
            model,
            filenames,
            quantize=quantize,
            device=device,
            rank=self.rank,
            world_size=self.world_size,
        )
        self.model = model.eval().to(dtype)
        torch.distributed.barrier(group=self.process_group)
        super(CausalLM, self).__init__(
            tokenizer=tokenizer,
            device=device,
        )

    @staticmethod
    def load_weights(
        model,
        filenames: List[str],
        quantize: bool,
        device: torch.device,
        rank: int,
        world_size: int,
    ):
        parameters = dict(model.named_parameters())
        for file in filenames:
            with safe_open(
                file, framework="pt", device=str(device) if not quantize else "cpu"
            ) as f:
                for name in f.keys():
                    module_name, param_name = name.rsplit(".", 1)
                    module = model.get_submodule(module_name)

                    current_parameter_tensor = parameters.get(name, None)

                    slice_ = f.get_slice(name)

                    if isinstance(module, TensorParallelColumnLinear):
                        size = slice_.get_shape()[0]
                        block_size = size // world_size
                        start = rank * block_size
                        stop = (rank + 1) * block_size
                        tensor = slice_[start:stop]
                    elif isinstance(module, TensorParallelRowLinear):
                        if param_name == "weight":
                            size = slice_.get_shape()[1]
                            block_size = size // world_size
                            start = rank * block_size
                            stop = (rank + 1) * block_size
                            tensor = slice_[:, start:stop]
                        else:
                            tensor = slice_[:]
                            # XXX: Hack for RowLinear to add the bias only once.
                            if rank != 0:
                                tensor = torch.zeros_like(tensor)
                    elif isinstance(module, TensorParallelEmbedding):
                        size = slice_.get_shape()[0]
                        block_size = size // world_size
                        start = rank * block_size
                        stop = (rank + 1) * block_size
                        tensor = slice_[start:stop]
                    elif name == "embed_out.weight" and model.gpt_neox.tp_embeddings:
                        size = slice_.get_shape()[0]
                        block_size = size // world_size
                        start = rank * block_size
                        stop = (rank + 1) * block_size
                        tensor = slice_[start:stop]
                    else:
                        try:
                            tensor = slice_[:]
                        except Exception:
                            tensor = f.get_tensor(name)

                    if (
                        current_parameter_tensor is not None
                        and current_parameter_tensor.shape != tensor.shape
                    ):
                        raise ValueError(
                            f"Name {name} -- Current {current_parameter_tensor.shape} and got {tensor.shape}"
                        )

                    tensor = tensor.contiguous()

                    if quantize:
                        if not HAS_BITS_AND_BYTES:
                            raise ImportError(
                                "bitsandbytes is not available on your machine either because it is not installed "
                                "or you don't have a GPU.\n"
                                "You can install it with `pip install bitsandbytes`."
                            )

                        if (
                            type(module)
                            in [TensorParallelRowLinear, TensorParallelColumnLinear]
                            and param_name == "weight"
                        ):
                            tensor = Int8Params(
                                tensor,
                                has_fp16_weights=False,
                                requires_grad=False,
                            ).to(device)
                            state = bnb.MatmulLtState()
                            state.threshold = 6.0
                            state.has_fp16_weights = False
                            state.memory_efficient_backward = False
                            state.use_pool = True
                            state.CB = tensor.CB
                            state.SCB = tensor.SCB
                            tensor.CB = None
                            tensor.SCB = None

                            def replace_linear(state):
                                def linear(input, weight, bias):
                                    out = bnb.matmul(
                                        input,
                                        weight,
                                        state=state,
                                        threshold=state.threshold,
                                        bias=bias,
                                    )

                                    if state.CB is not None:
                                        # we converted 8-bit row major to turing/ampere format
                                        # in the first inference pass
                                        # we no longer need the row-major weight
                                        del state.CB
                                        weight.data = state.CxB

                                    return out

                                return linear

                            module.linear = replace_linear(state)

                        else:
                            tensor = tensor.to(device)

                    if current_parameter_tensor is not None:
                        module._parameters[param_name] = tensor
                    else:
                        module._buffers[param_name] = tensor

    def forward(
        self, input_ids, attention_mask, position_ids, past_key_values: Optional = None
    ):
        if self.model.gpt_neox.tp_embeddings:
            outputs = self.model.forward(
                input_ids=input_ids,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                use_cache=True,
            )

            # Logits are sharded, so we need to gather them
            logits = [torch.empty_like(outputs.logits) for _ in range(self.world_size)]
            torch.distributed.all_gather(
                logits, outputs.logits, group=self.process_group
            )
            logits = torch.cat(logits, dim=2)

            return logits, outputs.past_key_values
        # While the model itself is sharded, the embeddings might not be,
        # since they might not be divisible by the number of shards.
        else:
            return super(GPTNeoxSharded, self).forward(
                input_ids, attention_mask, position_ids, past_key_values
            )
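The weight-loading loop in `GPTNeoxSharded.load_weights` above boils down to each rank keeping one contiguous block of a parameter along the sharded dimension. A minimal sketch of that slicing, using made-up shapes and a made-up rank/world size:

```python
# Minimal sketch (hypothetical shapes) of the sharding scheme used in
# GPTNeoxSharded.load_weights: rank r keeps rows [r*block : (r+1)*block] of a
# column-parallel weight, or the matching columns of a row-parallel weight.
import torch

world_size, rank = 4, 1                      # assumed values for illustration
weight = torch.arange(32.0).reshape(8, 4)    # full, unsharded parameter

block_size = weight.shape[0] // world_size   # column-parallel: shard dim 0
col_shard = weight[rank * block_size:(rank + 1) * block_size]

block_size = weight.shape[1] // world_size   # row-parallel: shard dim 1
row_shard = weight[:, rank * block_size:(rank + 1) * block_size]

print(col_shard.shape, row_shard.shape)      # torch.Size([2, 4]) torch.Size([8, 1])
```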