feat(server): Add native support for PEFT Lora models (#762)
- Detects `peft` models by finding `adapter_config.json`.
- This triggers a dedicated `download-weights` path.
- That path loads the adapter config and finds the base model_id.
- It loads the base model, then the peft model, then calls `merge_and_unload()`, then `save_pretrained(.., safe_serialization=True)`, and adds back the config + tokenizer.
- The chosen location is a **local folder with the name of the user-chosen model id**.

PROs:

- Easier than expecting the user to merge manually.
- Barely any change outside of the `download-weights` command, which means everything works in a single load.
- Should enable out-of-the-box SM + HFE.

CONs:

- Creates a local merged model in an unusual location, potentially not saved across docker reloads, or overwriting some files if the PEFT model itself was local and contained other files in addition to the lora weights.

Alternatives considered:

- Add `local_files_only=True` everywhere (discarded because it would be a massive code change for not a good enough reason).
- Return something to `launcher` about the new model-id (a cleaner location for this new model), but that would introduce new communication somewhere we didn't need it before.
- Use the HF cache folder and *stop* the flow after `download-weights`, asking the user to restart with the actual local model location.

Fix #482
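In code, the flow is roughly the following minimal sketch, mirroring the new `utils/peft.py` further down (the adapter id is a placeholder; the APIs are peft's `AutoPeftModelForCausalLM` and transformers' `AutoTokenizer`):

```python
# Minimal sketch of the merge flow (placeholder adapter id).
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

adapter_id = "some-user/some-lora-adapter"  # placeholder

# Reads adapter_config.json, downloads the base model, attaches the lora.
model = AutoPeftModelForCausalLM.from_pretrained(adapter_id, torch_dtype=torch.float16)
base_model_id = model.peft_config["default"].base_model_name_or_path

# Fuse the lora deltas into the base weights and drop the peft wrapper.
model = model.merge_and_unload()

# Persist as safetensors under the user-chosen model id, locally,
# and add back the config + tokenizer so a single load works afterwards.
model.save_pretrained(adapter_id, safe_serialization=True)
model.config.save_pretrained(adapter_id)
AutoTokenizer.from_pretrained(base_model_id).save_pretrained(adapter_id)
```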
Parent: 8b0d608f1f
Commit: ac736fd89c
launcher/src/main.rs:

```diff
@@ -716,6 +716,11 @@ fn download_convert_model(args: &Args, running: Arc<AtomicBool>) -> Result<(), L
         download_args.push(revision.to_string())
     }

+    // Trust remote code for automatic peft fusion
+    if args.trust_remote_code {
+        download_args.push("--trust-remote-code".to_string());
+    }
+
     // Copy current process env
     let mut envs: Vec<(OsString, OsString)> = env::vars_os().collect();
```
File diff suppressed because it is too large.
server/pyproject.toml:

```diff
@@ -30,6 +30,7 @@ transformers = "4.29.2"
 einops = "^0.6.1"
 texttable = { version = "^1.6.7", optional = true }
 datasets = { version = "^2.14.0", optional = true }
+peft = "^0.4.0"

 [tool.poetry.extras]
 accelerate = ["accelerate"]
```
server/requirements.txt:

```diff
@@ -1,22 +1,13 @@
-accelerate==0.19.0 ; python_version >= "3.9" and python_version < "4.0"
-aiohttp==3.8.5 ; python_version >= "3.9" and python_version < "4.0"
-aiosignal==1.3.1 ; python_version >= "3.9" and python_version < "4.0"
-async-timeout==4.0.2 ; python_version >= "3.9" and python_version < "4.0"
-attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0"
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
 bitsandbytes==0.38.1 ; python_version >= "3.9" and python_version < "4.0"
 certifi==2023.5.7 ; python_version >= "3.9" and python_version < "4.0"
 charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
 click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
-colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32" or python_version >= "3.9" and python_version < "4.0" and platform_system == "Windows"
-datasets==2.14.0 ; python_version >= "3.9" and python_version < "4.0"
+colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" and (sys_platform == "win32" or platform_system == "Windows")
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "4.0"
-dill==0.3.7 ; python_version >= "3.9" and python_version < "4.0"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 filelock==3.12.2 ; python_version >= "3.9" and python_version < "4.0"
-frozenlist==1.4.0 ; python_version >= "3.9" and python_version < "4.0"
 fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "4.0"
-fsspec[http]==2023.6.0 ; python_version >= "3.9" and python_version < "4.0"
 googleapis-common-protos==1.59.1 ; python_version >= "3.9" and python_version < "4.0"
 grpc-interceptor==0.15.2 ; python_version >= "3.9" and python_version < "4.0"
 grpcio-reflection==1.56.0 ; python_version >= "3.9" and python_version < "4.0"
@@ -29,10 +20,8 @@ jinja2==3.1.2 ; python_version >= "3.9" and python_version < "4.0"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0"
 markupsafe==2.1.3 ; python_version >= "3.9" and python_version < "4.0"
 mpmath==1.3.0 ; python_version >= "3.9" and python_version < "4.0"
-multidict==6.0.4 ; python_version >= "3.9" and python_version < "4.0"
-multiprocess==0.70.15 ; python_version >= "3.9" and python_version < "4.0"
 networkx==3.1 ; python_version >= "3.9" and python_version < "4.0"
-numpy==1.25.0 ; python_version < "4.0" and python_version >= "3.9"
+numpy==1.25.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-api==1.15.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-exporter-otlp-proto-grpc==1.15.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-exporter-otlp-proto-http==1.15.0 ; python_version >= "3.9" and python_version < "4.0"
@@ -43,30 +32,22 @@ opentelemetry-proto==1.15.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-sdk==1.15.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-semantic-conventions==0.36b0 ; python_version >= "3.9" and python_version < "4.0"
 packaging==23.1 ; python_version >= "3.9" and python_version < "4.0"
-pandas==2.0.3 ; python_version >= "3.9" and python_version < "4.0"
+peft==0.4.0 ; python_version >= "3.9" and python_version < "4.0"
 protobuf==4.23.3 ; python_version >= "3.9" and python_version < "4.0"
 psutil==5.9.5 ; python_version >= "3.9" and python_version < "4.0"
-pyarrow==12.0.1 ; python_version >= "3.9" and python_version < "4.0"
-python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0"
-pytz==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 pyyaml==6.0 ; python_version >= "3.9" and python_version < "4.0"
 regex==2023.6.3 ; python_version >= "3.9" and python_version < "4.0"
 requests==2.31.0 ; python_version >= "3.9" and python_version < "4.0"
 safetensors==0.3.1 ; python_version >= "3.9" and python_version < "4.0"
 sentencepiece==0.1.99 ; python_version >= "3.9" and python_version < "4.0"
 setuptools==68.0.0 ; python_version >= "3.9" and python_version < "4.0"
-six==1.16.0 ; python_version >= "3.9" and python_version < "4.0"
 sympy==1.12 ; python_version >= "3.9" and python_version < "4.0"
-texttable==1.6.7 ; python_version >= "3.9" and python_version < "4.0"
 tokenizers==0.13.3 ; python_version >= "3.9" and python_version < "4.0"
 torch==2.0.1 ; python_version >= "3.9" and python_version < "4.0"
 tqdm==4.65.0 ; python_version >= "3.9" and python_version < "4.0"
 transformers==4.29.2 ; python_version >= "3.9" and python_version < "4.0"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 typing-extensions==4.7.1 ; python_version >= "3.9" and python_version < "4.0"
-tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0"
 urllib3==2.0.3 ; python_version >= "3.9" and python_version < "4.0"
 win32-setctime==1.1.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32"
 wrapt==1.15.0 ; python_version >= "3.9" and python_version < "4.0"
-xxhash==3.2.0 ; python_version >= "3.9" and python_version < "4.0"
-yarl==1.9.2 ; python_version >= "3.9" and python_version < "4.0"
```
server/text_generation_server/cli.py:

```diff
@@ -6,6 +6,7 @@ from pathlib import Path
 from loguru import logger
 from typing import Optional
 from enum import Enum
+from huggingface_hub import hf_hub_download


 app = typer.Typer()
@@ -88,6 +89,7 @@ def download_weights(
     auto_convert: bool = True,
     logger_level: str = "INFO",
     json_output: bool = False,
+    trust_remote_code: bool = False,
 ):
     # Remove default handler
     logger.remove()
@@ -118,6 +120,12 @@
     ) is not None

+    if not is_local_model:
+        try:
+            adapter_config_filename = hf_hub_download(model_id, revision=revision, filename="adapter_config.json")
+            utils.download_and_unload_peft(model_id, revision, trust_remote_code=trust_remote_code)
+        except (utils.LocalEntryNotFoundError, utils.EntryNotFoundError):
+            pass

     # Try to download weights from the hub
     try:
         filenames = utils.weight_hub_files(model_id, revision, extension)
```
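The detection idiom above leans on `hf_hub_download` raising an entry-not-found error when the repo does not ship `adapter_config.json`. A standalone sketch of the same trick (the repo id is a placeholder; the exception classes live in `huggingface_hub.utils`):

```python
# Standalone sketch of the adapter detection idiom (placeholder repo id).
from typing import Optional

from huggingface_hub import hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, LocalEntryNotFoundError


def is_peft_model(model_id: str, revision: Optional[str] = None) -> bool:
    try:
        # Succeeds only if the repo actually contains an adapter_config.json.
        hf_hub_download(model_id, filename="adapter_config.json", revision=revision)
        return True
    except (EntryNotFoundError, LocalEntryNotFoundError):
        return False


print(is_peft_model("some-user/some-lora-adapter"))  # placeholder id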
server/text_generation_server/models/flash_rw.py:

```diff
@@ -54,7 +54,7 @@ class FlashRWSharded(FlashCausalLM):
             device,
             dtype,
             process_group=self.process_group,
-            aliases={"transformer.word_embeddings.weight": ["lm_head.weight"]},
+            aliases={"lm_head.weight": ["transformer.word_embeddings.weight"]},
         )

         config.quantize = quantize
```
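The direction of the alias map matters for checkpoints that tie `lm_head.weight` to the embedding weight. A hedged sketch of how such a map could be consulted, assuming `aliases` maps a requested tensor name to fallback names (a simplified stand-in, not the actual `Weights` class):

```python
# Illustration only: resolving a tensor name through an alias map.
from typing import Dict, List


def resolve(name: str, routing: Dict[str, str], aliases: Dict[str, List[str]]) -> str:
    # Direct hit: the checkpoint stores the tensor under its own name.
    if name in routing:
        return routing[name]
    # Fallback: e.g. lm_head.weight resolves to the tied embedding weight.
    for alias in aliases.get(name, []):
        if alias in routing:
            return routing[alias]
    raise RuntimeError(f"weight {name} not found")


routing = {"transformer.word_embeddings.weight": "model.safetensors"}
aliases = {"lm_head.weight": ["transformer.word_embeddings.weight"]}
print(resolve("lm_head.weight", routing, aliases))  # model.safetensors
```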
server/text_generation_server/utils/__init__.py:

```diff
@@ -1,6 +1,7 @@
 from text_generation_server.utils.convert import convert_file, convert_files
 from text_generation_server.utils.dist import initialize_torch_distributed
 from text_generation_server.utils.weights import Weights
+from text_generation_server.utils.peft import download_and_unload_peft
 from text_generation_server.utils.hub import (
     weight_files,
     weight_hub_files,
@@ -26,6 +27,7 @@ __all__ = [
     "weight_files",
     "weight_hub_files",
     "download_weights",
+    "download_and_unload_peft",
     "EntryNotFoundError",
     "HeterogeneousNextTokenChooser",
     "LocalEntryNotFoundError",
```
server/text_generation_server/utils/peft.py (new file):

```diff
@@ -0,0 +1,46 @@
+import os
+import json
+from loguru import logger
+import torch
+
+from transformers import AutoTokenizer
+from peft import AutoPeftModelForCausalLM, AutoPeftModelForSeq2SeqLM
+
+
+def download_and_unload_peft(model_id, revision, trust_remote_code):
+    torch_dtype = torch.float16
+
+    logger.info("Peft model detected.")
+    logger.info("Loading the model; it might take a while without feedback.")
+    try:
+        # Try a causal-LM adapter first, fall back to seq2seq below.
+        model = AutoPeftModelForCausalLM.from_pretrained(
+            model_id,
+            revision=revision,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=True,
+        )
+    except Exception:
+        model = AutoPeftModelForSeq2SeqLM.from_pretrained(
+            model_id,
+            revision=revision,
+            torch_dtype=torch_dtype,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=True,
+        )
+    logger.info("Loaded.")
+    logger.info("Merging the lora weights.")
+
+    base_model_id = model.peft_config["default"].base_model_name_or_path
+
+    # Fuse the lora weights into the base model and drop the peft wrapper.
+    model = model.merge_and_unload()
+
+    # Save the merged model into a local folder named after the model id.
+    os.makedirs(model_id, exist_ok=True)
+    cache_dir = model_id
+    logger.info(f"Saving the newly created merged model to {cache_dir}")
+    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+    model.save_pretrained(cache_dir, safe_serialization=True)
+    model.config.save_pretrained(cache_dir)
+    tokenizer.save_pretrained(cache_dir)
```
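For illustration, a hedged sketch of invoking the helper directly; normally the `download-weights` path triggers it automatically once `adapter_config.json` is found (the adapter id below is a placeholder):

```python
# Hypothetical direct invocation of the new helper (placeholder adapter id).
from text_generation_server.utils import download_and_unload_peft

# The merged model lands in a local folder named after the model id,
# which the server then loads like any regular model in a single pass.
download_and_unload_peft(
    "some-user/llama-lora-adapter",  # placeholder
    revision=None,
    trust_remote_code=False,
)
```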