fix build tokenizer in quantize and remove duplicate import (#768)
# What does this PR do?

Fixes #732 and removes a duplicate `AutoTokenizer` import.

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.

@OlivierDehaene OR @Narsil
Commit f91e9d282d (parent 6ec5288ab7)
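The substance of the change: each calibration-data loader now builds its tokenizer in a try/except, preferring the slow tokenizer and falling back to the fast one, with `trust_remote_code` threaded through both branches. A minimal sketch of the pattern, assuming the helper name `build_tokenizer` for illustration (the diff below inlines this in each `get_*` loader):

```python
from transformers import AutoTokenizer


def build_tokenizer(model_id: str, trust_remote_code: bool):
    # Prefer the slow (Python) tokenizer, matching the existing loaders.
    try:
        return AutoTokenizer.from_pretrained(
            model_id, use_fast=False, trust_remote_code=trust_remote_code
        )
    except Exception:
        # Some models only ship a fast (Rust) tokenizer; fall back to it.
        return AutoTokenizer.from_pretrained(
            model_id, use_fast=True, trust_remote_code=trust_remote_code
        )
```

Note that the diff itself uses a bare `except:`; the narrower `except Exception:` above is a stylistic choice in this sketch, not part of the PR.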
```diff
@@ -360,15 +360,21 @@ class GPTQ:
         torch.cuda.empty_cache()


-def get_wikitext2(nsamples, seed, seqlen, model_id):
+def get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code):
     from datasets import load_dataset

     traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
     testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

     from transformers import AutoTokenizer

-    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=False, trust_remote_code=trust_remote_code
+        )
+    except:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=True, trust_remote_code=trust_remote_code
+        )
     trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
     testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")
```
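`get_ptb`, `get_c4`, `get_ptb_new`, and `get_c4_new` below already wrapped tokenizer construction in this try/except; the hunk above brings `get_wikitext2` in line with them and drops its now-redundant unconditional `from_pretrained` call.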
```diff
@@ -386,18 +392,21 @@ def get_wikitext2(nsamples, seed, seqlen, model_id):
     return trainloader, testenc


-def get_ptb(nsamples, seed, seqlen, model_id):
+def get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code):
     from datasets import load_dataset

     traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
     valdata = load_dataset("ptb_text_only", "penn_treebank", split="validation")

     from transformers import AutoTokenizer

     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=False, trust_remote_code=trust_remote_code
+        )
     except:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=True, trust_remote_code=trust_remote_code
+        )

     trainenc = tokenizer("\n\n".join(traindata["sentence"]), return_tensors="pt")
     testenc = tokenizer("\n\n".join(valdata["sentence"]), return_tensors="pt")
@@ -415,7 +424,7 @@ def get_ptb(nsamples, seed, seqlen, model_id):
     return trainloader, testenc


-def get_c4(nsamples, seed, seqlen, model_id):
+def get_c4(nsamples, seed, seqlen, model_id, trust_remote_code):
     from datasets import load_dataset

     traindata = load_dataset(
@@ -433,12 +442,14 @@ def get_c4(nsamples, seed, seqlen, model_id):
         use_auth_token=False,
     )

     from transformers import AutoTokenizer

     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=False, trust_remote_code=trust_remote_code
+        )
     except:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=True, trust_remote_code=trust_remote_code
+        )

     import random
@@ -481,18 +492,21 @@ def get_c4(nsamples, seed, seqlen, model_id):
     return trainloader, valenc


-def get_ptb_new(nsamples, seed, seqlen, model_id):
+def get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code):
     from datasets import load_dataset

     traindata = load_dataset("ptb_text_only", "penn_treebank", split="train")
     testdata = load_dataset("ptb_text_only", "penn_treebank", split="test")

     from transformers import AutoTokenizer

     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=False, trust_remote_code=trust_remote_code
+        )
     except:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=True, trust_remote_code=trust_remote_code
+        )

     trainenc = tokenizer(" ".join(traindata["sentence"]), return_tensors="pt")
     testenc = tokenizer(" ".join(testdata["sentence"]), return_tensors="pt")
@@ -510,7 +524,7 @@ def get_ptb_new(nsamples, seed, seqlen, model_id):
     return trainloader, testenc


-def get_c4_new(nsamples, seed, seqlen, model_id):
+def get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code):
     from datasets import load_dataset

     traindata = load_dataset(
@@ -526,12 +540,14 @@ def get_c4_new(nsamples, seed, seqlen, model_id):
         split="validation",
     )

     from transformers import AutoTokenizer

     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=False, trust_remote_code=trust_remote_code
+        )
     except:
-        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, use_fast=True, trust_remote_code=trust_remote_code
+        )

     import random
@@ -562,17 +578,17 @@ def get_c4_new(nsamples, seed, seqlen, model_id):
     return trainloader, valenc


-def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model_id=""):
+def get_loaders(name, nsamples=128, seed=0, seqlen=2048, model_id="", trust_remote_code=False):
     if "wikitext2" in name:
-        return get_wikitext2(nsamples, seed, seqlen, model_id)
+        return get_wikitext2(nsamples, seed, seqlen, model_id, trust_remote_code)
     if "ptb" in name:
         if "new" in name:
-            return get_ptb_new(nsamples, seed, seqlen, model_id)
-        return get_ptb(nsamples, seed, seqlen, model_id)
+            return get_ptb_new(nsamples, seed, seqlen, model_id, trust_remote_code)
+        return get_ptb(nsamples, seed, seqlen, model_id, trust_remote_code)
     if "c4" in name:
         if "new" in name:
-            return get_c4_new(nsamples, seed, seqlen, model_id)
-        return get_c4(nsamples, seed, seqlen, model_id)
+            return get_c4_new(nsamples, seed, seqlen, model_id, trust_remote_code)
+        return get_c4(nsamples, seed, seqlen, model_id, trust_remote_code)


 def find_layers(module, layers=(nn.Conv2d, nn.Linear), name=""):
```
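With the new keyword in place, callers can opt in per model. A hypothetical call with example values (the model id is purely illustrative):

```python
# Fetch 128 WikiText-2 calibration samples for a model whose tokenizer
# or config relies on custom code hosted on the Hub.
dataloader, testloader = get_loaders(
    "wikitext2",
    nsamples=128,
    seed=0,
    seqlen=2048,
    model_id="tiiuae/falcon-7b",  # illustrative example
    trust_remote_code=True,
)
```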
```diff
@@ -906,7 +922,12 @@ def quantize(
     seed = None

     dataloader, testloader = get_loaders(
-        dataset, nsamples=nsamples, seed=seed, model_id=model_id, seqlen=model.seqlen
+        dataset,
+        nsamples=nsamples,
+        seed=seed,
+        model_id=model_id,
+        seqlen=model.seqlen,
+        trust_remote_code=trust_remote_code
     )

     tick = time.time()
```