Add initial support for compressed-tensors checkpoints (#2732)

compressed-tensors is a safetensors extension for sparse, quantized
tensors. The format is more flexible than the earlier AWQ/GPTQ/FP8
quantization formats, because:

- Different quantizer configurations can be used for different targets.
- The format can specify input/output quantizers in addition to weight
  quantizers.
- Layers can be selectively excluded from quantization.

(An example quantization configuration is sketched below.)

This change adds a dependency on the `compressed-tensors` package for
its configuration parsing and layer matching functionality.

The following types of quantization are supported in this PR:

- W8A16 and W4A16 INT using GPTQ-Marlin kernels.
- W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels.

Support for other quantization types will be added in subsequent PRs.
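
For reference, a minimal sketch of the quantization configuration that the
new loader consumes from a checkpoint's `config.json`. All values below are
illustrative assumptions (loosely modeled on an FP8 W8A8 checkpoint such as
the one used in the integration tests), not the contents of any specific
model:

# Illustrative compressed-tensors configuration, as a Python dict.
# Field names follow the compressed-tensors QuantizationConfig schema;
# the concrete values are assumptions, not from a specific checkpoint.
quantization_config = {
    "quant_method": "compressed-tensors",
    "format": "float-quantized",         # FP8 weights
    "quantization_status": "compressed",
    "ignore": ["lm_head"],               # layers excluded from quantization
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],       # groups can target different modules
            "weights": {
                "type": "float",
                "num_bits": 8,
                "strategy": "tensor",
                "dynamic": False,
            },
            "input_activations": {
                "type": "float",
                "num_bits": 8,
                "strategy": "token",
                "dynamic": True,
            },
        }
    },
}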
Daniël de Kok 2024-11-10 13:54:07 +01:00 committed by GitHub
parent 97f7a22f0b
commit a785000842
28 changed files with 2052 additions and 78 deletions

View File

@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
pip install nvidia-nccl-cu12==2.22.3
ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2

View File

@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_rocm.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark

View File

@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
make gen-server && \
pip install -r requirements_intel.txt && \
pip install ".[accelerate, peft, outlines]" --no-cache-dir
pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir
ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest

View File

@ -63,6 +63,7 @@ Options:
Possible values:
- awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
- compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
- eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
- exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
- gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
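
Note: as the model-loading changes further down show, passing --quantize
compressed-tensors is optional for such checkpoints; when the checkpoint's
config.json declares quant_method "compressed-tensors" (or carries the
legacy compression_config section), the method is auto-selected.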

View File

@ -978,15 +978,16 @@
"nixpkgs": "nixpkgs_6"
},
"locked": {
"lastModified": 1730724647,
"narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
"lastModified": 1730795478,
"narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=",
"owner": "huggingface",
"repo": "text-generation-inference-nix",
"rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
"rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56",
"type": "github"
},
"original": {
"owner": "huggingface",
"ref": "compressed-tensors-0.7.1",
"repo": "text-generation-inference-nix",
"type": "github"
}

View File

@ -5,7 +5,7 @@
inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
};
nix-filter.url = "github:numtide/nix-filter";
tgi-nix.url = "github:huggingface/text-generation-inference-nix";
tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1";
nixpkgs.follows = "tgi-nix/nixpkgs";
flake-utils.url = "github:numtide/flake-utils";
rust-overlay = {

View File

@ -0,0 +1,104 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25073242,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.921875,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027918816,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.38891602,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}

View File

@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 5380,
"logprob": -0.23840332,
"special": false,
"text": "?\n"
},
{
"id": 34564,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 6975,
"logprob": 0.0,
"special": false,
"text": " learning"
},
{
"id": 11,
"logprob": 0.0,
"special": false,
"text": ","
},
{
"id": 1101,
"logprob": -1.2011719,
"special": false,
"text": " also"
},
{
"id": 3967,
"logprob": 0.0,
"special": false,
"text": " known"
},
{
"id": 439,
"logprob": 0.0,
"special": false,
"text": " as"
},
{
"id": 30828,
"logprob": 0.0,
"special": false,
"text": " neural"
},
{
"id": 4009,
"logprob": -0.6777344,
"special": false,
"text": " network"
},
{
"id": 477,
"logprob": 0.0,
"special": false,
"text": " or"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}

View File

@ -0,0 +1,418 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.609375,
"text": "What"
},
{
"id": 374,
"logprob": -0.92529297,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94628906,
"text": " learning"
},
{
"id": 30,
"logprob": -2.9042969,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.8769531,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.0076942444,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25146484,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.097595215,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.9248047,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027513504,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.043151855,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011043549,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 128000,
"logprob": null,
"text": "<|begin_of_text|>"
},
{
"id": 3923,
"logprob": -7.6054688,
"text": "What"
},
{
"id": 374,
"logprob": -0.92089844,
"text": " is"
},
{
"id": 5655,
"logprob": -10.0,
"text": " deep"
},
{
"id": 6975,
"logprob": -0.94433594,
"text": " learning"
},
{
"id": 30,
"logprob": -2.90625,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 18682,
"logprob": -0.875,
"special": false,
"text": " Deep"
},
{
"id": 6975,
"logprob": -0.007698059,
"special": false,
"text": " learning"
},
{
"id": 374,
"logprob": -0.25268555,
"special": false,
"text": " is"
},
{
"id": 264,
"logprob": -0.09753418,
"special": false,
"text": " a"
},
{
"id": 955,
"logprob": -0.92529297,
"special": false,
"text": " type"
},
{
"id": 315,
"logprob": -0.00027942657,
"special": false,
"text": " of"
},
{
"id": 21075,
"logprob": -0.5527344,
"special": false,
"text": " artificial"
},
{
"id": 11478,
"logprob": -0.042541504,
"special": false,
"text": " intelligence"
},
{
"id": 320,
"logprob": -0.3840332,
"special": false,
"text": " ("
},
{
"id": 15836,
"logprob": -0.0011053085,
"special": false,
"text": "AI"
}
],
"top_tokens": null
},
"generated_text": " Deep learning is a type of artificial intelligence (AI"
}
]

View File

@ -0,0 +1,104 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.24707031,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.029907227,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.828125,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.00049209595,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.057373047,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.000207901,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.15429688,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
}

View File

@ -0,0 +1,99 @@
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
}
],
"seed": 0,
"tokens": [
{
"id": 235336,
"logprob": 0.0,
"special": false,
"text": "?"
},
{
"id": 109,
"logprob": 0.0,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": 0.0,
"special": false,
"text": "Deep"
},
{
"id": 14715,
"logprob": -0.38671875,
"special": false,
"text": " Learning"
},
{
"id": 603,
"logprob": 0.0,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": 0.0,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.12695312,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": 0.0,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": 0.0,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": 0.0,
"special": false,
"text": " learning"
}
],
"top_tokens": null
},
"generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}

View File

@ -0,0 +1,418 @@
[
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.69140625,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.32226562,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.33203125,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.24707031,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.03857422,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.828125,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.00051498413,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05883789,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020694733,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.15820312,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
},
{
"details": {
"best_of_sequences": null,
"finish_reason": "length",
"generated_tokens": 10,
"prefill": [
{
"id": 2,
"logprob": null,
"text": "<bos>"
},
{
"id": 1841,
"logprob": -5.46875,
"text": "What"
},
{
"id": 603,
"logprob": -0.71484375,
"text": " is"
},
{
"id": 5271,
"logprob": -12.0,
"text": " deep"
},
{
"id": 6044,
"logprob": -0.30859375,
"text": " learning"
},
{
"id": 235336,
"logprob": -0.3359375,
"text": "?"
}
],
"seed": null,
"tokens": [
{
"id": 109,
"logprob": -0.23828125,
"special": false,
"text": "\n\n"
},
{
"id": 26843,
"logprob": -0.14550781,
"special": false,
"text": "Deep"
},
{
"id": 6044,
"logprob": -0.038330078,
"special": false,
"text": " learning"
},
{
"id": 603,
"logprob": -0.030883789,
"special": false,
"text": " is"
},
{
"id": 476,
"logprob": -0.020996094,
"special": false,
"text": " a"
},
{
"id": 38397,
"logprob": -0.80859375,
"special": false,
"text": " subset"
},
{
"id": 576,
"logprob": -0.0005455017,
"special": false,
"text": " of"
},
{
"id": 6479,
"logprob": -0.05908203,
"special": false,
"text": " machine"
},
{
"id": 6044,
"logprob": -0.00020599365,
"special": false,
"text": " learning"
},
{
"id": 674,
"logprob": -0.17285156,
"special": false,
"text": " that"
}
],
"top_tokens": null
},
"generated_text": "\n\nDeep learning is a subset of machine learning that"
}
]

View File

@ -0,0 +1,86 @@
import pytest
@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
with launcher(
"neuralmagic/Llama-3.2-1B-Instruct-FP8",
num_shard=2,
quantize="compressed-tensors",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
await compressed_tensors_w8an_handle.health(300)
return compressed_tensors_w8an_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
response = await compressed_tensors_w8an.generate(
"What is deep learning?",
max_new_tokens=10,
decoder_input_details=True,
)
assert (
response.generated_text
== " Deep learning is a type of artificial intelligence (AI"
)
assert response.details.generated_tokens == 10
assert response == response_snapshot
@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
compressed_tensors_w8an, response_snapshot
):
response = await compressed_tensors_w8an.generate(
"What is deep learning",
max_new_tokens=10,
repetition_penalty=1.2,
return_full_text=True,
stop_sequences=["test"],
temperature=0.5,
top_p=0.9,
top_k=10,
truncate=5,
typical_p=0.9,
watermark=True,
decoder_input_details=True,
seed=0,
)
assert response.details.generated_tokens == 10
assert (
response.generated_text
== "What is deep learning?\nDeep learning, also known as neural network or"
)
assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
compressed_tensors_w8an, generate_load, response_snapshot
):
responses = await generate_load(
compressed_tensors_w8an,
"What is deep learning?",
max_new_tokens=10,
n=4,
)
assert (
responses[0].generated_text
== " Deep learning is a type of artificial intelligence (AI"
)
assert len(responses) == 4
assert all([r.generated_text == responses[0].generated_text for r in responses])
assert responses == response_snapshot

View File

@ -0,0 +1,86 @@
import pytest
@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
with launcher(
"neuralmagic/gemma-2-2b-it-quantized.w4a16",
num_shard=2,
quantize="compressed-tensors",
) as handle:
yield handle
@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
await compressed_tensors_wna16_handle.health(300)
return compressed_tensors_wna16_handle.client
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
response = await compressed_tensors_wna16.generate(
"What is deep learning?",
max_new_tokens=10,
decoder_input_details=True,
)
assert (
response.generated_text
== "\n\nDeep learning is a subset of machine learning that"
)
assert response.details.generated_tokens == 10
assert response == response_snapshot
@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
compressed_tensors_wna16, response_snapshot
):
response = await compressed_tensors_wna16.generate(
"What is deep learning",
max_new_tokens=10,
repetition_penalty=1.2,
return_full_text=True,
stop_sequences=["test"],
temperature=0.5,
top_p=0.9,
top_k=10,
truncate=5,
typical_p=0.9,
watermark=True,
decoder_input_details=True,
seed=0,
)
assert response.details.generated_tokens == 10
assert (
response.generated_text
== "What is deep learning?\n\nDeep Learning is a subset of machine learning"
)
assert response == response_snapshot
@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
compressed_tensors_wna16, generate_load, response_snapshot
):
responses = await generate_load(
compressed_tensors_wna16,
"What is deep learning?",
max_new_tokens=10,
n=4,
)
assert (
responses[0].generated_text
== "\n\nDeep learning is a subset of machine learning that"
)
assert len(responses) == 4
assert all([r.generated_text == responses[0].generated_text for r in responses])
assert responses == response_snapshot

View File

@ -212,6 +212,8 @@ enum Quantization {
/// <https://hf.co/models?search=awq>.
/// Should replace GPTQ models wherever possible because of the better latency
Awq,
/// Compressed tensors, which can be a mixture of different quantization methods.
CompressedTensors,
/// 8 bit quantization, doesn't require specific model.
/// Should be a drop-in replacement to bitsandbytes with much better performance.
/// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
Quantization::Awq => {
write!(f, "awq")
}
Quantization::CompressedTensors => {
write!(f, "compressed-tensors")
}
Quantization::Eetq => {
write!(f, "eetq")
}

View File

@ -5,6 +5,7 @@
mypy-protobuf,
awq-inference-engine,
causal-conv1d,
compressed-tensors,
eetq,
einops,
exllamav2,
@ -74,6 +75,7 @@ buildPythonPackage {
awq-inference-engine
eetq
causal-conv1d
compressed-tensors
einops
exllamav2
flashinfer

View File

@ -23,7 +23,7 @@ gen-server:
install-server: gen-server
pip install pip --upgrade
pip install -r requirements_cuda.txt
pip install -e ".[accelerate, quantize, peft, outlines]"
pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"
install: install-cuda

server/poetry.lock (generated, 24 changes)
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]]
name = "accelerate"
@ -388,6 +388,26 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "compressed-tensors"
version = "0.7.1"
description = "Library for utilization of compressed safetensors of neural network models"
optional = true
python-versions = "*"
files = [
{file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
{file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
]
[package.dependencies]
pydantic = ">=2.0"
torch = ">=1.7.0"
transformers = "*"
[package.extras]
accelerate = ["accelerate"]
dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
[[package]]
name = "datasets"
version = "2.21.0"
@ -3982,4 +4002,4 @@ torch = ["torch"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754"

View File

@ -37,6 +37,7 @@ pillow = "^10.0.0"
outlines= { version = "^0.0.34", optional = true }
prometheus-client = "^0.20.0"
py-cpuinfo = "^9.0.0"
compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines.
numpy = "^1.26"
@ -58,6 +59,7 @@ rich = "^13.7.1"
torch = ["torch"]
accelerate = ["accelerate"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]

View File

@ -19,6 +19,7 @@ class Quantization(str, Enum):
bitsandbytes_fp4 = "bitsandbytes-fp4"
gptq = "gptq"
awq = "awq"
compressed_tensors = "compressed-tensors"
eetq = "eetq"
exl2 = "exl2"
fp8 = "fp8"

View File

@ -0,0 +1,3 @@
from .loader import CompressedTensorsLoader
__all__ = ["CompressedTensorsLoader"]

View File

@ -0,0 +1,174 @@
from typing import Any, Dict, List, Union
from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
QuantizationScheme,
QuantizationType,
find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn
from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
DefaultWeightsLoader,
UnquantizedWeight,
Weights,
WeightsLoader,
)
# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)
class CompressedTensorsLoader(WeightsLoader):
"""Loader for checkpoints stored in the compressed-tensors format."""
def __init__(self, config: Dict[str, Any]):
quantization_config_raw = config.get("quantization_config")
if quantization_config_raw is None:
# `compression_config` was renamed to `quantization_config`; support
# retained for backward compatibility.
quantization_config_raw = config.get("compression_config")
if quantization_config_raw is None:
raise ValueError(
"Checkpoint does not have compressed-tensors configuration"
)
try:
quantization_config = QuantizationConfig.model_validate(
quantization_config_raw
)
except ValidationError as e:
raise ValueError("Cannot parse compressed-tensors configuration") from e
if quantization_config.quantization_status not in (
QuantizationStatus.COMPRESSED,
QuantizationStatus.FROZEN,
):
raise ValueError(
f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
)
self.ignore = (
quantization_config.ignore if quantization_config.ignore is not None else []
)
self.loaders = self._get_target_loaders(quantization_config)
for target, loader in self.loaders.items():
log_once(
logger.info,
f"Using {loader} for compressed-tensors target '{target}'",
)
def get_weights(self, weights: Weights, prefix: str):
loader = self._lookup_loader(prefix)
return loader.get_weights(weights, prefix)
def get_weights_col_packed(
self,
weights: "Weights",
prefix: str,
block_sizes: Union[int, List[int]],
):
loader = self._lookup_loader(prefix)
return loader.get_weights_col_packed(weights, prefix, block_sizes)
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
loader = self._lookup_loader(prefixes[0])
return loader.get_multi_weights_col(weights, prefixes, dim)
def get_weights_row(self, weights: Weights, prefix: str):
loader = self._lookup_loader(prefix)
return loader.get_weights_row(weights, prefix)
def _get_target_loaders(
self, quantization_config: QuantizationConfig
) -> Dict[str, WeightsLoader]:
"""
A compressed-tensors checkpoint can use different quantizations
for different targets. This method returns a dictionary with a
loader per target.
"""
loaders: Dict[str, WeightsLoader] = {}
format = quantization_config.format
for group_name, group in quantization_config.config_groups.items():
# The group configuration can be a string, but does that ever
# happen in a serialized quantization config?
assert isinstance(group, QuantizationScheme)
loader = self._create_loader_for_group(format, group_name, group)
# A quantized parameter group can have multiple targets, add the
# loader for all the targets.
for target in group.targets:
if target in loaders:
raise ValueError(
f"Target '{target} has multiple configured loaders'"
)
loaders[target] = loader
return loaders
def _create_loader_for_group(
self, format: str, group_name: str, group: QuantizationScheme
) -> WeightsLoader:
"""
Find and create a loader for the group with the given quantization
scheme.
"""
# NOTE: we ignore group.output_activations because we don't support
# output quantization yet.
input_activations = group.input_activations
weights = group.weights
if (
format
in {
CompressionFormat.float_quantized.value,
CompressionFormat.naive_quantized.value,
}
and weights is not None
and weights.type == QuantizationType.FLOAT
and weights.num_bits == 8
):
# FP W8A8 or W8A16.
return W8ANFpLoader(input_activations=input_activations, weights=weights)
elif (
format == CompressionFormat.pack_quantized.value
and weights is not None
and weights.type == QuantizationType.INT
and weights.num_bits in (4, 8)
):
# INT W4A16 or W8A16 (GPTQ/AWQ-like).
return WNA16Loader(weights)
else:
raise ValueError(
f"Group '{group_name}' has unsupported compressed-tensors configurtion"
)
def _lookup_loader(self, prefix: str) -> WeightsLoader:
"""
Look up the loader to use for a given parameter name (prefix).
"""
if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
return DefaultWeightsLoader(UnquantizedWeight)
# We currently only handle linear layers, so unconditionally pass
# a `Linear` instance.
targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
if len(targets) == 0:
raise ValueError(
f"Cannot find compressed-tensors target for prefix: {prefix}"
)
return self.loaders[targets[0]]
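
A minimal sketch of the target matching that _lookup_loader relies on,
using the same find_name_or_class_matches helper as above. The prefixes,
targets, and ignore list here are hypothetical:

# Hypothetical example of compressed-tensors target matching.
from torch import nn
from compressed_tensors.quantization import find_name_or_class_matches

_EMPTY_LINEAR = nn.Linear(0, 0)  # stand-in module; only Linear layers are matched

targets = ["Linear"]  # a config group that targets all linear layers
ignore = ["lm_head"]  # excluded from quantization

for prefix in ["model.layers.0.self_attn.q_proj", "lm_head"]:
    if find_name_or_class_matches(prefix, _EMPTY_LINEAR, ignore):
        print(prefix, "-> unquantized (ignored)")
    else:
        matches = find_name_or_class_matches(prefix, _EMPTY_LINEAR, targets)
        print(prefix, "-> loader for target", matches[0])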

View File

@ -0,0 +1,174 @@
from typing import List, Optional, Union
import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType
from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
from text_generation_server.utils.weights import Weights, WeightsLoader
class W8ANFpLoader(WeightsLoader):
"""
Loader for W8A8/W8A16 FP compressed-tensors parameters.
"""
def __init__(
self,
*,
input_activations: Optional[QuantizationArgs],
weights: QuantizationArgs,
):
assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8
# We ignore the `strategy` option which sets the scales to be
# per-tensor, per-channel or per-token. What scales are supported
# is dependent on the kernels used (e.g. cutlass can do tokenwise,
# Torch cannot, and FP8-Marlin does not quantize inputs at all).
# So, instead we try to use the best-possible configuration.
self.load_weight_scale = not weights.dynamic
self.load_input_scale = (
input_activations is not None and not input_activations.dynamic
)
self.force_w8a16 = (
input_activations is not None and input_activations.num_bits == 16
)
def __str__(self) -> str:
def scale_to_str(scale):
return "static" if scale else "dynamic"
quantization_type = f"W8A{16 if self.force_w8a16 else 8}"
return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"
def get_weights(self, weights: "Weights", prefix: str):
w = weights.get_tensor(f"{prefix}.weight")
weight_scale = None
if self.load_weight_scale:
weight_scale = (
weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
.reshape(-1)
.expand(w.shape[0])
)
input_scale = None
if self.load_input_scale:
input_scale = weights.get_tensor(
f"{prefix}.input_scale", to_dtype=False
).reshape(-1)
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
def get_weights_col_packed(
self,
weights: Weights,
prefix: str,
block_sizes: Union[int, List[int]],
):
w = weights.get_packed_sharded(
f"{prefix}.weight", dim=0, block_sizes=block_sizes
)
weight_scale = None
if self.load_weight_scale:
weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
if weight_scale.numel() > 1:
weight_scale = weights.get_packed_sharded(
f"{prefix}.weight_scale",
dim=0,
block_sizes=block_sizes,
to_dtype=False,
)
weight_scale = weight_scale.reshape(-1).expand(w.shape[0])
input_scale = None
if self.load_input_scale:
input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
if input_scale.numel() > 1:
input_scale = weights.get_packed_sharded(
f"{prefix}.input_scale",
dim=0,
block_sizes=block_sizes,
to_dtype=False,
)
input_scale = input_scale.reshape(-1).max()
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
# FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
w = [
weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
]
shapes = [x.shape for x in w]
# Concat then send to the device
w = torch.cat(w, dim=dim).to(weights.device)
weight_scale = None
if self.load_weight_scale:
weight_scale = [
_load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
for p, shape in zip(prefixes, shapes)
]
weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)
input_scale = None
if self.load_input_scale:
input_scale = [
_load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
for p, shape in zip(prefixes, shapes)
if weights.has_tensor(f"{p}.input_scale")
]
assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
input_scale = (
torch.cat(input_scale, dim=0).reshape(-1).max()
if len(input_scale) != 0
else None
)
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
def get_weights_row(self, weights: "Weights", prefix: str):
w = weights.get_sharded(f"{prefix}.weight", dim=1)
weight_scale = None
if self.load_weight_scale:
weight_scale = (
weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
.reshape(-1)
.expand(w.shape[0])
)
input_scale = None
if self.load_input_scale:
input_scale = weights.get_tensor(
f"{prefix}.input_scale", to_dtype=False
).reshape(-1)
return Fp8Weight(
weight=w,
weight_scale=weight_scale,
input_scale=input_scale,
dtype=weights.dtype,
force_w8a16=self.force_w8a16,
)
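
As a sketch of how the loader's flags fall out of a config group, here is a
hypothetical FP8 group with static weight scales and dynamic input scales
(all values illustrative):

# Hypothetical QuantizationArgs mirroring W8ANFpLoader.__init__ above.
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

weights = QuantizationArgs(
    num_bits=8, type=QuantizationType.FLOAT, strategy="tensor", dynamic=False
)
input_activations = QuantizationArgs(
    num_bits=8, type=QuantizationType.FLOAT, strategy="token", dynamic=True
)

load_weight_scale = not weights.dynamic
# True: the checkpoint stores weight_scale tensors.
load_input_scale = input_activations is not None and not input_activations.dynamic
# False: input scales are computed at runtime instead of loaded.
force_w8a16 = input_activations is not None and input_activations.num_bits == 16
# False: 8-bit input activations, so W8A8 kernels can be used.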

View File

@ -0,0 +1,188 @@
from typing import List, Union
import torch
from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
from loguru import logger
from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weights, WeightsLoader
class WNA16Loader(WeightsLoader):
"""
Loader for W4A16/W8A16 INT compressed-tensors parameters.
"""
def __init__(self, weights: QuantizationArgs):
self.weights = weights
self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
self.groupsize = (
-1 if self.weights.group_size is None else self.weights.group_size
)
def __str__(self) -> str:
quantization_type = f"W{self.weights.num_bits}8A16"
return f"{self.__class__.__name__} ({quantization_type})"
def get_weights(self, weights: Weights, prefix: str):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
except RuntimeError:
raise RuntimeError(
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
)
zero_point = None
if not self.weights.symmetric:
zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
g_idx = None
if self.desc_act:
g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")
scales = weights.get_tensor(f"{prefix}.weight.scales").t()
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=False,
)
def get_weights_col_packed(
self,
weights: Weights,
prefix: str,
block_sizes: Union[int, List[int]],
):
try:
weight_packed = weights.get_packed_sharded(
f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
).t()
except RuntimeError:
raise RuntimeError(
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
)
scales = weights.get_packed_sharded(
f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
).t()
scales = scales.to(dtype=weights.dtype)
zero_point = None
if not self.weights.symmetric:
zero_point = weights.get_packed_sharded(
f"{prefix}.qzeros", dim=0, block_sizes=block_sizes
).t()
g_idx = None
if self.desc_act:
g_idx = weights.get_tensor(f"{prefix}.g_idx")
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=False,
)
def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
try:
weight_packed = torch.cat(
[
weights.get_sharded(f"{p}.weight_packed", dim=0).t()
for p in prefixes
],
dim=1,
)
except RuntimeError:
raise RuntimeError(
f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
)
scales = torch.cat(
[weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
dim=1,
)
zero_point = None
if not self.weights.symmetric:
zero_point = torch.cat(
[weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1
).t()
g_idx = None
if self.desc_act:
w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
for w2 in w[1:]:
torch.testing.assert_close(w2, w[0])
g_idx = w[0]
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=False,
)
def get_weights_row(self, weights: Weights, prefix: str):
log_once(logger.info, "Using GPTQ-Marlin kernels")
try:
weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
except RuntimeError:
raise RuntimeError(
f"Cannot load `{self.quantize}` weight, make sure the model is already quantized."
)
zero_point = None
if not self.weights.symmetric:
if self.desc_act or self.groupsize == -1:
zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
else:
zero_point = weights.get_sharded(
f"{prefix}.weight_zero_point", dim=1
).t()
g_idx = None
if self.desc_act:
g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)
if self.desc_act or self.groupsize == -1:
scales = weights.get_tensor(f"{prefix}.weight_scale").t()
else:
scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()
sharded_in_features = weights.process_group.size() > 1
return repack_gptq_for_marlin(
qweight=weight_packed.contiguous(),
scales=scales,
qzeros=zero_point,
g_idx=g_idx,
bits=self.weights.num_bits,
desc_act=self.desc_act,
groupsize=self.groupsize,
quant_method="compressed-tensors",
sym=self.weights.symmetric,
sharded_infeatures=sharded_in_features,
)
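
For reference, a hypothetical config group that _create_loader_for_group
would route to this loader (format pack-quantized, INT weights, no input
activation quantization); the values are assumptions, not from a specific
checkpoint:

# Hypothetical W4A16 group for the pack-quantized INT path.
group = {
    "targets": ["Linear"],
    "weights": {
        "type": "int",
        "num_bits": 4,          # 4 or 8 bits are accepted
        "group_size": 128,      # groupsize passed to the Marlin repack
        "symmetric": True,      # asymmetric checkpoints also load zero points
        "strategy": "group",
    },
    # No "input_activations": activations stay in the model dtype (A16).
}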

View File

@ -29,7 +29,7 @@ else:
CUTLASS_FP8_AVAILABLE = False
def get_fp8_linear() -> Type[torch.nn.Module]:
def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
"""
Return an FP8 linear `Module` that is compatible with the current system.
"""
@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
if SYSTEM == "cuda":
major, _ = torch.cuda.get_device_capability()
if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
# Marlin is W8A16, use it when:
#
# - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
# - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
# - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
"USE_CUTLASS_W8A8", "0"
) != "1":
# NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
# gives better decoding throughput on L4 and L40.
from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
@ -283,14 +290,17 @@ class Fp8Weight(Weight):
weight_scale: Optional[torch.Tensor] = None
input_scale: Optional[torch.Tensor] = None
activation_scale_ub: Optional[float] = None
force_w8a16: bool = False
def get_linear(self, bias: torch.Tensor):
if self.weight_scale is None:
return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
self.weight, bias, self.dtype
)
# This is not checked by the fbgemm kernels, but they require contiguous
# memory. Can be non-contiguous when we e.g. expand from scalars.
self.weight_scale = self.weight_scale.contiguous()
return get_fp8_linear().from_fp8(
return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
weight=self.weight,
scale=self.weight_scale,
dtype=self.dtype,
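
Restated compactly, the kernel choice above (helper name hypothetical,
capability values illustrative, assuming USE_CUTLASS_W8A8 is unset):

# Sketch of the dispatch: Marlin (W8A16) vs cutlass/Torch (W8A8).
import os

def use_marlin_fp8(major: int, force_w8a16: bool) -> bool:
    # 8.x: W8A8 FP8 GEMM unsupported or slower than Marlin;
    # 9.x only when W8A16 is forced, since cutlass has no W8A16 support.
    return (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
        "USE_CUTLASS_W8A8", "0"
    ) != "1"

assert use_marlin_fp8(8, False)      # Ampere/Ada: Marlin
assert not use_marlin_fp8(9, False)  # Hopper W8A8: cutlass
assert use_marlin_fp8(9, True)       # Hopper W8A16: Marlin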

View File

@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):
def __post_init__(self):
assert self.qweight.dtype == torch.int32
assert self.scales.dtype == torch.float16
assert self.scales.dtype in (torch.float16, torch.bfloat16)
assert self.g_idx.dtype == torch.int32
assert self.perm.dtype == torch.int32
@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
raise RuntimeError(
f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
)
if not (sym or quant_method == "awq"):
if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
raise RuntimeError(
"Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
)

View File

@ -370,46 +370,23 @@ def get_model(
compression_config = config_dict.get("compression_config", None)
if quantization_config is not None and quantize is None:
method = quantization_config.get("quant_method", None)
config_groups = quantization_config.get("config_groups", None)
if method in {"gptq", "awq", "exl2"}:
log_master(logger.info, f"Auto selecting quantization method {method}")
quantize = method
elif method == "fbgemm_fp8" or method == "fp8":
log_master(logger.info, "Auto selecting quantization method fp8")
quantize = "fp8"
elif config_groups is not None:
# TODO: at some point we should probably fully parse the compression
# configuration to know which parameters are compressed.
for _, group in config_groups.items():
weights_config = group.get("weights")
if weights_config is not None:
if (
weights_config["type"] == "float"
and weights_config["num_bits"] == 8
):
if method == "compressed-tensors":
log_master(
logger.info, "Auto selecting quantization method fp8"
logger.info, "Auto selecting quantization method compressed-tensors"
)
quantize = "fp8"
break
quantize = "compressed-tensors"
else:
log_master(logger.warning, f"Unknown quantization method {method}")
elif compression_config is not None:
# `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
config_groups = compression_config.get("config_groups")
if config_groups is not None:
for _, group in config_groups.items():
weights_config = group.get("weights")
if weights_config is not None:
if (
weights_config["type"] == "float"
and weights_config["num_bits"] == 8
):
log_master(
logger.info, "Auto selecting quantization method fp8"
)
quantize = "fp8"
break
log_master(logger.info, "Auto selecting quantization method compressed-tensors")
quantize = "compressed-tensors"
if dtype is None:
if quantize in ["awq", "exl2", "gptq", "marlin"]:

View File

@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
activation_scale_ub: float
# We should probably do this with Pytantic JSON deserialization,
def _get_config_json(model_id: str, revision: Optional[str], filename: str):
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(model_id, filename=filename, revision=revision)
with open(filename, "r") as f:
return json.load(f)
# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
bits = 4
@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):
filename = "config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(model_id, filename=filename, revision=revision)
with open(filename, "r") as f:
data = json.load(f)
data = _get_config_json(model_id, revision, filename)
# FP8 config
if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
except Exception:
filename = "quantize_config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(
model_id, filename=filename, revision=revision
)
with open(filename, "r") as f:
data = json.load(f)
data = _get_config_json(model_id, revision, filename)
bits = data["bits"]
groupsize = data["group_size"]
@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
except Exception:
filename = "quant_config.json"
try:
if os.path.exists(os.path.join(model_id, filename)):
filename = os.path.join(model_id, filename)
else:
filename = hf_hub_download(
model_id, filename=filename, revision=revision
)
with open(filename, "r") as f:
data = json.load(f)
data = _get_config_json(model_id, revision, filename)
bits = data["w_bit"]
groupsize = data["q_group_size"]
desc_act = data["desc_act"]
@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
def get_loader(
quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
if quantize == "compressed-tensors":
config = _get_config_json(model_id, revision, "config.json")
from text_generation_server.layers.compressed_tensors import (
CompressedTensorsLoader,
)
return CompressedTensorsLoader(config)
quantizer_config = _get_quantizer_config(model_id, revision)
if quantize in {"awq", "gptq"}:
from text_generation_server.layers.gptq import GPTQWeightsLoader
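
A sketch of the resulting entry point (the model id is borrowed from the
integration tests above; illustrative only):

# Obtain the compressed-tensors weights loader for a quantized checkpoint.
from text_generation_server.utils.quantization import get_loader

loader = get_loader(
    quantize="compressed-tensors",
    model_id="neuralmagic/Llama-3.2-1B-Instruct-FP8",
    revision=None,
)
# `loader` is a CompressedTensorsLoader that dispatches to a per-target
# loader (for this FP8 checkpoint, the W8AN FP loader) as weights load.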