Add initial support for compressed-tensors checkpoints (#2732)
compressed-tensors is a safetensors extension for sparse, quantized tensors. The format is more powerful than earlier AWQ/GPTQ/FP8 quantization because:

- Different quantizer configurations can be used for different targets.
- The format can specify input/output quantizers in addition to weight quantizers.
- Exclusions from quantization are configurable.

This change adds a dependency on the `compressed-tensors` package for its configuration parsing and layer matching functionality.

The following types of quantization are supported in this PR:

- W8A16 and W4A16 INT using GPTQ-Marlin kernels.
- W8A8 and W8A16 FP using FP8-Marlin and cutlass kernels.

Support for other quantization types will be added in subsequent PRs.
This commit is contained in:
parent 97f7a22f0b
commit a785000842
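The three properties above map directly onto the checkpoint's `quantization_config` section in `config.json`. The sketch below is illustrative only (the field values are hypothetical, not taken from a real checkpoint) and shows a per-target quantizer group, separate weight and input-activation quantizers, and an `ignore` exclusion list:

{
  "quantization_config": {
    "quant_method": "compressed-tensors",
    "format": "float-quantized",
    "quantization_status": "frozen",
    "ignore": ["lm_head"],
    "config_groups": {
      "group_0": {
        "targets": ["Linear"],
        "weights": { "type": "float", "num_bits": 8, "symmetric": true, "dynamic": false },
        "input_activations": { "type": "float", "num_bits": 8, "dynamic": false }
      }
    }
  }
}

A checkpoint shaped like this would take the FP W8A8 path added in this PR; the `CompressedTensorsLoader` introduced below parses this structure with the `compressed-tensors` package.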
@@ -247,7 +247,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_cuda.txt && \
-    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
+    pip install ".[bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
     pip install nvidia-nccl-cu12==2.22.3

 ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
@@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_rocm.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 # Install benchmarker
 COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -102,7 +102,7 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
     pip install -r requirements_intel.txt && \
-    pip install ".[accelerate, peft, outlines]" --no-cache-dir
+    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
@@ -63,6 +63,7 @@ Options:

   Possible values:
   - awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
+  - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
   - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
   - exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
   - gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
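As a usage sketch (the model below is one of the checkpoints exercised by the integration tests in this PR), the new value is passed like any other quantization method; the quantizer details are then read from the checkpoint's own configuration rather than from extra flags:

text-generation-launcher --model-id neuralmagic/Llama-3.2-1B-Instruct-FP8 --quantize compressed-tensors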
@@ -978,15 +978,16 @@
       "nixpkgs": "nixpkgs_6"
     },
     "locked": {
-      "lastModified": 1730724647,
-      "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
+      "lastModified": 1730795478,
+      "narHash": "sha256-xpkXDKnkhXO4F6Ea3reHmqwXXRzQe2PsxdRQFPCViWs=",
       "owner": "huggingface",
       "repo": "text-generation-inference-nix",
-      "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
+      "rev": "b7f6c07867d94d6e55f5352573a6b3dad1c88e56",
       "type": "github"
     },
     "original": {
       "owner": "huggingface",
+      "ref": "compressed-tensors-0.7.1",
       "repo": "text-generation-inference-nix",
       "type": "github"
     }
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/compressed-tensors-0.7.1";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
@@ -0,0 +1,104 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
      { "id": 3923, "logprob": -7.609375, "text": "What" },
      { "id": 374, "logprob": -0.92529297, "text": " is" },
      { "id": 5655, "logprob": -10.0, "text": " deep" },
      { "id": 6975, "logprob": -0.94628906, "text": " learning" },
      { "id": 30, "logprob": -2.9042969, "text": "?" }
    ],
    "seed": null,
    "tokens": [
      { "id": 18682, "logprob": -0.8769531, "special": false, "text": " Deep" },
      { "id": 6975, "logprob": -0.0076942444, "special": false, "text": " learning" },
      { "id": 374, "logprob": -0.25073242, "special": false, "text": " is" },
      { "id": 264, "logprob": -0.097595215, "special": false, "text": " a" },
      { "id": 955, "logprob": -0.921875, "special": false, "text": " type" },
      { "id": 315, "logprob": -0.00027918816, "special": false, "text": " of" },
      { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
      { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
      { "id": 320, "logprob": -0.38891602, "special": false, "text": " (" },
      { "id": 15836, "logprob": -0.0011043549, "special": false, "text": "AI" }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a type of artificial intelligence (AI"
}
@@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
      { "id": 3923, "logprob": -7.609375, "text": "What" },
      { "id": 374, "logprob": -0.92529297, "text": " is" },
      { "id": 5655, "logprob": -10.0, "text": " deep" },
      { "id": 6975, "logprob": -0.94628906, "text": " learning" }
    ],
    "seed": 0,
    "tokens": [
      { "id": 5380, "logprob": -0.23840332, "special": false, "text": "?\n" },
      { "id": 34564, "logprob": 0.0, "special": false, "text": "Deep" },
      { "id": 6975, "logprob": 0.0, "special": false, "text": " learning" },
      { "id": 11, "logprob": 0.0, "special": false, "text": "," },
      { "id": 1101, "logprob": -1.2011719, "special": false, "text": " also" },
      { "id": 3967, "logprob": 0.0, "special": false, "text": " known" },
      { "id": 439, "logprob": 0.0, "special": false, "text": " as" },
      { "id": 30828, "logprob": 0.0, "special": false, "text": " neural" },
      { "id": 4009, "logprob": -0.6777344, "special": false, "text": " network" },
      { "id": 477, "logprob": 0.0, "special": false, "text": " or" }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}
@@ -0,0 +1,418 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.609375, "text": "What" },
        { "id": 374, "logprob": -0.92529297, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94628906, "text": " learning" },
        { "id": 30, "logprob": -2.9042969, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.8769531, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.0076942444, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25146484, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.097595215, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.9248047, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027513504, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.043151855, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011043549, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.6054688, "text": "What" },
        { "id": 374, "logprob": -0.92089844, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94433594, "text": " learning" },
        { "id": 30, "logprob": -2.90625, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.875, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.007698059, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25268555, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.09753418, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.92529297, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027942657, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011053085, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.6054688, "text": "What" },
        { "id": 374, "logprob": -0.92089844, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94433594, "text": " learning" },
        { "id": 30, "logprob": -2.90625, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.875, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.007698059, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25268555, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.09753418, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.92529297, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027942657, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011053085, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 128000, "logprob": null, "text": "<|begin_of_text|>" },
        { "id": 3923, "logprob": -7.6054688, "text": "What" },
        { "id": 374, "logprob": -0.92089844, "text": " is" },
        { "id": 5655, "logprob": -10.0, "text": " deep" },
        { "id": 6975, "logprob": -0.94433594, "text": " learning" },
        { "id": 30, "logprob": -2.90625, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 18682, "logprob": -0.875, "special": false, "text": " Deep" },
        { "id": 6975, "logprob": -0.007698059, "special": false, "text": " learning" },
        { "id": 374, "logprob": -0.25268555, "special": false, "text": " is" },
        { "id": 264, "logprob": -0.09753418, "special": false, "text": " a" },
        { "id": 955, "logprob": -0.92529297, "special": false, "text": " type" },
        { "id": 315, "logprob": -0.00027942657, "special": false, "text": " of" },
        { "id": 21075, "logprob": -0.5527344, "special": false, "text": " artificial" },
        { "id": 11478, "logprob": -0.042541504, "special": false, "text": " intelligence" },
        { "id": 320, "logprob": -0.3840332, "special": false, "text": " (" },
        { "id": 15836, "logprob": -0.0011053085, "special": false, "text": "AI" }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  }
]
@@ -0,0 +1,104 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 2, "logprob": null, "text": "<bos>" },
      { "id": 1841, "logprob": -5.46875, "text": "What" },
      { "id": 603, "logprob": -0.69140625, "text": " is" },
      { "id": 5271, "logprob": -12.0, "text": " deep" },
      { "id": 6044, "logprob": -0.32226562, "text": " learning" },
      { "id": 235336, "logprob": -0.33203125, "text": "?" }
    ],
    "seed": null,
    "tokens": [
      { "id": 109, "logprob": -0.24707031, "special": false, "text": "\n\n" },
      { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
      { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
      { "id": 603, "logprob": -0.029907227, "special": false, "text": " is" },
      { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
      { "id": 38397, "logprob": -0.828125, "special": false, "text": " subset" },
      { "id": 576, "logprob": -0.00049209595, "special": false, "text": " of" },
      { "id": 6479, "logprob": -0.057373047, "special": false, "text": " machine" },
      { "id": 6044, "logprob": -0.000207901, "special": false, "text": " learning" },
      { "id": 674, "logprob": -0.15429688, "special": false, "text": " that" }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning that"
}
@@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      { "id": 2, "logprob": null, "text": "<bos>" },
      { "id": 1841, "logprob": -5.46875, "text": "What" },
      { "id": 603, "logprob": -0.69140625, "text": " is" },
      { "id": 5271, "logprob": -12.0, "text": " deep" },
      { "id": 6044, "logprob": -0.32226562, "text": " learning" }
    ],
    "seed": 0,
    "tokens": [
      { "id": 235336, "logprob": 0.0, "special": false, "text": "?" },
      { "id": 109, "logprob": 0.0, "special": false, "text": "\n\n" },
      { "id": 26843, "logprob": 0.0, "special": false, "text": "Deep" },
      { "id": 14715, "logprob": -0.38671875, "special": false, "text": " Learning" },
      { "id": 603, "logprob": 0.0, "special": false, "text": " is" },
      { "id": 476, "logprob": 0.0, "special": false, "text": " a" },
      { "id": 38397, "logprob": -0.12695312, "special": false, "text": " subset" },
      { "id": 576, "logprob": 0.0, "special": false, "text": " of" },
      { "id": 6479, "logprob": 0.0, "special": false, "text": " machine" },
      { "id": 6044, "logprob": 0.0, "special": false, "text": " learning" }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}
@@ -0,0 +1,418 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.69140625, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.32226562, "text": " learning" },
        { "id": 235336, "logprob": -0.33203125, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.24707031, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.03857422, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.828125, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.00051498413, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05883789, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020694733, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.15820312, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.71484375, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.30859375, "text": " learning" },
        { "id": 235336, "logprob": -0.3359375, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.23828125, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.80859375, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.0005455017, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05908203, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020599365, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.17285156, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.71484375, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.30859375, "text": " learning" },
        { "id": 235336, "logprob": -0.3359375, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.23828125, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.80859375, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.0005455017, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05908203, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020599365, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.17285156, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        { "id": 2, "logprob": null, "text": "<bos>" },
        { "id": 1841, "logprob": -5.46875, "text": "What" },
        { "id": 603, "logprob": -0.71484375, "text": " is" },
        { "id": 5271, "logprob": -12.0, "text": " deep" },
        { "id": 6044, "logprob": -0.30859375, "text": " learning" },
        { "id": 235336, "logprob": -0.3359375, "text": "?" }
      ],
      "seed": null,
      "tokens": [
        { "id": 109, "logprob": -0.23828125, "special": false, "text": "\n\n" },
        { "id": 26843, "logprob": -0.14550781, "special": false, "text": "Deep" },
        { "id": 6044, "logprob": -0.038330078, "special": false, "text": " learning" },
        { "id": 603, "logprob": -0.030883789, "special": false, "text": " is" },
        { "id": 476, "logprob": -0.020996094, "special": false, "text": " a" },
        { "id": 38397, "logprob": -0.80859375, "special": false, "text": " subset" },
        { "id": 576, "logprob": -0.0005455017, "special": false, "text": " of" },
        { "id": 6479, "logprob": -0.05908203, "special": false, "text": " machine" },
        { "id": 6044, "logprob": -0.00020599365, "special": false, "text": " learning" },
        { "id": 674, "logprob": -0.17285156, "special": false, "text": " that" }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  }
]
@@ -0,0 +1,86 @@
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
    with launcher(
        "neuralmagic/Llama-3.2-1B-Instruct-FP8",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
    await compressed_tensors_w8an_handle.health(300)
    return compressed_tensors_w8an_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
    compressed_tensors_w8an, response_snapshot
):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning, also known as neural network or"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
    compressed_tensors_w8an, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_w8an,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
@@ -0,0 +1,86 @@
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
    with launcher(
        "neuralmagic/gemma-2-2b-it-quantized.w4a16",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
    await compressed_tensors_wna16_handle.health(300)
    return compressed_tensors_wna16_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
    compressed_tensors_wna16, response_snapshot
):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\n\nDeep Learning is a subset of machine learning"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
    compressed_tensors_wna16, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_wna16,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
@@ -212,6 +212,8 @@ enum Quantization {
     /// <https://hf.co/models?search=awq>.
     /// Should replace GPTQ models wherever possible because of the better latency
     Awq,
+    /// Compressed tensors, which can be a mixture of different quantization methods.
+    CompressedTensors,
     /// 8 bit quantization, doesn't require specific model.
     /// Should be a drop-in replacement to bitsandbytes with much better performance.
     /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>

@@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
             Quantization::Awq => {
                 write!(f, "awq")
             }
+            Quantization::CompressedTensors => {
+                write!(f, "compressed-tensors")
+            }
             Quantization::Eetq => {
                 write!(f, "eetq")
             }
@@ -5,6 +5,7 @@
   mypy-protobuf,
   awq-inference-engine,
   causal-conv1d,
+  compressed-tensors,
   eetq,
   einops,
   exllamav2,

@@ -74,6 +75,7 @@ buildPythonPackage {
     awq-inference-engine
     eetq
     causal-conv1d
+    compressed-tensors
     einops
     exllamav2
     flashinfer
@@ -23,7 +23,7 @@ gen-server:
 install-server: gen-server
 	pip install pip --upgrade
 	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, quantize, peft, outlines]"
+	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"


 install: install-cuda
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

 [[package]]
 name = "accelerate"
@@ -388,6 +388,26 @@ files = [
     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
 ]

+[[package]]
+name = "compressed-tensors"
+version = "0.7.1"
+description = "Library for utilization of compressed safetensors of neural network models"
+optional = true
+python-versions = "*"
+files = [
+    {file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
+    {file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
+]
+
+[package.dependencies]
+pydantic = ">=2.0"
+torch = ">=1.7.0"
+transformers = "*"
+
+[package.extras]
+accelerate = ["accelerate"]
+dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]
+
 [[package]]
 name = "datasets"
 version = "2.21.0"
@@ -3982,4 +4002,4 @@ torch = ["torch"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
+content-hash = "4636689efd4c94559c3c23903aafcffd177533a3b9006b3b4f8491b158a3a754"
@@ -37,6 +37,7 @@ pillow = "^10.0.0"
 outlines= { version = "^0.0.34", optional = true }
 prometheus-client = "^0.20.0"
 py-cpuinfo = "^9.0.0"
+compressed-tensors = { version = "^0.7.1", optional = true }
 # Remove later, temporary workaround for outlines.
 numpy = "^1.26"

@@ -58,6 +59,7 @@ rich = "^13.7.1"
 torch = ["torch"]
 accelerate = ["accelerate"]
 bnb = ["bitsandbytes"]
+compressed-tensors = ["compressed-tensors"]
 marlin = ["marlin-kernels"]
 moe = ["moe-kernels"]
 peft = ["peft"]
@@ -19,6 +19,7 @@ class Quantization(str, Enum):
     bitsandbytes_fp4 = "bitsandbytes-fp4"
     gptq = "gptq"
     awq = "awq"
+    compressed_tensors = "compressed-tensors"
     eetq = "eetq"
     exl2 = "exl2"
     fp8 = "fp8"
@@ -0,0 +1,3 @@
from .loader import CompressedTensorsLoader

__all__ = ["CompressedTensorsLoader"]
@@ -0,0 +1,174 @@
from typing import Any, Dict, List, Union

from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
    QuantizationScheme,
    QuantizationType,
    find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn

from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
)

# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)


class CompressedTensorsLoader(WeightsLoader):
    """Loader for checkpoints stored in the compressed-tensors format."""

    def __init__(self, config: Dict[str, Any]):
        quantization_config_raw = config.get("quantization_config")
        if quantization_config_raw is None:
            # `compression_config` was renamed to `quantization_config`; support
            # is retained for backward compatibility.
            quantization_config_raw = config.get("compression_config")
        if quantization_config_raw is None:
            raise ValueError(
                "Checkpoint does not have compressed-tensors configuration"
            )

        try:
            quantization_config = QuantizationConfig.model_validate(
                quantization_config_raw
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

        if quantization_config.quantization_status not in (
            QuantizationStatus.COMPRESSED,
            QuantizationStatus.FROZEN,
        ):
            raise ValueError(
                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
            )

        self.ignore = (
            quantization_config.ignore if quantization_config.ignore is not None else []
        )
        self.loaders = self._get_target_loaders(quantization_config)

        for target, loader in self.loaders.items():
            log_once(
                logger.info,
                f"Using {loader} for compressed-tensors target '{target}'",
            )

    def get_weights(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights(weights, prefix)

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_col_packed(weights, prefix, block_sizes)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights_col(weights, prefixes, dim)

    def get_weights_row(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_row(weights, prefix)

    def _get_target_loaders(
        self, quantization_config: QuantizationConfig
    ) -> Dict[str, WeightsLoader]:
        """
        A compressed-tensors checkpoint can use different quantizations
        for different targets. This method returns a dictionary with a
        loader per target.
        """

        loaders: Dict[str, WeightsLoader] = {}

        format = quantization_config.format

        for group_name, group in quantization_config.config_groups.items():
            # The group configuration can be a string, but does that ever
            # happen in a serialized quantization config?
            assert isinstance(group, QuantizationScheme)

            loader = self._create_loader_for_group(format, group_name, group)

            # A quantized parameter group can have multiple targets, add the
            # loader for all the targets.
            for target in group.targets:
                if target in loaders:
                    raise ValueError(
                        f"Target '{target}' has multiple configured loaders"
                    )
                loaders[target] = loader

        return loaders

    def _create_loader_for_group(
        self, format: str, group_name: str, group: QuantizationScheme
    ) -> WeightsLoader:
        """
        Find and create a loader for the group with the given quantization
        scheme.
        """
        # NOTE: we ignore group.output_activations because we don't support
        # output quantization yet.

        input_activations = group.input_activations
        weights = group.weights
        if (
            format
            in {
                CompressionFormat.float_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.FLOAT
            and weights.num_bits == 8
        ):
            # FP W8A8 or W8A16.
            return W8ANFpLoader(input_activations=input_activations, weights=weights)
        elif (
            format == CompressionFormat.pack_quantized.value
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits in (4, 8)
        ):
            # INT W4A16 or W8A16 (GPTQ/AWQ-like).
            return WNA16Loader(weights)
        else:
            raise ValueError(
                f"Group '{group_name}' has unsupported compressed-tensors configuration"
            )

    def _lookup_loader(self, prefix: str) -> WeightsLoader:
        """
        Look up the loader to use for a given parameter name (prefix).
        """

        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
            return DefaultWeightsLoader(UnquantizedWeight)

        # We currently only handle linear layers, so unconditionally pass
        # a `Linear` instance.
        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
        if len(targets) == 0:
            raise ValueError(
                f"Cannot find compressed-tensors target for prefix: {prefix}"
            )
        return self.loaders[targets[0]]
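For illustration, a minimal sketch of the loader's dispatch behavior, assuming a `config` dict shaped like the example under the commit message; the parameter prefixes below are hypothetical and `_lookup_loader` is the internal helper defined above:

# Hypothetical usage sketch; `config` is the model's parsed config.json dict.
loader = CompressedTensorsLoader(config)

# "lm_head" is in the ignore list, so it falls back to the default
# unquantized loader.
ignored = loader._lookup_loader("lm_head")  # DefaultWeightsLoader(UnquantizedWeight)

# Other linear layers match the "Linear" target of group_0 and use the
# loader created for that group (here, W8ANFpLoader).
quantized = loader._lookup_loader("model.layers.0.self_attn.q_proj")  # W8ANFpLoader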
@@ -0,0 +1,174 @@
from typing import List, Optional, Union

import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
from text_generation_server.utils.weights import Weights, WeightsLoader


class W8ANFpLoader(WeightsLoader):
    """
    Loader for W8A8/W8A16 FP compressed-tensors parameters.
    """

    def __init__(
        self,
        *,
        input_activations: Optional[QuantizationArgs],
        weights: QuantizationArgs,
    ):
        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8

        # We ignore the `strategy` option which sets the scales to be
        # per-tensor, per-channel or per-token. What scales are supported
        # is dependent on the kernels used (e.g. cutlass can do tokenwise,
        # Torch cannot, and FP8-Marlin does not quantize inputs at all).
        # So, instead we try to use the best-possible configuration.

        self.load_weight_scale = not weights.dynamic
        self.load_input_scale = (
            input_activations is not None and not input_activations.dynamic
        )
        self.force_w8a16 = (
            input_activations is not None and input_activations.num_bits == 16
        )

    def __str__(self) -> str:
        def scale_to_str(scale):
            return "static" if scale else "dynamic"

        quantization_type = f"W8A{16 if self.force_w8a16 else 8}"

        return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight")

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = (
                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(w.shape[0])
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            if weight_scale.numel() > 1:
                weight_scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
            if input_scale.numel() > 1:
                input_scale = weights.get_packed_sharded(
                    f"{prefix}.input_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            input_scale = input_scale.reshape(-1).max()

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)

        input_scale = None
        if self.load_input_scale:
            input_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        weight_scale = None
        if self.load_weight_scale:
            weight_scale = (
                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(w.shape[0])
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )
@@ -0,0 +1,188 @@
from typing import List, Union

import torch
from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
from loguru import logger

from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weights, WeightsLoader


class WNA16Loader(WeightsLoader):
    """
    Loader for W4A16/W8A16 INT compressed-tensors parameters.
    """

    def __init__(self, weights: QuantizationArgs):
        self.weights = weights
        self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
        self.groupsize = (
            -1 if self.weights.group_size is None else self.weights.group_size
        )

    def __str__(self) -> str:
        quantization_type = f"W{self.weights.num_bits}A16"

        return f"{self.__class__.__name__} ({quantization_type})"

    def get_weights(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")

        scales = weights.get_tensor(f"{prefix}.weight_scale").t()

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        try:
            weight_packed = weights.get_packed_sharded(
                f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
            ).t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
        ).t()
        scales = scales.to(dtype=weights.dtype)

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_packed_sharded(
                f"{prefix}.weight_zero_point", dim=0, block_sizes=block_sizes
            ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        try:
            weight_packed = torch.cat(
                [
                    weights.get_sharded(f"{p}.weight_packed", dim=0).t()
                    for p in prefixes
                ],
                dim=1,
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
            dim=1,
        )

        zero_point = None
        if not self.weights.symmetric:
            zero_point = torch.cat(
                [weights.get_sharded(f"{p}.weight_zero_point", dim=0).t() for p in prefixes],
                dim=1,
            ).t()

        g_idx = None
        if self.desc_act:
            w = [weights.get_tensor(f"{p}.weight_g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        zero_point = None
        if not self.weights.symmetric:
            if self.desc_act or self.groupsize == -1:
                zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
            else:
                zero_point = weights.get_sharded(
                    f"{prefix}.weight_zero_point", dim=1
                ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_sharded(f"{prefix}.weight_g_idx", dim=0)

        if self.desc_act or self.groupsize == -1:
            scales = weights.get_tensor(f"{prefix}.weight_scale").t()
        else:
            scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()

        sharded_in_features = weights.process_group.size() > 1

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=sharded_in_features,
        )
@@ -29,7 +29,7 @@ else:
    CUTLASS_FP8_AVAILABLE = False


def get_fp8_linear() -> Type[torch.nn.Module]:
def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """

@@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
    if SYSTEM == "cuda":

        major, _ = torch.cuda.get_device_capability()
        if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
        # Marlin is W8A16, use it when:
        #
        # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
        # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
        # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
        if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
            "USE_CUTLASS_W8A8", "0"
        ) != "1":
            # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
            # gives better decoding throughput on L4 and L40.
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
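A toy restatement of the dispatch above, for review purposes only (descriptive strings instead of Module classes; assumes `USE_CUTLASS_W8A8` is unset):

import os

def select_fp8_kernel(major: int, force_w8a16: bool) -> str:
    # Mirrors the branch in get_fp8_linear: Marlin W8A16 on capability 8.x,
    # and on 9.x only when W8A16 is forced.
    use_cutlass_w8a8 = os.getenv("USE_CUTLASS_W8A8", "0") == "1"
    if (major == 8 or (major == 9 and force_w8a16)) and not use_cutlass_w8a8:
        return "GPTQMarlinFP8Linear"  # W8A16 Marlin path
    return "Fp8Linear"                # W8A8 cutlass/torch path

assert select_fp8_kernel(8, False) == "GPTQMarlinFP8Linear"  # e.g. L4/L40 (8.9)
assert select_fp8_kernel(9, True) == "GPTQMarlinFP8Linear"   # H100 with forced W8A16
assert select_fp8_kernel(9, False) == "Fp8Linear"            # H100 default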
@@ -283,14 +290,17 @@ class Fp8Weight(Weight):
    weight_scale: Optional[torch.Tensor] = None
    input_scale: Optional[torch.Tensor] = None
    activation_scale_ub: Optional[float] = None
    force_w8a16: bool = False

    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
            return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
                self.weight, bias, self.dtype
            )
        # This is not checked by the fbgemm kernels, but they require contiguous
        # memory. Can be non-contiguous when we e.g. expand from scalars.
        self.weight_scale = self.weight_scale.contiguous()
        return get_fp8_linear().from_fp8(
        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
            weight=self.weight,
            scale=self.weight_scale,
            dtype=self.dtype,
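A small standalone illustration of the `.contiguous()` comment in `get_linear` (plain PyTorch, not PR code): expanding a scalar scale to a per-channel tensor produces a zero-stride view, which kernels that assume contiguous memory would read incorrectly.

import torch

scale = torch.tensor(0.5).expand(1024)  # zero-stride view of a scalar
assert not scale.is_contiguous()
assert scale.contiguous().is_contiguous()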
@@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):

    def __post_init__(self):
        assert self.qweight.dtype == torch.int32
        assert self.scales.dtype == torch.float16
        assert self.scales.dtype in (torch.float16, torch.bfloat16)
        assert self.g_idx.dtype == torch.int32
        assert self.perm.dtype == torch.int32


@@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
        raise RuntimeError(
            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
        )
    if not (sym or quant_method == "awq"):
    if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
        raise RuntimeError(
            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
        )
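Condensed restatement of the relaxed guard above, as a checkable predicate: asymmetric repacking (sym=False) is now accepted for AWQ and compressed-tensors checkpoints only.

def asymmetric_repack_allowed(sym: bool, quant_method: str) -> bool:
    # Same condition as the `if not (...)` guard, inverted.
    return sym or quant_method in ("awq", "compressed-tensors")

assert asymmetric_repack_allowed(False, "compressed-tensors")
assert asymmetric_repack_allowed(True, "gptq")
assert not asymmetric_repack_allowed(False, "gptq")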
@@ -370,46 +370,23 @@ def get_model(
    compression_config = config_dict.get("compression_config", None)
    if quantization_config is not None and quantize is None:
        method = quantization_config.get("quant_method", None)
        config_groups = quantization_config.get("config_groups", None)
        if method in {"gptq", "awq", "exl2"}:
            log_master(logger.info, f"Auto selecting quantization method {method}")
            quantize = method
        elif method == "fbgemm_fp8" or method == "fp8":
            log_master(logger.info, "Auto selecting quantization method fp8")
            quantize = "fp8"
        elif config_groups is not None:
            # TODO: at some point we should probably fully parse the compression
            # configuration to know which parameters are compressed.
            for _, group in config_groups.items():
                weights_config = group.get("weights")
                if weights_config is not None:
                    if (
                        weights_config["type"] == "float"
                        and weights_config["num_bits"] == 8
                    ):
                        log_master(
                            logger.info, "Auto selecting quantization method fp8"
                        )
                        quantize = "fp8"
                        break
        elif method == "compressed-tensors":
            log_master(
                logger.info, "Auto selecting quantization method compressed-tensors"
            )
            quantize = "compressed-tensors"
        else:
            log_master(logger.warning, f"Unknown quantization method {method}")
    elif compression_config is not None:
        # `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
        config_groups = compression_config.get("config_groups")
        if config_groups is not None:
            for _, group in config_groups.items():
                weights_config = group.get("weights")
                if weights_config is not None:
                    if (
                        weights_config["type"] == "float"
                        and weights_config["num_bits"] == 8
                    ):
                        log_master(
                            logger.info, "Auto selecting quantization method fp8"
                        )
                        quantize = "fp8"
                        break
        log_master(
            logger.info, "Auto selecting quantization method compressed-tensors"
        )
        quantize = "compressed-tensors"

    if dtype is None:
        if quantize in ["awq", "exl2", "gptq", "marlin"]:
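For reviewers, a minimal config shape (hypothetical values, built only from the keys this detection code reads) that the new branch maps to `quantize = "compressed-tensors"` where the old code would have picked `fp8`:

# Illustrative config.json fragment; only quant_method is consulted by the
# new branch, config_groups is what the removed fp8 sniffing used to read.
config_dict = {
    "quantization_config": {
        "quant_method": "compressed-tensors",
        "config_groups": {
            "group_0": {"weights": {"type": "float", "num_bits": 8}},
        },
    }
}

method = config_dict["quantization_config"]["quant_method"]
quantize = "compressed-tensors" if method == "compressed-tensors" else None
assert quantize == "compressed-tensors"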
@@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
    activation_scale_ub: float


def _get_config_json(model_id: str, revision: Optional[str], filename: str):
    if os.path.exists(
        os.path.join(
            model_id,
            filename,
        )
    ):
        filename = os.path.join(model_id, filename)
    else:
        filename = hf_hub_download(model_id, filename=filename, revision=revision)
    with open(filename, "r") as f:
        return json.load(f)


# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
    bits = 4
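Usage sketch for the new helper (the model id is hypothetical, and this assumes `_get_config_json` above is in scope): it resolves the file from a local directory when `model_id` is a path, otherwise via the Hub cache.

data = _get_config_json("neuralmagic/example-w8a8-model", None, "config.json")
quant_method = data.get("quantization_config", {}).get("quant_method")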
@@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):

    filename = "config.json"
    try:
        if os.path.exists(os.path.join(model_id, filename)):
            filename = os.path.join(model_id, filename)
        else:
            filename = hf_hub_download(model_id, filename=filename, revision=revision)
        with open(filename, "r") as f:
            data = json.load(f)
        data = _get_config_json(model_id, revision, filename)

        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":
@@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quantize_config.json"
        try:
            if os.path.exists(os.path.join(model_id, filename)):
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(
                    model_id, filename=filename, revision=revision
                )
            with open(filename, "r") as f:
                data = json.load(f)
            data = _get_config_json(model_id, revision, filename)
            bits = data["bits"]
            groupsize = data["group_size"]
@@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quant_config.json"
        try:
            if os.path.exists(os.path.join(model_id, filename)):
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(
                    model_id, filename=filename, revision=revision
                )
            with open(filename, "r") as f:
                data = json.load(f)
            data = _get_config_json(model_id, revision, filename)
            bits = data["w_bit"]
            groupsize = data["q_group_size"]
            desc_act = data["desc_act"]
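The three try/except blocks above now all funnel through `_get_config_json`, trying the config filenames that different quantizers emit. A condensed sketch of that fallback chain (`_find_quantizer_data` is a hypothetical helper name; the real code parses different keys per file rather than returning raw JSON):

def _find_quantizer_data(model_id, revision):
    # Try each known quantizer config filename in order.
    for candidate in ("config.json", "quantize_config.json", "quant_config.json"):
        try:
            return candidate, _get_config_json(model_id, revision, candidate)
        except Exception:
            continue
    return None, None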
@@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
    if quantize == "compressed-tensors":
        config = _get_config_json(model_id, revision, "config.json")
        from text_generation_server.layers.compressed_tensors import (
            CompressedTensorsLoader,
        )

        return CompressedTensorsLoader(config)

    quantizer_config = _get_quantizer_config(model_id, revision)
    if quantize in {"awq", "gptq"}:
        from text_generation_server.layers.gptq import GPTQWeightsLoader
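Dispatch sketch (hypothetical model id, assuming `get_loader` above is in scope): with `--quantize compressed-tensors`, the loader is built directly from config.json and `_get_quantizer_config` is skipped entirely.

loader = get_loader("compressed-tensors", "neuralmagic/example-w8a8-model", None)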