Use eetq kernel from the hub (#3029)
* Use eetq kernel from the hub

* Fixing the CI.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
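In short: the EETQ CUDA extension is no longer compiled inside the Docker image and imported from the EETQ package; the server now pulls the prebuilt kernels-community/quantization-eetq kernel from the Hugging Face Hub and calls it through TGI's load_kernel wrapper (see the eetq.py hunk at the bottom of the diff). As a rough illustration only, assuming the standalone `kernels` library and a CUDA device (function names taken from the diff below, everything else hypothetical), fetching and calling the Hub kernel directly could look like this:

    import torch
    from kernels import get_kernel  # assumes the Hub `kernels` package is installed

    # Download (and cache) the prebuilt kernel from the Hub, then call into it.
    quantization_eetq = get_kernel("kernels-community/quantization-eetq")

    w = torch.randn(4096, 4096, dtype=torch.float16)  # [out_features, in_features]
    w_t = torch.t(w).contiguous().cpu()               # the quantizer below is fed a transposed CPU tensor
    w_int8, scale = quantization_eetq.quant_weights(w_t, torch.int8, False)

    x = torch.randn(2, 4096, dtype=torch.float16, device="cuda")
    y = quantization_eetq.w8_a16_gemm(x, w_int8.cuda(), scale.cuda())  # int8 weights, fp16 activations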
parent cfd4fbb479
commit f0ed76583c
@@ -121,13 +121,6 @@ COPY server/Makefile-awq Makefile
 # Build specific version of transformers
 RUN . .venv/bin/activate && make build-awq
 
-# Build eetq kernels
-FROM kernel-builder AS eetq-kernels-builder
-WORKDIR /usr/src
-COPY server/Makefile-eetq Makefile
-# Build specific version of transformers
-RUN . .venv/bin/activate && make build-eetq
-
 # Build Lorax Punica kernels
 FROM kernel-builder AS lorax-punica-builder
 WORKDIR /usr/src
@@ -216,8 +209,6 @@ COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311
 COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from awq kernels builder
 COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
-# Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
 COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
@@ -978,15 +978,16 @@
         "nixpkgs": "nixpkgs_6"
       },
       "locked": {
-        "lastModified": 1738769628,
-        "narHash": "sha256-hgHf1mscFbH9XtT3dYtFQcxRfict9N+Vi6QSW1c+FjU=",
+        "lastModified": 1739803255,
+        "narHash": "sha256-lreIfcjSt6D0wOuZ6jm3WEBYvYvED63T+pOKmOgBLi8=",
         "owner": "huggingface",
         "repo": "text-generation-inference-nix",
-        "rev": "9a5a58219dead9704d83d9d32f105b6b90bd31f2",
+        "rev": "30ab7423277fc93c8fc0ca4df737478ebfdb8eec",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
+        "ref": "eetq-0.0.1",
         "repo": "text-generation-inference-nix",
         "type": "github"
       }
@@ -5,7 +5,7 @@
       inputs.nixpkgs.follows = "tgi-nix/nixpkgs";
     };
     nix-filter.url = "github:numtide/nix-filter";
-    tgi-nix.url = "github:huggingface/text-generation-inference-nix";
+    tgi-nix.url = "github:huggingface/text-generation-inference-nix/eetq-0.0.1";
     nixpkgs.follows = "tgi-nix/nixpkgs";
     flake-utils.url = "github:numtide/flake-utils";
     rust-overlay = {
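Note: pinning the tgi-nix input to the eetq-0.0.1 ref (recorded in the flake.lock update above) is what makes the new quantization-eetq package available to the Nix build, which swaps it in for the old eetq dependency in the hunks below.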
@@ -6,7 +6,6 @@
   awq-inference-engine,
   causal-conv1d,
   compressed-tensors,
-  eetq,
   einops,
   exllamav2,
   flashinfer,
@@ -36,6 +35,7 @@
   py-cpuinfo,
   pydantic,
   quantization,
+  quantization-eetq,
   safetensors,
   tokenizers,
   torch,
@@ -80,7 +80,6 @@ buildPythonPackage {
 
   dependencies = [
     awq-inference-engine
-    eetq
     causal-conv1d
     compressed-tensors
     einops
@@ -111,6 +110,7 @@ buildPythonPackage {
     py-cpuinfo
     pydantic
     quantization
+    quantization-eetq
     safetensors
     sentencepiece
     tokenizers
@@ -2,7 +2,6 @@ include Makefile-flash-att
 include Makefile-flash-att-v2
 include Makefile-vllm
 include Makefile-awq
-include Makefile-eetq
 include Makefile-selective-scan
 include Makefile-lorax-punica
 include Makefile-exllamav2
@@ -6736,5 +6736,203 @@
         "blob_id": "d97e03913fa5980e0be73b160088c8e4f5f49a52"
       }
     ]
-  }
+  },
+  {
+    "repo_id": "kernels-community/quantization-eetq",
+    "sha": "a80ce846d6270ddddeee109523ed947f594f246b",
+    "files": [
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "9c191845fb7acbd7ea6bae36ce8c237b168557e1"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_v7rnpcck3kry4.abi3.so",
+        "blob_id": "9edc9126b9ec8ce4f47a8e6688a5f0329c905329"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "ccec58b06a2282da51356fe5d04dd1e2757ce80c"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_zcfiojfkx55be.abi3.so",
+        "blob_id": "ea27fb040515267ec631cec5545b878da680e7cc"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "bb409419898138ffa9ade9ba505a167a067ea378"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_btymam4x7xvs6.abi3.so",
+        "blob_id": "0395dd048ccf10ed020a77fa04bcb026ba369d73"
+      },
+      {
+        "filename": "build/torch25-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "f250a00832d2044f7bbb87557a1c878d9c8dd24d"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_yy3p6bsf622sq.abi3.so",
+        "blob_id": "c98d156835e442b039d38a82e9f111036750329c"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "b5259247e8fb3ed9429cf005a525edc8bcae4903"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/_quantization_eetq_imijtykkseqze.abi3.so",
+        "blob_id": "c46908ce00d02376ae8e18efebb7fee55afbc3ac"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu121-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "79f8d42700ad34b9b46e6e328f90885d1ee9beab"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_4qerj3t7ddiry.abi3.so",
+        "blob_id": "9ba519d2fd4e347b784c21f4c171cbbab57c7774"
+      },
+      {
+        "filename": "build/torch25-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "805ec785b7f5196f78dfe77b6cd7c2603c02490e"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_j23ltbqvrnixg.abi3.so",
+        "blob_id": "77d53c16e57c658e8f9caa37b0084c4a3a7ffda1"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "7b590a5a6ede67e0ae13f97dbd7a82a4674e1b23"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_p5neqtnhdgxv2.abi3.so",
+        "blob_id": "e3e5fbd8ce3232b6e9a7c3077eab9665b95bef49"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "0be7ffcb2e9590899683a197b977ec0b39ca7cb7"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_idk3dezy35dfk.abi3.so",
+        "blob_id": "61aa67cbe7ce810bf9792e6e8f19219c757ff181"
+      },
+      {
+        "filename": "build/torch26-cxx11-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "998eba3eddd0520769a2b4ecb3402c024bde44ea"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/_quantization_eetq_fpjoxzd7nm2qa.abi3.so",
+        "blob_id": "31d835db1d0348e3f35c23e6a8f2532fd7e9fea7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu118-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "6d5320b05b03f2f3ddfd299d6e2a72aa6116264f"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/_quantization_eetq_k7mlunxe2ye4s.abi3.so",
+        "blob_id": "1946e4c2fab63243d051012cb12e19895828145f"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu124-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/__init__.py",
+        "blob_id": "c65d0601c655d7acf1a12e61b6549618b46a70d7"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_ops.py",
+        "blob_id": "9b15d85f44e4223ce1f16df987feafd6640dcc62"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/_quantization_eetq_7m7hz3sbwkaio.abi3.so",
+        "blob_id": "eb1536ccd1dfa2655ea7de4445aa3c6790f3a0ae"
+      },
+      {
+        "filename": "build/torch26-cxx98-cu126-x86_64-linux/quantization_eetq/custom_ops.py",
+        "blob_id": "005b5a6e3cd5f7bcfd4aa5d7d80d60a5ed9fab88"
+      }
+    ]
+  }
 ]
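The new lock entry pins kernels-community/quantization-eetq to a single commit and enumerates the prebuilt artifacts for every supported build variant (torch 2.5 and 2.6, CUDA 11.8 through 12.6, cxx11 and cxx98 ABIs), which keeps the Hub download reproducible. A hypothetical helper, not part of TGI, that reads the pinned revision back out of such a lock file:

    import json

    def locked_sha(lock_path: str, repo_id: str) -> str | None:
        # The lock file is a JSON list of {"repo_id", "sha", "files": [...]} entries,
        # as shown in the hunk above.
        with open(lock_path) as f:
            for entry in json.load(f):
                if entry["repo_id"] == repo_id:
                    return entry["sha"]
        return None

    # Path is assumed; pass wherever the lock file shown above lives.
    print(locked_sha("kernels.lock", "kernels-community/quantization-eetq"))
    # expected: a80ce846d6270ddddeee109523ed947f594f246b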
@@ -42,6 +42,7 @@ build-backend = "setuptools.build_meta"
 "kernels-community/paged-attention" = ">=0.0.2"
 "kernels-community/moe" = ">=0.1.1"
 "kernels-community/quantization" = ">=0.0.3"
+"kernels-community/quantization-eetq" = ">=0.0.1"
 
 [project.scripts]
 text-generation-server = "text_generation_server.cli:app"
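With the kernel coming from the Hub, the project metadata only declares a version floor (>=0.0.1); the exact revision actually fetched is the one pinned in the kernels.lock hunk above.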
@@ -1,9 +1,13 @@
 from dataclasses import dataclass
 
 import torch
-from EETQ import quant_weights, w8_a16_gemm
+from text_generation_server.utils.kernels import load_kernel
 from text_generation_server.utils.weights import UnquantizedWeight
 
+quantization_eetq = load_kernel(
+    module="quantization_eetq", repo_id="kernels-community/quantization-eetq"
+)
+
 
 @dataclass
 class EETQWeight(UnquantizedWeight):
@@ -31,13 +35,13 @@ class EETQLinear(torch.nn.Module):
         if weight.dtype != torch.float16:
             weight = weight.to(dtype=torch.float16)
         weight = torch.t(weight).contiguous().cpu()
-        weight, scale = quant_weights(weight, torch.int8, False)
+        weight, scale = quantization_eetq.quant_weights(weight, torch.int8, False)
 
         self.weight = weight.cuda(device)
         self.scale = scale.cuda(device)
         self.bias = bias.cuda(device) if bias is not None else None
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
-        output = w8_a16_gemm(input, self.weight, self.scale)
+        output = quantization_eetq.w8_a16_gemm(input, self.weight, self.scale)
         output = output + self.bias if self.bias is not None else output
         return output
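For context, a minimal usage sketch of the layer touched above. It assumes a CUDA device, that the file lives at text_generation_server.layers.eetq, and that the constructor takes (weight, bias) as the hunk suggests; it is an illustration, not TGI's actual call site:

    import torch
    from text_generation_server.layers.eetq import EETQLinear  # assumed module path

    weight = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")  # [out_features, in_features]
    layer = EETQLinear(weight, bias=None)  # weight is int8-quantized once, in __init__, via the hub kernel

    x = torch.randn(2, 4096, dtype=torch.float16, device="cuda")
    y = layer(x)        # forward() runs quantization_eetq.w8_a16_gemm under the hood
    print(y.shape)      # torch.Size([2, 4096])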