Support eetq weight only quantization (#1068)

# What does this PR do?   Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.  --------- Co-authored-by: zhaosida <zhaosida@corp.netease.com>
2023-09-27 11:42:57 +02:00 · 2023-09-27 11:42:57 +02:00 · 95a4bb696a
parent 36c2868853
commit 95a4bb696a
7 changed files with 94 additions and 13 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2896,18 +2896,18 @@ dependencies = [
 [[package]]
 name = "thiserror"
-version = "1.0.48"
+version = "1.0.49"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7"
+checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4"
 dependencies = [
 "thiserror-impl",
 ]
 [[package]]
 name = "thiserror-impl"
-version = "1.0.48"
+version = "1.0.49"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35"
+checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc"
 dependencies = [
 "proc-macro2",
 "quote",
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@ -21,11 +21,32 @@ mod env_runtime;
 // Quantization schemes selectable through the launcher's `quantize` argument.
 #[derive(Clone, Copy, Debug, ValueEnum)]
 enum Quantization {
-    Bitsandbytes,
+    /// 4 bit quantization. Requires a specific AWQ quantized model:
-    BitsandbytesNF4,
+    ///   https://hf.co/models?search=awq.
-    BitsandbytesFP4,
+    /// Should replace GPTQ models wherever possible because of the better latency
    Gptq,
    Awq,
    /// 8 bit quantization, doesn't require specific model.
    /// Should be a drop-in replacement to bitsandbytes with much better performance.
    /// Kernels are from https://github.com/NetEase-FuXi/EETQ.git
    Eetq,
    /// 4 bit quantization. Requires a specific GPTQ quantized model: https://hf.co/models?search=gptq.
    /// text-generation-inference will use exllama (faster) kernels wherever possible, and use
    /// triton kernel (wider support) when it's not.
    /// AWQ has faster kernels.
    Gptq,
    /// Bitsandbytes 8bit. Can be applied on any model, will cut the memory requirement in half,
    /// but it is known that the model will be much slower to run than the native f16.
    #[deprecated(
        since = "1.1.0",
        note = "Use `eetq` instead, which provides better latencies overall and is drop-in in most cases"
    )]
    Bitsandbytes,
    /// Bitsandbytes 4bit. Can be applied on any model, will cut the memory requirement by 4x,
    /// but it is known that the model will be much slower to run than the native f16.
    BitsandbytesNF4,
    /// Bitsandbytes 4bit. nf4 should be preferred in most cases but maybe this one has better
    /// perplexity performance for your model
    BitsandbytesFP4,
 }
 impl std::fmt::Display for Quantization {
@ -47,6 +68,9 @@ impl std::fmt::Display for Quantization {
            Quantization::Awq => {
                write!(f, "awq")
            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
        }
    }
 }
@ -127,9 +151,7 @@ struct Args {
    #[clap(long, env)]
    num_shard: Option<usize>,
-    /// Whether you want the model to be quantized. This will use `bitsandbytes` for
+    /// Whether you want the model to be quantized.
    /// quantization on the fly, or `gptq`. 4bit quantization is available through
    /// `bitsandbytes` by providing the `bitsandbytes-fp4` or `bitsandbytes-nf4` options.
    #[clap(long, env, value_enum)]
    quantize: Option<Quantization>,
--- a/server/.gitignore
+++ b/server/.gitignore
@ -160,3 +160,4 @@ flash-attention/
 flash-attention-v2/
 vllm/
 llm-awq/
 eetq/
--- a/server/Makefile
+++ b/server/Makefile
@ -2,6 +2,7 @@ include Makefile-flash-att
 include Makefile-flash-att-v2
 include Makefile-vllm
 include Makefile-awq
 include Makefile-eetq
 unit-tests:
 	pytest -s -vv -m "not private" tests
--- a/server/Makefile-eetq
+++ b/server/Makefile-eetq
@ -0,0 +1,13 @@
 # EETQ commit that `build-eetq` checks out before building.
 eetq_commit := 323827dd471458a84e9c840f614e4592b157a4b1
 # Install the `packaging` Python package and clone the EETQ repository into ./eetq.
 eetq:
    # Clone eetq
	pip install packaging
	git clone https://github.com/NetEase-FuXi/EETQ.git eetq
 # Fetch, check out $(eetq_commit) inside ./eetq, then run its setup.py build step.
 build-eetq: eetq
	cd eetq && git fetch && git checkout $(eetq_commit)
	cd eetq && python setup.py build
 # Install EETQ from the ./eetq checkout with setup.py; depends on build-eetq.
 install-eetq: build-eetq
	cd eetq && python setup.py install
--- a/server/text_generation_server/cli.py
+++ b/server/text_generation_server/cli.py
@ -18,6 +18,7 @@ class Quantization(str, Enum):
    bitsandbytes_fp4 = "bitsandbytes-fp4"
    gptq = "gptq"
    awq = "awq"
    eetq = "eetq"
 class Dtype(str, Enum):
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@ -5,6 +5,8 @@ import torch.distributed
 from torch import nn
 from torch.nn import functional as F
 from typing import List
 from loguru import logger
 from functools import lru_cache
 HAS_BITS_AND_BYTES = True
 try:
@ -42,6 +44,13 @@ elif CAN_EXLLAMA:
 from typing import Optional
 # EETQ is an optional dependency: record whether its kernels could be imported.
 try:
    from EETQ import quant_weights, w8_a16_gemm
 except ImportError:
    HAS_EETQ = False
 else:
    HAS_EETQ = True
 # Monkey patching
@classmethod
 def load_layer_norm(cls, prefix, weights, eps):
@ -120,6 +129,30 @@ class FastLinear(nn.Module):
        return F.linear(input, self.weight, self.bias)
 class EETQLinear(nn.Module):
    """Linear layer backed by EETQ weight-only quantization.

    The weight is quantized once at construction with ``quant_weights``
    (called with ``torch.int8``); ``forward`` multiplies through
    ``w8_a16_gemm`` and adds the bias when one was given.
    """

    def __init__(
        self,
        weight,
        bias,
    ) -> None:
        """Quantize ``weight`` and keep it, its scale and the optional bias.

        Args:
            weight: weight tensor; presumably laid out like the weight that
                ``F.linear`` expects -- TODO confirm against callers.
            bias: bias tensor, or ``None`` when the layer has no bias.
        """
        super().__init__()
        device = weight.device
        # Quantization is run on a transposed, contiguous CPU copy of the weight.
        weight = torch.t(weight).contiguous().cpu()
        weight, scale = quant_weights(weight, torch.int8, False)
        # Use the bias exactly as passed in. Truth-testing it (`if bias:`) raises
        # for tensors with more than one element, and no `weights`/`prefix`
        # exist in this scope to reload it from.
        self.weight = weight.cuda(device)
        self.scale = scale.cuda(device)
        self.bias = bias.cuda(device) if bias is not None else None

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the quantized linear transform to ``input``, adding the bias if set."""
        output = w8_a16_gemm(input, self.weight, self.scale)
        if self.bias is not None:
            output = output + self.bias
        return output
 class Linear8bitLt(nn.Module):
    def __init__(
        self,
@ -211,10 +244,20 @@ class Linear4bit(nn.Module):
        return out
@lru_cache(1)
def warn_deprecate_bnb():
    """Log the bitsandbytes 8bit deprecation warning.

    The function takes no arguments and is wrapped in ``lru_cache``, so calls
    after the first are served from the cache and the warning is logged once.
    """
    logger.warning(
        "Bitsandbytes 8bit is deprecated, using `eetq` is a drop-in replacement, and has much better performance"
    )
 def get_linear(weight, bias, quantize):
    if quantize is None:
        linear = FastLinear(weight, bias)
    elif quantize == "eetq":
        if HAS_EETQ:
            linear = EETQLinear(weight, bias)
        else:
            raise ImportError("Please install EETQ from https://github.com/NetEase-FuXi/EETQ")
    elif quantize == "bitsandbytes":
        warn_deprecate_bnb()
        linear = Linear8bitLt(
            weight,
            bias,
@ -298,8 +341,8 @@ class TensorParallelHead(SuperLayer):
            weight = weights.get_tensor(f"{prefix}.weight")
            should_gather = False
-        # GPTQ and AWQ don't quantize heads (nor embeddings)
+        # GPTQ,AWQ,EETQ don't quantize heads (nor embeddings)
-        if config.quantize in ["gptq", "awq"]:
+        if config.quantize in ["gptq", "awq", "eetq"]:
            quantize = None
        else:
            quantize = config.quantize