From 7e773b0f20397ab260fdd551e628c2c896c889cc Mon Sep 17 00:00:00 2001
From: drbh
Date: Mon, 12 Aug 2024 15:18:02 +0000
Subject: [PATCH] fix: supersede gptq changes with main

---
 .../layers/gptq/gptq_types.py | 69 -------------------
 1 file changed, 69 deletions(-)
 delete mode 100644 server/text_generation_server/layers/gptq/gptq_types.py

diff --git a/server/text_generation_server/layers/gptq/gptq_types.py b/server/text_generation_server/layers/gptq/gptq_types.py
deleted file mode 100644
index 9ac75385..00000000
--- a/server/text_generation_server/layers/gptq/gptq_types.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional
-import torch
-from text_generation_server.utils.weights import Weight
-from text_generation_server.utils.import_utils import SYSTEM
-
-
-@dataclass
-class GPTQWeight(Weight):
-    qweight: torch.Tensor
-    qzeros: torch.Tensor
-    scales: torch.Tensor
-    g_idx: Optional[torch.Tensor]
-    bits: int
-    groupsize: int
-    use_awq_kernel: bool
-    use_exllama: bool
-
-    def __post_init__(self):
-        if self.scales.dtype == torch.float:
-            self.scales = self.scales.half()
-
-    @property
-    def device(self) -> torch.device:
-        return self.qweight.device
-
-    def get_linear(self, bias: torch.Tensor):
-        if self.use_awq_kernel:
-            if SYSTEM == "rocm":
-                raise NotImplementedError(
-                    "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead "
-                    "to use Exllama/GPTQ kernels for AWQ inference."
-                )
-            try:
-                from text_generation_server.layers.awq.quantize.qmodule import WQLinear
-
-                return WQLinear(
-                    w_bit=self.bits,
-                    group_size=self.groupsize,
-                    qweight=self.qweight,
-                    qzeros=self.qzeros,
-                    scales=self.scales,
-                    bias=bias,
-                )
-            except ImportError:
-                raise NotImplementedError(
-                    "You do not seem to have awq installed; either install it (`cd server && make install-awq`) or try GPTQ (`--quantize gptq`): an AWQ -> GPTQ conversion will happen on the fly"
-                )
-        elif self.use_exllama:
-            try:
-                from text_generation_server.layers.gptq import ExllamaQuantLinear
-            except ImportError:
-                raise NotImplementedError(
-                    "Exllama gptq kernels are not installed. Install them: `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`"
-                )
-
-            return ExllamaQuantLinear(self, bias)
-        else:
-            from text_generation_server.layers.gptq.quant_linear import QuantLinear
-
-            return QuantLinear(
-                self.qweight,
-                self.qzeros,
-                self.scales,
-                self.g_idx,
-                bias,
-                self.bits,
-                self.groupsize,
-            )
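
Reviewer note (not part of the patch): the file removed above duplicated definitions that presumably already live on main, which is why the commit can simply delete it. For reference, here is a minimal, hedged sketch of how GPTQWeight.get_linear picks a kernel, based only on the deleted code above. The import path is an assumption (the class appears to be importable from text_generation_server.layers.gptq on main, not something this patch establishes), the tensor shapes are merely illustrative packing for bits=4 / groupsize=128, and actually running the returned module still requires the GPTQ kernels and a CUDA device.

    # Sketch only: builds a GPTQWeight from dummy packed tensors and shows
    # which linear wrapper get_linear() selects. The import location below
    # is an assumption about main, not guaranteed by this patch.
    import torch

    from text_generation_server.layers.gptq import GPTQWeight  # assumed location

    in_features, out_features, bits, groupsize = 256, 128, 4, 128
    pack = 32 // bits  # eight 4-bit values packed per int32

    weight = GPTQWeight(
        qweight=torch.zeros((in_features // pack, out_features), dtype=torch.int32),
        qzeros=torch.zeros((in_features // groupsize, out_features // pack), dtype=torch.int32),
        scales=torch.ones((in_features // groupsize, out_features), dtype=torch.float16),
        g_idx=torch.arange(in_features, dtype=torch.int32) // groupsize,
        bits=bits,
        groupsize=groupsize,
        use_awq_kernel=False,  # True would select the AWQ WQLinear branch
        use_exllama=False,     # True would select the ExllamaQuantLinear branch
    )

    # With both flags off, dispatch falls through to the Triton QuantLinear.
    linear = weight.get_linear(bias=None)
    print(type(linear).__name__)  # QuantLinear

The dispatch is driven by the two booleans rather than by tensor types: the same packed qweight/qzeros/scales tensors serve all three backends, and the flags select between the AWQ, Exllama, and Triton implementations at load time.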