Fixing import exl2 (#2399)
This commit is contained in:
parent
730fa00e20
commit
84bc3d7b7d
|
@ -8,34 +8,6 @@ from text_generation_server.utils.import_utils import SYSTEM
|
||||||
from text_generation_server.utils.log import log_once
|
from text_generation_server.utils.log import log_once
|
||||||
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
|
from text_generation_server.utils.weights import Weight, Weights, WeightsLoader
|
||||||
|
|
||||||
# Probe the CUDA compute capability of the current device. Any failure while
# querying it falls back to a major version of 1, which keeps the exllama
# kernels disabled unless the system is ROCm (see CAN_EXLLAMA below).
try:
    major, _minor = torch.cuda.get_device_capability()
except Exception:
    major = 1

# HAS_EXLLAMA stays False when no exllama kernel was imported; otherwise it is
# the string "1" or "2", naming the kernel version that was loaded.
HAS_EXLLAMA = False
# The kernels are only attempted with a compute capability major version of at
# least 8, or on a ROCm system.
CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
# EXLLAMA_VERSION selects the kernel generation; anything other than "2"
# selects the v1 kernels. Defaults to "2".
V2 = os.environ.get("EXLLAMA_VERSION", "2") == "2"

# DISABLE_EXLLAMA=True opts out entirely, leaving HAS_EXLLAMA at False.
if CAN_EXLLAMA and os.environ.get("DISABLE_EXLLAMA") != "True":
    try:
        if not V2:
            from text_generation_server.layers.gptq.exllama import (
                Ex4bitLinear as ExllamaQuantLinear,  # noqa: F401
            )

            HAS_EXLLAMA = "1"
        else:
            from text_generation_server.layers.gptq.exllamav2 import (
                QuantLinear as ExllamaQuantLinear,  # noqa: F401
            )

            HAS_EXLLAMA = "2"
    except ImportError:
        # The kernel module could not be imported: keep HAS_EXLLAMA False.
        pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GPTQWeight(Weight):
|
class GPTQWeight(Weight):
|
||||||
|
@ -432,3 +404,33 @@ class GPTQWeightsLoader(WeightsLoader):
|
||||||
else False
|
else False
|
||||||
)
|
)
|
||||||
self.quant_method = "gptq"
|
self.quant_method = "gptq"
|
||||||
|
|
||||||
|
|
||||||
|
# Needs to be at the end because circular import.

# Probe the CUDA compute capability of the current device. Any failure while
# querying it falls back to a major version of 1, which keeps the exllama
# kernels disabled unless the system is ROCm (see CAN_EXLLAMA below).
try:
    major, _minor = torch.cuda.get_device_capability()
except Exception:
    major = 1

# HAS_EXLLAMA stays False when no exllama kernel was imported; otherwise it is
# the string "1" or "2", naming the kernel version that was loaded.
HAS_EXLLAMA = False
# The kernels are only attempted with a compute capability major version of at
# least 8, or on a ROCm system.
CAN_EXLLAMA = major >= 8 or SYSTEM == "rocm"
# EXLLAMA_VERSION selects the kernel generation; anything other than "2"
# selects the v1 kernels. Defaults to "2".
V2 = os.environ.get("EXLLAMA_VERSION", "2") == "2"

# DISABLE_EXLLAMA=True opts out entirely, leaving HAS_EXLLAMA at False.
if CAN_EXLLAMA and os.environ.get("DISABLE_EXLLAMA") != "True":
    try:
        if not V2:
            from text_generation_server.layers.gptq.exllama import (
                Ex4bitLinear as ExllamaQuantLinear,  # noqa: F401
            )

            HAS_EXLLAMA = "1"
        else:
            from text_generation_server.layers.gptq.exllamav2 import (
                QuantLinear as ExllamaQuantLinear,  # noqa: F401
            )

            HAS_EXLLAMA = "2"
    except ImportError:
        # The kernel module could not be imported: keep HAS_EXLLAMA False.
        pass
|
||||||
|
|
Loading…
Reference in New Issue