diff --git a/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin.json b/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin.json new file mode 100644 index 00000000..94883de5 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin.json @@ -0,0 +1,89 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.0859375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -16.359375, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 5229, + "logprob": -2.7988281, + "special": false, + "text": " failed" + }, + { + "id": 29901, + "logprob": -0.91259766, + "special": false, + "text": ":" + }, + { + "id": 853, + "logprob": -2.8496094, + "special": false, + "text": " Un" + }, + { + "id": 23765, + "logprob": -1.1894531, + "special": false, + "text": "supported" + }, + { + "id": 4714, + "logprob": -1.5917969, + "special": false, + "text": " browser" + }, + { + "id": 29892, + "logprob": -0.34765625, + "special": false, + "text": "," + }, + { + "id": 1873, + "logprob": -1.2695312, + "special": false, + "text": " version" + }, + { + "id": 470, + "logprob": -0.25170898, + "special": false, + "text": " or" + }, + { + "id": 7481, + "logprob": -0.21411133, + "special": false, + "text": " platform" + }, + { + "id": 13, + "logprob": -1.1162109, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": " failed: Unsupported browser, version or platform\n" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_all_params.json new file mode 100644 index 00000000..58cacb80 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_all_params.json @@ -0,0 +1,89 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.0859375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -16.359375, + "text": "request" + } + ], + "seed": 0, + "tokens": [ + { + "id": 5229, + "logprob": -0.6645508, + "special": false, + "text": " failed" + }, + { + "id": 29901, + "logprob": 0.0, + "special": false, + "text": ":" + }, + { + "id": 6527, + "logprob": -2.2324219, + "special": false, + "text": " Could" + }, + { + "id": 451, + "logprob": 0.0, + "special": false, + "text": " not" + }, + { + "id": 6088, + "logprob": -1.6074219, + "special": false, + "text": " parse" + }, + { + "id": 1243, + "logprob": -1.6298828, + "special": false, + "text": " test" + }, + { + "id": 1206, + "logprob": -0.72558594, + "special": false, + "text": " case" + }, + { + "id": 1024, + "logprob": -0.40429688, + "special": false, + "text": " name" + }, + { + "id": 515, + "logprob": 0.0, + "special": false, + "text": " from" + }, + { + "id": 525, + "logprob": -1.2519531, + "special": false, + "text": " '" + } + ], + "top_tokens": null + }, + "generated_text": "Test request failed: Could not parse test case name from '" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_load.json 
b/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_load.json new file mode 100644 index 00000000..96a40fa4 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_marlin_24/test_flash_llama_marlin24_load.json @@ -0,0 +1,358 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.0859375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -16.359375, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 5229, + "logprob": -2.7988281, + "special": false, + "text": " failed" + }, + { + "id": 29901, + "logprob": -0.91259766, + "special": false, + "text": ":" + }, + { + "id": 853, + "logprob": -2.8496094, + "special": false, + "text": " Un" + }, + { + "id": 23765, + "logprob": -1.1894531, + "special": false, + "text": "supported" + }, + { + "id": 4714, + "logprob": -1.5917969, + "special": false, + "text": " browser" + }, + { + "id": 29892, + "logprob": -0.34765625, + "special": false, + "text": "," + }, + { + "id": 1873, + "logprob": -1.2695312, + "special": false, + "text": " version" + }, + { + "id": 470, + "logprob": -0.25170898, + "special": false, + "text": " or" + }, + { + "id": 7481, + "logprob": -0.21411133, + "special": false, + "text": " platform" + }, + { + "id": 13, + "logprob": -1.1162109, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": " failed: Unsupported browser, version or platform\n" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.0859375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -16.359375, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 5229, + "logprob": -2.7988281, + "special": false, + "text": " failed" + }, + { + "id": 29901, + "logprob": -0.91259766, + "special": false, + "text": ":" + }, + { + "id": 853, + "logprob": -2.8496094, + "special": false, + "text": " Un" + }, + { + "id": 23765, + "logprob": -1.1894531, + "special": false, + "text": "supported" + }, + { + "id": 4714, + "logprob": -1.5917969, + "special": false, + "text": " browser" + }, + { + "id": 29892, + "logprob": -0.34765625, + "special": false, + "text": "," + }, + { + "id": 1873, + "logprob": -1.2695312, + "special": false, + "text": " version" + }, + { + "id": 470, + "logprob": -0.25170898, + "special": false, + "text": " or" + }, + { + "id": 7481, + "logprob": -0.21411133, + "special": false, + "text": " platform" + }, + { + "id": 13, + "logprob": -1.1162109, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": " failed: Unsupported browser, version or platform\n" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.0859375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -16.359375, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 5229, + "logprob": -2.7988281, + "special": false, + "text": " failed" + }, + { + "id": 29901, + "logprob": -0.91259766, + "special": false, + "text": ":" + }, + { + "id": 853, + "logprob": -2.8496094, + "special": false, + "text": " Un" + }, + { + "id": 23765, + "logprob": -1.1894531, + 
"special": false, + "text": "supported" + }, + { + "id": 4714, + "logprob": -1.5917969, + "special": false, + "text": " browser" + }, + { + "id": 29892, + "logprob": -0.34765625, + "special": false, + "text": "," + }, + { + "id": 1873, + "logprob": -1.2695312, + "special": false, + "text": " version" + }, + { + "id": 470, + "logprob": -0.25170898, + "special": false, + "text": " or" + }, + { + "id": 7481, + "logprob": -0.21411133, + "special": false, + "text": " platform" + }, + { + "id": 13, + "logprob": -1.1162109, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": " failed: Unsupported browser, version or platform\n" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 1, + "logprob": null, + "text": "" + }, + { + "id": 4321, + "logprob": -9.0859375, + "text": "Test" + }, + { + "id": 2009, + "logprob": -16.359375, + "text": "request" + } + ], + "seed": null, + "tokens": [ + { + "id": 5229, + "logprob": -2.7988281, + "special": false, + "text": " failed" + }, + { + "id": 29901, + "logprob": -0.91259766, + "special": false, + "text": ":" + }, + { + "id": 853, + "logprob": -2.8496094, + "special": false, + "text": " Un" + }, + { + "id": 23765, + "logprob": -1.1894531, + "special": false, + "text": "supported" + }, + { + "id": 4714, + "logprob": -1.5917969, + "special": false, + "text": " browser" + }, + { + "id": 29892, + "logprob": -0.34765625, + "special": false, + "text": "," + }, + { + "id": 1873, + "logprob": -1.2695312, + "special": false, + "text": " version" + }, + { + "id": 470, + "logprob": -0.25170898, + "special": false, + "text": " or" + }, + { + "id": 7481, + "logprob": -0.21411133, + "special": false, + "text": " platform" + }, + { + "id": 13, + "logprob": -1.1162109, + "special": false, + "text": "\n" + } + ], + "top_tokens": null + }, + "generated_text": " failed: Unsupported browser, version or platform\n" + } +] diff --git a/integration-tests/models/test_completion_prompts.py b/integration-tests/models/test_completion_prompts.py index 0efb6693..d787873b 100644 --- a/integration-tests/models/test_completion_prompts.py +++ b/integration-tests/models/test_completion_prompts.py @@ -100,6 +100,8 @@ async def test_flash_llama_completion_many_prompts_stream( chunk = [c.replace("data:", "") for c in chunk] # remove empty strings chunk = [c for c in chunk if c] + # remove completion marking chunk + chunk = [c for c in chunk if c != " [DONE]"] # parse json chunk = [json.loads(c) for c in chunk] diff --git a/integration-tests/models/test_flash_llama_marlin_24.py b/integration-tests/models/test_flash_llama_marlin_24.py new file mode 100644 index 00000000..3eb94f02 --- /dev/null +++ b/integration-tests/models/test_flash_llama_marlin_24.py @@ -0,0 +1,66 @@ +import pytest + + +@pytest.fixture(scope="module") +def flash_llama_marlin24_handle(launcher): + with launcher( + "nm-testing/Llama-2-7b-pruned2.4-Marlin_24", quantize="marlin" + ) as handle: + yield handle + + +@pytest.fixture(scope="module") +async def flash_llama_marlin(flash_llama_marlin24_handle): + await flash_llama_marlin24_handle.health(300) + return flash_llama_marlin24_handle.client + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_marlin(flash_llama_marlin, response_snapshot): + response = await flash_llama_marlin.generate( + "Test request", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert 
response == response_snapshot + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_marlin24_all_params(flash_llama_marlin, response_snapshot): + response = await flash_llama_marlin.generate( + "Test request", + max_new_tokens=10, + repetition_penalty=1.2, + return_full_text=True, + temperature=0.5, + top_p=0.9, + top_k=10, + truncate=5, + typical_p=0.9, + watermark=True, + decoder_input_details=True, + seed=0, + ) + + assert response.details.generated_tokens == 10 + assert response == response_snapshot + + +@pytest.mark.release +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_llama_marlin24_load( + flash_llama_marlin, generate_load, response_snapshot +): + responses = await generate_load( + flash_llama_marlin, "Test request", max_new_tokens=10, n=4 + ) + + assert len(responses) == 4 + assert all([r.generated_text == responses[0].generated_text for r in responses]) + + assert responses == response_snapshot diff --git a/server/tests/utils/test_weights.py b/server/tests/utils/test_weights.py index 36b27be8..d2d2b76e 100644 --- a/server/tests/utils/test_weights.py +++ b/server/tests/utils/test_weights.py @@ -2,6 +2,7 @@ import pytest import torch from text_generation_server.utils.weights import ( DefaultWeightsLoader, + UnquantizedWeight, Weights, WeightsLoader, ) @@ -363,7 +364,10 @@ class MockWeights(Weights): self.process_group = process_group self.prefix = prefix self.weights_loader = ( - DefaultWeightsLoader() if weights_loader is None else weights_loader + # We don't need to get linear layers, so just wrap raw tensors. + DefaultWeightsLoader(lambda x: x) + if weights_loader is None + else weights_loader ) self._handles = {} @@ -632,6 +636,7 @@ def test_get_weights_col_awq(gptq_weights_loader_awq): g_idx=None, bits=8.0, groupsize=2.0, + use_awq_kernel=True, use_exllama=False, ) @@ -641,6 +646,7 @@ def test_get_weights_col_awq(gptq_weights_loader_awq): assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -669,6 +675,7 @@ def test_get_weights_col_gtpq(gptq_weights_loader): g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), bits=8.0, groupsize=2.0, + use_awq_kernel=False, use_exllama=False, ) @@ -678,6 +685,7 @@ def test_get_weights_col_gtpq(gptq_weights_loader): assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -774,6 +782,7 @@ def test_get_weights_col_packed_awq(gptq_weights_loader_awq): g_idx=None, bits=8.0, groupsize=2.0, + use_awq_kernel=True, use_exllama=False, ) @@ -783,6 +792,7 @@ def test_get_weights_col_packed_awq(gptq_weights_loader_awq): assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -851,6 +861,7 @@ def 
test_get_weights_col_packed_gptq(gptq_weights_loader): g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), bits=8.0, groupsize=2.0, + use_awq_kernel=False, use_exllama=False, ) @@ -860,6 +871,7 @@ def test_get_weights_col_packed_gptq(gptq_weights_loader): assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -922,6 +934,7 @@ def test_get_multi_weights_col_awq(gptq_weights_loader_awq): g_idx=None, bits=8.0, groupsize=2.0, + use_awq_kernel=True, use_exllama=False, ) @@ -931,6 +944,7 @@ def test_get_multi_weights_col_awq(gptq_weights_loader_awq): assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -983,6 +997,7 @@ def test_get_multi_weights_col_gptq(gptq_weights_loader): g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), bits=8.0, groupsize=2.0, + use_awq_kernel=False, use_exllama=False, ) @@ -992,6 +1007,7 @@ def test_get_multi_weights_col_gptq(gptq_weights_loader): assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -1051,6 +1067,7 @@ def test_get_weights_row_awq(gptq_weights_loader_awq): g_idx=None, bits=8.0, groupsize=2.0, + use_awq_kernel=True, use_exllama=False, ) @@ -1060,6 +1077,7 @@ def test_get_weights_row_awq(gptq_weights_loader_awq): assert w.g_idx == expected_weight.g_idx, "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" @@ -1125,6 +1143,7 @@ def test_get_weights_row_gptq(gptq_weights_loader): g_idx=torch.tensor([0, 1, 0, 1], dtype=torch.int32), bits=8.0, groupsize=2.0, + use_awq_kernel=False, use_exllama=False, ) @@ -1134,6 +1153,7 @@ def test_get_weights_row_gptq(gptq_weights_loader): assert torch.allclose(w.g_idx, expected_weight.g_idx), "g_idx mismatch" assert w.bits == expected_weight.bits, "bits mismatch" assert w.groupsize == expected_weight.groupsize, "groupsize mismatch" + assert w.use_awq_kernel == expected_weight.use_awq_kernel, "use_awq_kernel mismatch" assert w.use_exllama == expected_weight.use_exllama, "use_exllama mismatch" diff --git a/server/text_generation_server/layers/bnb.py b/server/text_generation_server/layers/bnb.py index ca39919c..925b0b2d 100644 --- a/server/text_generation_server/layers/bnb.py +++ b/server/text_generation_server/layers/bnb.py @@ -1,8 +1,11 @@ -import torch -from loguru import logger +from dataclasses import dataclass from functools import lru_cache + import bitsandbytes as bnb +import torch from bitsandbytes.nn import Int8Params, Params4bit +from loguru import logger +from 
text_generation_server.utils.weights import Weight @lru_cache(1) @@ -12,6 +15,14 @@ def warn_deprecate_bnb(): ) +@dataclass +class BNBWeight(Weight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + return Linear8bitLt(self.weight, bias, has_fp16_weights=False, threshold=6.0) + + class Linear8bitLt(torch.nn.Module): def __init__( self, @@ -70,6 +81,22 @@ class Linear8bitLt(torch.nn.Module): return out +@dataclass +class BNBFP4Weight(Weight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + return Linear4bit(self.weight, bias, quant_type="fp4") + + +@dataclass +class BNBNF4Weight(Weight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + return Linear4bit(self.weight, bias, quant_type="nf4") + + class Linear4bit(torch.nn.Module): def __init__(self, weight, bias, quant_type): super().__init__() diff --git a/server/text_generation_server/layers/eetq.py b/server/text_generation_server/layers/eetq.py index fd22b5c6..f003f914 100644 --- a/server/text_generation_server/layers/eetq.py +++ b/server/text_generation_server/layers/eetq.py @@ -1,5 +1,23 @@ +from dataclasses import dataclass + import torch from EETQ import quant_weights, w8_a16_gemm +from text_generation_server.utils.weights import Weight + + +@dataclass +class EETQWeight(Weight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + try: + from text_generation_server.layers.eetq import EETQLinear + + return EETQLinear(self.weight, bias) + except ImportError: + raise ImportError( + "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" + ) class EETQLinear(torch.nn.Module): diff --git a/server/text_generation_server/layers/exl2.py b/server/text_generation_server/layers/exl2.py index 55cba1cc..1a5acfa8 100644 --- a/server/text_generation_server/layers/exl2.py +++ b/server/text_generation_server/layers/exl2.py @@ -1,12 +1,12 @@ -import torch -from typing import List, Union from dataclasses import dataclass +from typing import List, Union -from text_generation_server.utils.weights import WeightsLoader, Weights +import torch +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader @dataclass -class Exl2Weight: +class Exl2Weight(Weight): """ Exllama2 exl2 quantized weights. 
""" @@ -25,6 +25,11 @@ class Exl2Weight: def device(self) -> torch.device: return self.q_weight.device + def get_linear(self, bias: torch.Tensor): + from text_generation_server.layers.gptq import ExllamaQuantLinear + + return ExllamaQuantLinear(self, bias) + class Exl2WeightsLoader(WeightsLoader): """Loader for exl2-quantized weights.""" diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index b76af8f1..b56f568a 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -1,7 +1,8 @@ -from enum import Enum, auto +from dataclasses import dataclass import torch from text_generation_server.utils.import_utils import SYSTEM +from text_generation_server.utils.weights import Weight def get_fp8_linear() -> torch.nn.Module: @@ -37,6 +38,14 @@ def fp8_quantize(weight, qdtype=torch.float8_e4m3fn): return qweight, scale +@dataclass +class Fp8Weight(Weight): + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + return get_fp8_linear()(self.weight, bias) + + class Fp8Linear(torch.nn.Module): def __init__( self, diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py index aaa7a68a..c98dbefc 100644 --- a/server/text_generation_server/layers/gptq/__init__.py +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -1,24 +1,23 @@ -from dataclasses import dataclass -from loguru import logger import os +from dataclasses import dataclass from typing import List, Optional, Union -from safetensors import SafetensorError -from text_generation_server.utils.weights import Weights, WeightsLoader + import torch -from text_generation_server.utils.import_utils import ( - SYSTEM, -) +from loguru import logger +from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader @dataclass -class GPTQWeight: +class GPTQWeight(Weight): qweight: torch.Tensor qzeros: torch.Tensor scales: torch.Tensor g_idx: Optional[torch.Tensor] bits: int groupsize: int + use_awq_kernel: bool use_exllama: bool def __post_init__(self): @@ -29,6 +28,50 @@ class GPTQWeight: def device(self) -> torch.device: return self.qweight.device + def get_linear(self, bias: torch.Tensor): + if self.use_awq_kernel: + if SYSTEM == "rocm": + raise NotImplementedError( + "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " + "to use Exllama/GPTQ kernels for AWQ inference." + ) + try: + from text_generation_server.layers.awq.quantize.qmodule import WQLinear + + return WQLinear( + w_bit=self.bits, + group_size=self.groupsize, + qweight=self.qweight, + qzeros=self.qzeros, + scales=self.scales, + bias=bias, + ) + except ImportError: + raise NotImplementedError( + "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" + ) + elif self.use_exllama: + try: + from text_generation_server.layers.gptq import ExllamaQuantLinear + except ImportError: + raise NotImplementedError( + f"Exllama gptq kernels are not installed. 
Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" + ) + + return ExllamaQuantLinear(self, bias) + else: + from text_generation_server.layers.gptq.quant_linear import QuantLinear + + return QuantLinear( + self.qweight, + self.qzeros, + self.scales, + self.g_idx, + bias, + self.bits, + self.groupsize, + ) + try: major, _minor = torch.cuda.get_device_capability() @@ -45,6 +88,8 @@ elif CAN_EXLLAMA: if V2: from text_generation_server.layers.gptq.exllamav2 import ( QuantLinear as ExllamaQuantLinear, + ) + from text_generation_server.layers.gptq.exllamav2 import ( create_exllama_buffers, set_device, ) @@ -53,6 +98,8 @@ elif CAN_EXLLAMA: else: from text_generation_server.layers.gptq.exllama import ( Ex4bitLinear as ExllamaQuantLinear, + ) + from text_generation_server.layers.gptq.exllama import ( create_exllama_buffers, set_device, ) @@ -162,6 +209,7 @@ class GPTQWeightsLoader(WeightsLoader): g_idx=g_idx, bits=self.bits, groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", use_exllama=False, ) @@ -255,6 +303,7 @@ class GPTQWeightsLoader(WeightsLoader): g_idx=g_idx, bits=self.bits, groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", use_exllama=use_exllama, ) @@ -336,8 +385,8 @@ class GPTQWeightsLoader(WeightsLoader): use_exllama = False from text_generation_server.layers.gptq import ( - HAS_EXLLAMA, CAN_EXLLAMA, + HAS_EXLLAMA, GPTQWeight, ) @@ -389,6 +438,7 @@ class GPTQWeightsLoader(WeightsLoader): g_idx=g_idx, bits=self.bits, groupsize=self.groupsize, + use_awq_kernel=self.quantize == "awq", use_exllama=use_exllama, ) diff --git a/server/text_generation_server/layers/linear.py b/server/text_generation_server/layers/linear.py index babd86b0..a97cc43a 100644 --- a/server/text_generation_server/layers/linear.py +++ b/server/text_generation_server/layers/linear.py @@ -1,7 +1,8 @@ from typing import Optional + import torch -from torch.nn import functional as F from text_generation_server.utils.import_utils import SYSTEM +from torch.nn import functional as F if SYSTEM == "rocm": try: @@ -90,167 +91,14 @@ class FastLinearROCm(torch.nn.Module): return F.linear(inp, self.weight, self.bias) -def get_linear(weight, bias, quantize): - if quantize is None: +def get_linear(weight, bias): + # Weights that are loaded through methods that are not + # quantization-aware are still bare tensors. We may want + # to change this in the future. + if isinstance(weight, torch.Tensor): if SYSTEM == "rocm": - linear = FastLinearROCm(weight, bias) + return FastLinearROCm(weight, bias) else: - linear = FastLinear(weight, bias) - elif quantize == "eetq": - try: - from text_generation_server.layers.eetq import EETQLinear + return FastLinear(weight, bias) - linear = EETQLinear(weight, bias) - except ImportError: - raise ImportError( - "Please install EETQ from https://github.com/NetEase-FuXi/EETQ" - ) - elif quantize == "fp8": - from text_generation_server.layers.fp8 import get_fp8_linear - - linear = get_fp8_linear()(weight, bias) - elif quantize == "bitsandbytes": - try: - from text_generation_server.layers.bnb import ( - warn_deprecate_bnb, - Linear8bitLt, - ) - except ImportError: - raise NotImplementedError( - f"Bitsandbytes is missing install it with `pip install bitsandbytes`." 
- ) - warn_deprecate_bnb() - linear = Linear8bitLt( - weight, - bias, - has_fp16_weights=False, - threshold=6.0, - ) - if bias is not None: - linear.bias = nn.Parameter(bias) - elif quantize == "bitsandbytes-fp4": - try: - from text_generation_server.layers.bnb import Linear4bit - except ImportError: - raise NotImplementedError( - f"Bitsandbytes is missing install it with `pip install bitsandbytes`." - ) - linear = Linear4bit( - weight, - bias, - quant_type="fp4", - ) - elif quantize == "bitsandbytes-nf4": - try: - from text_generation_server.layers.bnb import Linear4bit - except ImportError: - raise NotImplementedError( - f"Bitsandbytes is missing install it with `pip install bitsandbytes`." - ) - linear = Linear4bit( - weight, - bias, - quant_type="nf4", - ) - elif quantize == "exl2": - from text_generation_server.layers.exl2 import Exl2Weight - - if not isinstance(weight, Exl2Weight): - raise NotImplementedError( - f"The passed weight is not `exl2` compatible, loader needs to be updated." - ) - - from text_generation_server.layers.gptq import ExllamaQuantLinear - - linear = ExllamaQuantLinear(weight, bias) - - elif quantize == "gptq": - from text_generation_server.layers.gptq import GPTQWeight - from text_generation_server.layers.marlin import ( - GPTQMarlinLinear, - GPTQMarlinWeight, - ) - - if isinstance(weight, GPTQMarlinWeight): - linear = GPTQMarlinLinear( - weight=weight, - bias=bias, - ) - elif isinstance(weight, GPTQWeight): - if weight.use_exllama: - try: - from text_generation_server.layers.gptq import ( - ExllamaQuantLinear, - ) - except ImportError: - raise NotImplementedError( - f"Exllama gptq kernels are not installed. Install them `cd server/exllama_kernels && python setup.py install && cd ../exllamav2_kernels && python setup.py install`" - ) - - linear = ExllamaQuantLinear(weight, bias) - else: - from text_generation_server.layers.gptq.quant_linear import QuantLinear - - linear = QuantLinear( - weight.qweight, - weight.qzeros, - weight.scales, - weight.g_idx, - bias, - weight.bits, - weight.groupsize, - ) - else: - raise NotImplementedError( - f"The passed weight is not `gptq` compatible, loader needs to be updated." - ) - - elif quantize == "awq": - from text_generation_server.layers.gptq import GPTQWeight - - if not isinstance(weight, GPTQWeight): - raise NotImplementedError( - f"The passed weight is not `awq` compatible, loader needs to be updated." - ) - if SYSTEM == "rocm": - raise NotImplementedError( - "AWQ GEMM kernel can't be used on ROCm systems, please use `--quantize gptq` instead " - "to use Exllama/GPTQ kernels for AWQ inference." 
- ) - try: - from text_generation_server.layers.awq.quantize.qmodule import WQLinear - - linear = WQLinear( - w_bit=weight.bits, - group_size=weight.groupsize, - qweight=weight.qweight, - qzeros=weight.qzeros, - scales=weight.scales, - bias=bias, - ) - except ImportError: - raise NotImplementedError( - "You do not seem to have awq installed, either install it (cd server && make install-awq), or try using GPTQ `---quantize gptq` a conversion AWQ->GPTQ will happen on the fly" - ) - elif quantize == "marlin": - from text_generation_server.layers.marlin import ( - GPTQMarlin24Linear, - GPTQMarlin24Weight, - MarlinLinear, - MarlinWeight, - ) - - if isinstance(weight, GPTQMarlin24Weight): - linear = GPTQMarlin24Linear( - weight=weight, - bias=bias, - ) - elif isinstance(weight, MarlinWeight): - linear = MarlinLinear(weight=weight, bias=bias) - else: - raise NotImplementedError( - f"The passed weight is not `marlin` compatible, loader needs to be updated." - ) - else: - raise NotImplementedError(f"Quantization `{quantize}` is not implemented yet.") - return linear + return weight.get_linear(bias) diff --git a/server/text_generation_server/layers/marlin.py b/server/text_generation_server/layers/marlin.py index 9777a47e..e7f017a4 100644 --- a/server/text_generation_server/layers/marlin.py +++ b/server/text_generation_server/layers/marlin.py @@ -7,7 +7,7 @@ from loguru import logger from text_generation_server.layers.fp8 import fp8_quantize from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.utils.log import log_once -from text_generation_server.utils.weights import Weights, WeightsLoader +from text_generation_server.utils.weights import Weight, Weights, WeightsLoader try: import marlin_kernels @@ -63,8 +63,7 @@ class MarlinWeightsLoader(WeightsLoader): return weight def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int): - is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24" - if is_marlin_24: + if self.is_marlin_24: try: B = torch.cat( [weights.get_sharded(f"{p}.B_24", dim=1) for p in prefixes], dim=1 @@ -101,8 +100,7 @@ class MarlinWeightsLoader(WeightsLoader): return weight def get_weights_row(self, weights: Weights, prefix: str): - is_marlin_24 = getattr(self, "gptq_checkpoint_format", None) == "marlin_24" - if is_marlin_24: + if self.is_marlin_24: try: B = weights.get_sharded(f"{prefix}.B_24", dim=0) except RuntimeError: @@ -201,7 +199,7 @@ def permute_scales(scales: torch.Tensor): @dataclass -class GPTQMarlinWeight: +class GPTQMarlinWeight(Weight): """ Repacked GPTQ Marlin weights. """ @@ -219,6 +217,12 @@ class GPTQMarlinWeight: assert self.g_idx.dtype == torch.int32 assert self.perm.dtype == torch.int32 + def get_linear(self, bias: torch.Tensor): + return GPTQMarlinLinear( + weight=self, + bias=bias, + ) + def repack_gptq_for_marlin( *, @@ -376,6 +380,12 @@ class GPTQMarlin24Weight: assert self.B_meta.dtype == torch.int16 assert self.s.dtype == torch.float16 + def get_linear(self, bias: torch.Tensor): + return GPTQMarlin24Linear( + weight=self, + bias=bias, + ) + class GPTQMarlin24Linear(nn.Module): def __init__(self, *, weight: GPTQMarlin24Weight, bias: Optional[torch.Tensor]): @@ -567,7 +577,7 @@ def repack_fp8_for_marlin(weight: torch.Tensor, scale: torch.Tensor): @dataclass -class MarlinWeight: +class MarlinWeight(Weight): """ Marlin weights. 
@@ -583,6 +593,9 @@ class MarlinWeight: assert self.B.dtype == torch.int32 assert self.s.dtype == torch.float16 + def get_linear(self, bias: torch.Tensor): + return MarlinLinear(weight=self, bias=bias) + class MarlinLinear(nn.Module): def __init__(self, *, weight: MarlinWeight, bias: Optional[torch.Tensor]): diff --git a/server/text_generation_server/layers/tensor_parallel.py b/server/text_generation_server/layers/tensor_parallel.py index 011f105b..9dddb8ae 100644 --- a/server/text_generation_server/layers/tensor_parallel.py +++ b/server/text_generation_server/layers/tensor_parallel.py @@ -77,7 +77,7 @@ class TensorParallelHead(SuperLayer): quantize = config.quantize return TensorParallelHead( - get_linear(weight, bias=None, quantize=quantize), + get_linear(weight, bias=None), process_group=weights.process_group, should_gather=should_gather, ) @@ -134,7 +134,7 @@ class TensorParallelColumnLinear(SuperLayer): raise NotImplementedError("packed_gate_up only implemented without bias") else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) return cls(linear) @classmethod @@ -157,7 +157,7 @@ class TensorParallelColumnLinear(SuperLayer): raise NotImplementedError("packed_qkv only implemented for baichuan") else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) return cls(linear) @classmethod @@ -167,7 +167,7 @@ class TensorParallelColumnLinear(SuperLayer): bias = weights.get_sharded(f"{prefix}.bias", dim=0) else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) return cls(linear) @classmethod @@ -177,7 +177,7 @@ class TensorParallelColumnLinear(SuperLayer): for prefix in prefixes: weight = weights.get_weights_col(prefix) b = weights.get_tensor(f"{prefix}.bias") if bias else None - linears.append(get_linear(weight, b, config.quantize)) + linears.append(get_linear(weight, b)) linear = LayerConcat(linears) else: weight = weights.get_multi_weights_col(prefixes, dim=dim) @@ -186,7 +186,7 @@ class TensorParallelColumnLinear(SuperLayer): bias = torch.cat(b, dim=dim) else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) return cls(linear) @@ -205,7 +205,7 @@ class TensorParallelRowLinear(SuperLayer): else: bias = None return cls( - get_linear(weight, bias, config.quantize), + get_linear(weight, bias), process_group=weights.process_group, ) diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index 49f5a81b..c7b29d13 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -186,9 +186,7 @@ def _load_gqa(config, prefix: str, weights): else: bias = None - return TensorParallelColumnLinear( - get_linear(weight, bias=bias, quantize=config.quantize) - ) + return TensorParallelColumnLinear(get_linear(weight, bias=bias)) class FlashCohereAttention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index 44411687..7426fc55 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -247,10 +247,10 @@ def 
_load_experts_quantized(config, prefix, weights, cls): if cls == TensorParallelRowLinear: expert_slice = expert_slice.t().contiguous() - linear = get_linear(expert_slice, None, config.quantize) + linear = get_linear(expert_slice, None) experts.append(cls(linear, weights.process_group)) else: - linear = get_linear(expert_slice, None, config.quantize) + linear = get_linear(expert_slice, None) experts.append(cls(linear)) return experts diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py index a3ce5521..5273b15d 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -155,9 +155,7 @@ def _load_gqa(config, prefix: str, weights): config.hidden_size, ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" - return TensorParallelColumnLinear( - get_linear(weight, bias=None, quantize=config.quantize) - ) + return TensorParallelColumnLinear(get_linear(weight, bias=None)) class FlashGemma2Attention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py index 34a7efa2..829ad427 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -155,9 +155,7 @@ def _load_gqa(config, prefix: str, weights): config.hidden_size, ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" - return TensorParallelColumnLinear( - get_linear(weight, bias=None, quantize=config.quantize) - ) + return TensorParallelColumnLinear(get_linear(weight, bias=None)) class FlashGemmaAttention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py index cbfcb1b8..a55a4af3 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -82,7 +82,7 @@ def _load_qkv_gptq(config, prefix: str, weights): bias = torch.cat(tensors, dim=0) bias = bias.to(device=weights.device) - return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + return TensorParallelColumnLinear(get_linear(weight, bias)) def _load_qkv(config, prefix: str, weights, head_size, num_heads): @@ -129,7 +129,7 @@ def _load_qkv(config, prefix: str, weights, head_size, num_heads): 3 * num_heads * head_size ], f"{weight.shape} != {[3 * num_heads * head_size]}" - return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + return TensorParallelColumnLinear(get_linear(weight, bias)) def load_row(config, prefix: str, weights, bias: bool): @@ -147,7 +147,7 @@ def load_row(config, prefix: str, weights, bias: bool): bias = None return TensorParallelRowLinear( - get_linear(weight, bias, config.quantize), process_group=weights.process_group + get_linear(weight, bias), process_group=weights.process_group ) @@ -163,7 +163,7 @@ def load_col(config, prefix: str, weights, bias: bool): else: bias = None - return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + return 
TensorParallelColumnLinear(get_linear(weight, bias)) class FlashGPT2Attention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 78832341..5237a484 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -18,6 +18,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from contextlib import contextmanager from typing import List, Optional, Tuple import torch @@ -25,7 +26,6 @@ import torch.distributed from torch import nn from transformers.activations import ACT2FN -from typing import Optional, List, Tuple from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( @@ -42,10 +42,16 @@ from text_generation_server.layers import ( TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) +from text_generation_server.layers.fp8 import Fp8Weight from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, ) +from text_generation_server.utils.weights import ( + DefaultWeightsLoader, + UnquantizedWeight, + Weights, +) if SYSTEM == "rocm": try: @@ -105,6 +111,19 @@ def load_attention(config, prefix: str, weights, layer_id): ) +@contextmanager +def no_fp8(weights: Weights): + weights_loader = weights.weights_loader + if ( + isinstance(weights_loader, DefaultWeightsLoader) + and weights_loader.weight_class is Fp8Weight + ): + weights_loader = DefaultWeightsLoader(UnquantizedWeight) + + with weights.use_loader(weights_loader): + yield + + class FlashLlamaAttention(torch.nn.Module): def __init__( self, @@ -330,12 +349,15 @@ class LlamaMLP(nn.Module): class FlashLlamaLayer(nn.Module): def __init__(self, index, prefix, config, weights): super().__init__() - self.self_attn = FlashLlamaAttention( - index=index, - prefix=f"{prefix}.self_attn", - config=config, - weights=weights, - ) + + with no_fp8(weights): + self.self_attn = FlashLlamaAttention( + index=index, + prefix=f"{prefix}.self_attn", + config=config, + weights=weights, + ) + self.mlp = LlamaMLP( prefix=f"{prefix}.mlp", config=config, weights=weights, index=index ) @@ -470,23 +492,27 @@ class FlashLlamaForCausalLM(torch.nn.Module): def __init__(self, prefix: str, config, weights): super().__init__() - self.embed_tokens = TensorParallelEmbedding( - prefix=( - "model.embed_tokens" if not prefix else f"{prefix}.model.embed_tokens" - ), - weights=weights, - ) + with no_fp8(weights): + self.embed_tokens = TensorParallelEmbedding( + prefix=( + "model.embed_tokens" + if not prefix + else f"{prefix}.model.embed_tokens" + ), + weights=weights, + ) self.model = FlashLlamaModel(prefix, config, weights) if config.tie_word_embeddings: suffix = "model.embed_tokens" else: suffix = "lm_head" - self.lm_head = SpeculativeHead.load( - config, - prefix=suffix if not prefix else f"{prefix}.{suffix}", - weights=weights, - ) + with no_fp8(weights): + self.lm_head = SpeculativeHead.load( + config, + prefix=suffix if not prefix else f"{prefix}.{suffix}", + weights=weights, + ) def forward( self, diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index 49c0e903..a1e36fc7 100644 --- 
a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -149,9 +149,7 @@ def _load_gqa(config, prefix: str, weights): config.hidden_size, ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" - return TensorParallelColumnLinear( - get_linear(weight, bias=None, quantize=config.quantize) - ) + return TensorParallelColumnLinear(get_linear(weight, bias=None)) def _load_experts(config, prefix: str, mat, weights): diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index 85dcb2a6..99664230 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -56,7 +56,7 @@ def load_row(config, prefix: str, weights, bias: bool): else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) if config.use_parallel_residual: return linear else: @@ -81,7 +81,7 @@ def load_qkv(config, prefix: str, weights, num_heads, head_size, hidden_size): bias = weights.get_sharded(f"{prefix}.bias", dim=0) bias = bias.view(num_heads, 3, head_size).permute(1, 0, 2).reshape(-1) - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) if config.use_parallel_residual: return linear else: diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index 6c508264..a1ce03b9 100644 --- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -100,9 +100,7 @@ def _load_gqa(config, prefix: str, weights): ], f"{list(weight.shape)} != {[(num_heads + 2 * config.num_key_value_heads) * head_size, config.hidden_size]}" # this is the same as llama except for Phi uses bias=True - return TensorParallelColumnLinear( - get_linear(weight, bias=True, quantize=config.quantize) - ) + return TensorParallelColumnLinear(get_linear(weight, bias=True)) class FlashPhiAttention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index 65b40fed..d7cad480 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -31,7 +31,7 @@ def load_row(config, prefix: str, weights, bias: bool): else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) if config.parallel_attn: return linear else: diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 77b9d49c..2b939a10 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -105,6 +105,7 @@ def _load_multi_mqa_gptq( g_idx=g_idx, bits=loader.bits, groupsize=loader.groupsize, + use_awq_kernel=loader.quantize == "awq", use_exllama=HAS_EXLLAMA, ) @@ -121,7 +122,7 @@ def _load_multi_mqa_gptq( bias = 
torch.cat([q_tensor, kv_tensor], dim=0) bias = bias.to(device=weights.device) - return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + return TensorParallelColumnLinear(get_linear(weight, bias)) else: raise NotImplementedError("Gptq loading with santacoder is not implemented") @@ -193,7 +194,7 @@ def _load_multi_mqa( assert list(bias.shape) == [ (num_heads + 2) * head_size ], f"{weight.shape} != {[(num_heads + 2) * head_size]}" - return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + return TensorParallelColumnLinear(get_linear(weight, bias)) def load_col(config, prefix: str, weights, bias: bool): @@ -206,7 +207,7 @@ def load_col(config, prefix: str, weights, bias: bool): bias = weights.get_sharded(f"{prefix}.bias", dim=0) else: bias = None - return TensorParallelColumnLinear(get_linear(weight, bias, config.quantize)) + return TensorParallelColumnLinear(get_linear(weight, bias)) def load_row(config, prefix: str, weights, bias: bool): @@ -221,7 +222,7 @@ def load_row(config, prefix: str, weights, bias: bool): else: bias = None return TensorParallelRowLinear( - get_linear(weight, bias, config.quantize), process_group=weights.process_group + get_linear(weight, bias), process_group=weights.process_group ) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 19556f78..89471955 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -149,9 +149,7 @@ def _load_gqa(config, prefix: str, weights): else: bias = None - return TensorParallelColumnLinear( - get_linear(weight, bias=bias, quantize=config.quantize) - ) + return TensorParallelColumnLinear(get_linear(weight, bias=bias)) class Starcoder2Attention(torch.nn.Module): diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py index daf3329a..735c3899 100644 --- a/server/text_generation_server/models/custom_modeling/idefics2.py +++ b/server/text_generation_server/models/custom_modeling/idefics2.py @@ -34,7 +34,7 @@ from text_generation_server.layers import ( TensorParallelEmbedding, TensorParallelRowLinear, ) -from text_generation_server.utils.weights import DefaultWeightsLoader +from text_generation_server.utils.weights import DefaultWeightsLoader, UnquantizedWeight def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -698,7 +698,7 @@ class Idefics2ForConditionalGeneration(nn.Module): self.dtype = weights.dtype # The vision and connector models are not quantized. 
- with weights.use_loader(DefaultWeightsLoader()): + with weights.use_loader(DefaultWeightsLoader(UnquantizedWeight)): self.vision_model = Idefics2VisionTransformer( prefix=( f"{prefix}.model.vision_model" if prefix else "model.vision_model" @@ -707,16 +707,12 @@ class Idefics2ForConditionalGeneration(nn.Module): weights=weights, ) - quantize = config.quantize - try: - config.quantize = None - self.connector = Idefics2Connector( - prefix=f"{prefix}.model.connector" if prefix else "model.connector", - config=config, - weights=weights, - ) - finally: - config.quantize = quantize + config.quantize = None + self.connector = Idefics2Connector( + prefix=f"{prefix}.model.connector" if prefix else "model.connector", + config=config, + weights=weights, + ) self.config = config self.image_seq_len = config.perceiver_config.resampler_n_latents diff --git a/server/text_generation_server/models/custom_modeling/mpt_modeling.py b/server/text_generation_server/models/custom_modeling/mpt_modeling.py index fb09a8f1..bbe9f45a 100644 --- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py +++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py @@ -75,7 +75,7 @@ def load_col(config, prefix, weights, bias): bias = bias.to(device=weights.device) else: bias = None - linear = get_linear(weight, bias, config.quantize) + linear = get_linear(weight, bias) return TensorParallelColumnLinear(linear) diff --git a/server/text_generation_server/utils/quantization.py b/server/text_generation_server/utils/quantization.py index 07975bea..e8e22db8 100644 --- a/server/text_generation_server/utils/quantization.py +++ b/server/text_generation_server/utils/quantization.py @@ -1,11 +1,14 @@ -from typing import Optional -import os import json +import os from dataclasses import dataclass +from typing import Optional from huggingface_hub import hf_hub_download - -from text_generation_server.utils.weights import DefaultWeightsLoader, WeightsLoader +from text_generation_server.utils.weights import ( + DefaultWeightsLoader, + UnquantizedWeight, + WeightsLoader, +) @dataclass @@ -104,10 +107,30 @@ def get_loader( quantize=quantize, sym=quantizer_config.sym, ) + elif quantize == "bitsandbytes": + from text_generation_server.layers.bnb import BNBWeight + + return DefaultWeightsLoader(BNBWeight) + elif quantize == "bitsandbytes-fp4": + from text_generation_server.layers.bnb import BNBFP4Weight + + return DefaultWeightsLoader(BNBFP4Weight) + elif quantize == "bitsandbytes-nf4": + from text_generation_server.layers.bnb import BNBNF4Weight + + return DefaultWeightsLoader(BNBNF4Weight) + elif quantize == "eetq": + from text_generation_server.layers.eetq import EETQWeight + + return DefaultWeightsLoader(EETQWeight) elif quantize == "exl2": from text_generation_server.layers.exl2 import Exl2WeightsLoader return Exl2WeightsLoader() + elif quantize == "fp8": + from text_generation_server.layers.fp8 import Fp8Weight + + return DefaultWeightsLoader(Fp8Weight) elif quantize == "marlin": from text_generation_server.layers.marlin import MarlinWeightsLoader @@ -115,5 +138,7 @@ def get_loader( bits=quantizer_config.bits, is_marlin_24=quantizer_config.checkpoint_format == "marlin_24", ) + elif quantize is None: + return DefaultWeightsLoader(UnquantizedWeight) else: - return DefaultWeightsLoader() + raise ValueError(f"Unknown quantization method: {quantize}") diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index b530af23..6876b700 100644 --- 
a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -1,9 +1,13 @@ from abc import ABC, abstractmethod from contextlib import contextmanager +from dataclasses import dataclass +from enum import Enum, auto from pathlib import Path from typing import Dict, List, Optional, Union -from safetensors import safe_open + import torch +from safetensors import safe_open +from text_generation_server.utils.import_utils import SYSTEM class WeightsLoader(ABC): @@ -62,7 +66,39 @@ class WeightsLoader(ABC): ... +class Weight(ABC): + """Instances of this type implement unquantized/quantized/to-be + quantized weights.""" + + @abstractmethod + def get_linear(self, bias: torch.Tensor): + """Create a linear layer from this weight.""" + ... + + +@dataclass +class UnquantizedWeight: + weight: torch.Tensor + + def get_linear(self, bias: torch.Tensor): + from text_generation_server.layers.linear import FastLinear, FastLinearROCm + + if SYSTEM == "rocm": + return FastLinearROCm(self.weight, bias) + else: + return FastLinear(self.weight, bias) + + class DefaultWeightsLoader(WeightsLoader): + """Weight loader that loads (unquantized) Torch tensors.""" + + def __init__(self, weight_class): + """Create a loader. Weights will be wrapped using the given `weights_class`, + normally this will be `UnquantizedWeight`, but a quantizer-specific class + such as `Fp8Weight` can be used to quantize the weights during loading. + """ + self.weight_class = weight_class + """ Loader that uses tensors as-is with the exception of applying sharding and/or concatenation. @@ -74,16 +110,21 @@ class DefaultWeightsLoader(WeightsLoader): prefix: str, block_sizes: Union[int, List[int]], ): - return weights.get_packed_sharded( - f"{prefix}.weight", dim=0, block_sizes=block_sizes + + return self.weight_class( + weights.get_packed_sharded( + f"{prefix}.weight", dim=0, block_sizes=block_sizes + ), ) def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int): w = [weights.get_sharded(f"{p}.weight", dim=0) for p in prefixes] - return torch.cat(w, dim=dim) + return self.weight_class(torch.cat(w, dim=dim)) def get_weights_row(self, weights: "Weights", prefix: str): - return weights.get_sharded(f"{prefix}.weight", dim=1) + return self.weight_class( + weights.get_sharded(f"{prefix}.weight", dim=1), + ) class Weights:
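
As a usage sketch of the pattern this refactor introduces: each quantization scheme now ships a `Weight` dataclass whose `get_linear(bias)` builds the matching linear module, `get_loader` maps the `--quantize` value to a weights loader, and `get_linear(weight, bias)` in `layers/linear.py` simply dispatches on the weight type. A minimal, hypothetical example of plugging a new load-time quantizer into this API is sketched below; `Int4Weight`, `Int4Linear`, and the loader variable are invented names for illustration, while `Weight`, `DefaultWeightsLoader`, and `get_linear` are the interfaces added in this diff.

from dataclasses import dataclass

import torch

from text_generation_server.utils.weights import DefaultWeightsLoader, Weight


class Int4Linear(torch.nn.Module):
    """Toy stand-in for a real quantized linear module."""

    def __init__(self, weight: torch.Tensor, bias):
        super().__init__()
        # A real implementation would quantize `weight` here; the toy keeps it as-is.
        self.weight = weight
        self.bias = bias

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.linear(x, self.weight, self.bias)


@dataclass
class Int4Weight(Weight):
    # Same shape as the BNBWeight/EETQWeight/Fp8Weight classes above: wrap the
    # raw tensor and defer building the quantized linear layer to get_linear.
    weight: torch.Tensor

    def get_linear(self, bias: torch.Tensor):
        return Int4Linear(self.weight, bias)


# Mirroring the bitsandbytes/eetq/fp8 branches added to get_loader() in this
# diff, the scheme would be wired up through the generic loader; model code is
# untouched because get_linear(weight, bias) dispatches on the weight type.
int4_loader = DefaultWeightsLoader(Int4Weight)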