From c9047667ad1bb5e95911b43fb853b8368ca2b18d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 14 Aug 2024 08:49:58 +0200 Subject: [PATCH] Upgrading exl2. --- .gitignore | 2 +- Dockerfile | 6 +++--- flake.nix | 1 + server/Makefile | 1 + server/Makefile-exllamav2 | 12 ++++++++++++ .../layers/gptq/exllamav2.py | 19 +++++++++++++++---- .../models/causal_lm.py | 1 + .../models/flash_causal_lm.py | 1 + .../models/seq2seq_lm.py | 1 + server/text_generation_server/server.py | 6 +++--- 10 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 server/Makefile-exllamav2 diff --git a/.gitignore b/.gitignore index bd9d9125..f79d8faa 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ backends/client/src/v3/pb # ROCm auto-generated files *.hip -server/exllamav2_kernels/exllamav2_kernels/hip/ +server/exllamav2 server/exllama_kernels/exllama_kernels/hip/ server/exllama_kernels/exllama_kernels/hip_func/ *_hip.cuh diff --git a/Dockerfile b/Dockerfile index 458ff699..74e7d990 100644 --- a/Dockerfile +++ b/Dockerfile @@ -123,10 +123,10 @@ RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build # Build Transformers exllama kernels FROM kernel-builder AS exllamav2-kernels-builder WORKDIR /usr/src -COPY server/exllamav2_kernels/ . +COPY server/Makefile-exllamav2/ Makefile # Build specific version of transformers -RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build +RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-exllamav2 # Build Transformers awq kernels FROM kernel-builder AS awq-kernels-builder @@ -221,7 +221,7 @@ COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 / # Copy build artifacts from exllama kernels builder COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy build artifacts from exllamav2 kernels builder -COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy build artifacts from awq kernels builder COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy build artifacts from eetq kernels builder diff --git a/flake.nix b/flake.nix index e1f44212..229184d2 100644 --- a/flake.nix +++ b/flake.nix @@ -93,6 +93,7 @@ causal-conv1d click einops + exllamav2 fbgemm-gpu flashinfer flash-attn diff --git a/server/Makefile b/server/Makefile index 209fc44e..51ea8b32 100644 --- a/server/Makefile +++ b/server/Makefile @@ -6,6 +6,7 @@ include Makefile-eetq include Makefile-selective-scan include Makefile-lorax-punica include Makefile-fbgemm +include Makefile-exllamav2 unit-tests: pytest -s -vv -m "not private" tests diff --git a/server/Makefile-exllamav2 b/server/Makefile-exllamav2 new file mode 100644 index 00000000..0d4cc385 --- /dev/null +++ b/server/Makefile-exllamav2 @@ -0,0 +1,12 @@ +exllamav2_commit := v0.1.8 + +build-exllamav2: + git clone https://github.com/turboderp/exllamav2.git exllamav2 && \ + cd exllamav2 && git fetch && git checkout $(exllamav2_commit) && \ + git submodule update --init --recursive && \ + pip install -r requirements.txt && \ + CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py build + +install-exllamav2: build-exllamav2 + cd exllamav2/ && \ + 
CUDA_ARCH_LIST="8.0;9.0a" NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90a,code=sm_90a" TORCH_CUDA_ARCH_LIST="8.0;9.0a" python setup.py install diff --git a/server/text_generation_server/layers/gptq/exllamav2.py b/server/text_generation_server/layers/gptq/exllamav2.py index dc3b832f..4bd2f63f 100644 --- a/server/text_generation_server/layers/gptq/exllamav2.py +++ b/server/text_generation_server/layers/gptq/exllamav2.py @@ -12,7 +12,10 @@ from text_generation_server.layers.gptq import GPTQWeight from text_generation_server.utils.log import log_master try: - from exllamav2_kernels import make_q_matrix, gemm_half_q_half + from exllamav2.ext import exllamav2_ext + + make_q_matrix = exllamav2_ext.make_q_matrix + gemm_half_q_half = exllamav2_ext.gemm_half_q_half except ImportError: log_master(logger.warning, "exllamav2_kernels not installed.") raise @@ -70,6 +73,10 @@ def ext_make_q_matrix( """ Create Q matrix """ + # max_dq_size = 512*(1024**2) + # max_dq_rows = max_dq_size // out_features[0] + max_dq_rows = 0 + # EXL2 if isinstance(w, Exl2Weight): extra.q_group_map = make_group_map(w.q_groups, w.q_weight.shape[0]) @@ -83,10 +90,12 @@ def ext_make_q_matrix( w.q_scale_max, w.q_groups, extra.q_group_map, - none_tensor, - none_tensor, - none_tensor, + none_tensor, # zeros + none_tensor, # scales + none_tensor, # g_idx + none_tensor, # bias temp_dq, + max_dq_rows, ) # GPTQ elif isinstance(w, GPTQWeight): @@ -114,6 +123,7 @@ def ext_make_q_matrix( w.scales, w.g_idx.cpu(), temp_dq, + max_dq_rows, ) # GPTQ without g_idx else: @@ -129,6 +139,7 @@ def ext_make_q_matrix( w.scales, none_tensor, temp_dq, + max_dq_rows, ) else: RuntimeError("Cannot create handle") diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index 212ab7a9..ba168b13 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -511,6 +511,7 @@ class CausalLM(Model): config_class=AutoConfig, batch_class=CausalLMBatch, ): + self.quantize = quantize self.batch_class = batch_class self.process_group, rank, world_size = initialize_torch_distributed() if torch.cuda.is_available(): diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 42d93a12..5e2fd20a 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -872,6 +872,7 @@ class FlashCausalLM(Model): head_size: Optional[int] = None, skip_special_tokens: bool = True, ): + self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index 79c001b0..3c92128a 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -553,6 +553,7 @@ class Seq2SeqLM(Model): tokenizer_class=AutoTokenizer, aliases=None, ): + self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index b92ab572..22871ec5 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -50,12 +50,12 @@ class 
TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): self, model: Model, cache: Cache, - quantize: Optional[str], server_urls: List[str], ): self.cache = cache self.model = model - self.quantize = quantize + # Quantize is resolved during model loading + self.quantize = model.quantize self.server_urls = server_urls # For some reason, inference_mode does not work well with GLOO which we use on CPU if model.device.type == "cuda": @@ -255,7 +255,7 @@ def serve( ], ) generate_pb2_grpc.add_TextGenerationServiceServicer_to_server( - TextGenerationService(model, Cache(), quantize, server_urls), server + TextGenerationService(model, Cache(), server_urls), server ) SERVICE_NAMES = ( generate_pb2.DESCRIPTOR.services_by_name["TextGenerationService"].full_name,
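
Note on the kernel change (reviewer note, not part of the patch itself): the GPTQ/EXL2 layer now imports its kernels from the upstream exllamav2 package, pinned to v0.1.8 and built through the new server/Makefile-exllamav2, instead of the vendored exllamav2_kernels extension that the .gitignore and Dockerfile stop referencing. The ext_make_q_matrix calls also gain an extra none_tensor (commented as bias) in the EXL2 branch and a trailing max_dq_rows argument, currently hard-coded to 0 with the size-based computation left commented out. The snippet below is a minimal post-install sanity check that mirrors the import path the patch uses; the assertions and print are illustrative only and assume the kernels were installed via the new make target (e.g. make install-exllamav2 from server/).

# Sanity-check sketch: confirm the upstream exllamav2 extension exposes the
# two entry points the patched layer relies on. Run inside the server's
# Python environment after building the kernels.
try:
    from exllamav2.ext import exllamav2_ext  # same import path as the patch
except ImportError as err:
    raise SystemExit(f"exllamav2 is not installed or its kernels were not built: {err}")

for name in ("make_q_matrix", "gemm_half_q_half"):
    assert hasattr(exllamav2_ext, name), f"missing kernel entry point: {name}"
print("exllamav2 kernels available")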
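
The model-side changes are mechanical but worth flagging: CausalLM, FlashCausalLM and Seq2SeqLM now store self.quantize in their constructors, and TextGenerationService drops its quantize parameter in favour of reading model.quantize, so the servicer is constructed without the flag in serve(). Below is a stripped-down sketch of that plumbing using stand-in classes, not the real TGI signatures beyond what the diff shows.

# Illustrative stand-ins only: the real classes live in
# text_generation_server.models and text_generation_server.server.
from typing import List, Optional


class Model:
    def __init__(self, quantize: Optional[str]):
        # Resolved once, during model loading, as in the patch.
        self.quantize = quantize


class TextGenerationService:
    def __init__(self, model: Model, cache: dict, server_urls: List[str]):
        self.cache = cache
        self.model = model
        # Quantize is resolved during model loading
        self.quantize = model.quantize
        self.server_urls = server_urls


# The servicer no longer needs the flag passed in explicitly.
service = TextGenerationService(Model(quantize="exl2"), cache={}, server_urls=[])
assert service.quantize == "exl2"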