From 4c3f8a70a1c8590851aa3d7c82a7cabf01ed6e87 Mon Sep 17 00:00:00 2001 From: drbh Date: Mon, 12 Aug 2024 11:24:32 -0400 Subject: [PATCH] fix: allocate tmp based on sgmv kernel if available (#2345) * fix: allocate tmp based on sgmv kernel if available * fix: re add copy build artifacts step for punica kernels --- Dockerfile | 2 ++ server/text_generation_server/utils/sgmv.py | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index c68f76f6..458ff699 100644 --- a/Dockerfile +++ b/Dockerfile @@ -226,6 +226,8 @@ COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-31 COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy build artifacts from eetq kernels builder COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages +# Copy build artifacts from lorax punica kernels builder +COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages # Copy build artifacts from fbgemm builder COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages # Copy build artifacts from vllm builder diff --git a/server/text_generation_server/utils/sgmv.py b/server/text_generation_server/utils/sgmv.py index e0aec25f..2d0a73a5 100644 --- a/server/text_generation_server/utils/sgmv.py +++ b/server/text_generation_server/utils/sgmv.py @@ -151,13 +151,17 @@ def get_tmp_expand_size(size: int) -> int: def get_tmp_tensors( nsegments: int, lora_rank: int, device: torch.device ) -> Tuple[torch.Tensor, torch.Tensor]: - if use_cutlass_shrink(lora_rank) and has_sgmv(): + use_cutlass = use_cutlass_shrink(lora_rank) and has_sgmv() + has_sgmv_available = has_sgmv() + + if use_cutlass: tmp = get_tmp_tensor_for_size(nsegments, device) return tmp, tmp + elif has_sgmv_available: + return get_tmp_tensor(device), get_tmp_tensor_for_size(nsegments, device) else: - tmp_shrink = get_tmp_tensor(device) - tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device) - return tmp_shrink, tmp_expand + tmp = get_tmp_tensor_for_size(nsegments, device) + return tmp, tmp def lora_a_sgmv_cutlass(