fix: add LoRA kernel to Dockerfile, support running without kernels, and refactor

This commit is contained in:
drbh 2024-06-14 00:35:07 +00:00
parent d6cf63ca53
commit aa88c4fd3a
4 changed files with 23 additions and 20 deletions

View File

@ -144,6 +144,13 @@ COPY server/Makefile-marlin Makefile
# Build specific version of marlin kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-marlin
# Build Lorax Punica kernels
FROM kernel-builder as lorax-punica-builder
WORKDIR /usr/src
COPY server/Makefile-lorax-punica Makefile
# Build specific version of punica kernels
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
WORKDIR /usr/src
@ -214,6 +221,7 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from marlin kernels builder
COPY --from=marlin-kernels-builder /usr/src/marlin/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages

View File

@ -1,9 +1,12 @@
lorax_punica_commit := c71861a653412267dc27ec86013dd945ce3474bc
lorax-punica: install-lorax-punica
git clone --no-checkout https://github.com/predibase/lorax.git
build-lorax-punica:
if [ ! -d 'lorax-punica' ]; then \
git clone --no-checkout https://github.com/predibase/lorax.git lorax-punica; \
fi
cd lorax-punica && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
cd lorax-punica && git submodule update --init --recursive
cd lorax-punica/server/punica_kernels && python setup.py build
install-lorax-punica:
cd lorax && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
cd lorax && git submodule update --init --recursive
cd lorax/server/punica_kernels && python setup.py install
install-lorax-punica: build-lorax-punica
cd lorax-punica/server/punica_kernels && python setup.py install

View File

@ -90,18 +90,6 @@ class Model(ABC):
self.loaded_adapters = set()
self.static_adapter_id = adapter_id
# TODO: review moving adapter loading to the model
if adapter_id and adapter_id != BASE_MODEL_ADAPTER_ID:
pass
# download_adapter(adapter_id, adapter_source, api_token=None)
# self.load_adapter(
# AdapterParameters(adapter_ids=[adapter_id]),
# adapter_source,
# adapter_index=0,
# api_token=None,
# dynamic=False,
# )
if speculate is None:
speculate = get_speculate()
self.speculate = speculate

View File

@ -136,6 +136,10 @@ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
return torch.empty((size,), dtype=torch.uint8, device=device)
def get_tmp_expand_size(size: int) -> int:
return _kernels.sgmv_cutlass_tmp_size(size)
@ -143,12 +147,12 @@ def get_tmp_expand_size(size: int) -> int:
def get_tmp_tensors(
nsegments: int, lora_rank: int, device: torch.device
) -> Tuple[torch.Tensor, torch.Tensor]:
if use_cutlass_shrink(lora_rank):
if use_cutlass_shrink(lora_rank) and has_sgmv():
tmp = get_tmp_tensor_for_size(nsegments, device)
return tmp, tmp
else:
tmp_shrink = get_tmp_tensor(device)
tmp_expand = get_tmp_tensor_for_size(nsegments, device)
tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)
return tmp_shrink, tmp_expand