fix: add lora kernel to dockerfile, support running without kernels and refactors
This commit is contained in:
parent
d6cf63ca53
commit
aa88c4fd3a
|
@ -144,6 +144,13 @@ COPY server/Makefile-marlin Makefile
|
|||
# Build specific version of marlin kernels
|
||||
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-marlin
|
||||
|
||||
# Build Lorax Punica kernels
|
||||
FROM kernel-builder as lorax-punica-builder
|
||||
WORKDIR /usr/src
|
||||
COPY server/Makefile-lorax-punica Makefile
|
||||
# Build specific version of the Lorax Punica kernels
|
||||
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-lorax-punica
|
||||
|
||||
# Build Transformers CUDA kernels
|
||||
FROM kernel-builder as custom-kernels-builder
|
||||
WORKDIR /usr/src
|
||||
|
@ -214,6 +221,7 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
|
|||
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
# Copy build artifacts from marlin kernels builder
|
||||
COPY --from=marlin-kernels-builder /usr/src/marlin/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
||||
# Copy build artifacts from vllm builder
|
||||
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
lorax_punica_commit := c71861a653412267dc27ec86013dd945ce3474bc
|
||||
|
||||
lorax-punica: install-lorax-punica
|
||||
git clone --no-checkout https://github.com/predibase/lorax.git
|
||||
build-lorax-punica:
|
||||
if [ ! -d 'lorax-punica' ]; then \
|
||||
git clone --no-checkout https://github.com/predibase/lorax.git lorax-punica; \
|
||||
fi
|
||||
cd lorax-punica && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
|
||||
cd lorax-punica && git submodule update --init --recursive
|
||||
cd lorax-punica/server/punica_kernels && python setup.py build
|
||||
|
||||
install-lorax-punica:
|
||||
cd lorax && git sparse-checkout set server/punica_kernels && git checkout $(lorax_punica_commit)
|
||||
cd lorax && git submodule update --init --recursive
|
||||
cd lorax/server/punica_kernels && python setup.py install
|
||||
install-lorax-punica: build-lorax-punica
|
||||
cd lorax-punica/server/punica_kernels && python setup.py install
|
||||
|
|
|
@ -90,18 +90,6 @@ class Model(ABC):
|
|||
self.loaded_adapters = set()
|
||||
self.static_adapter_id = adapter_id
|
||||
|
||||
# TODO: review moving adapter loading to the model
|
||||
if adapter_id and adapter_id != BASE_MODEL_ADAPTER_ID:
|
||||
pass
|
||||
# download_adapter(adapter_id, adapter_source, api_token=None)
|
||||
# self.load_adapter(
|
||||
# AdapterParameters(adapter_ids=[adapter_id]),
|
||||
# adapter_source,
|
||||
# adapter_index=0,
|
||||
# api_token=None,
|
||||
# dynamic=False,
|
||||
# )
|
||||
|
||||
if speculate is None:
|
||||
speculate = get_speculate()
|
||||
self.speculate = speculate
|
||||
|
|
|
@ -136,6 +136,10 @@ def get_tmp_tensor_for_size(size: int, device: torch.device) -> torch.Tensor:
|
|||
return torch.empty((tmp_size,), dtype=torch.uint8, device=device)
|
||||
|
||||
|
||||
def get_tmp_tensor_for_size_no_kernels(size: int, device: torch.device) -> torch.Tensor:
|
||||
return torch.empty((size,), dtype=torch.uint8, device=device)
|
||||
|
||||
|
||||
def get_tmp_expand_size(size: int) -> int:
|
||||
return _kernels.sgmv_cutlass_tmp_size(size)
|
||||
|
||||
|
@ -143,12 +147,12 @@ def get_tmp_expand_size(size: int) -> int:
|
|||
def get_tmp_tensors(
|
||||
nsegments: int, lora_rank: int, device: torch.device
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
if use_cutlass_shrink(lora_rank):
|
||||
if use_cutlass_shrink(lora_rank) and has_sgmv():
|
||||
tmp = get_tmp_tensor_for_size(nsegments, device)
|
||||
return tmp, tmp
|
||||
else:
|
||||
tmp_shrink = get_tmp_tensor(device)
|
||||
tmp_expand = get_tmp_tensor_for_size(nsegments, device)
|
||||
tmp_expand = get_tmp_tensor_for_size_no_kernels(nsegments, device)
|
||||
return tmp_shrink, tmp_expand
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue