Merge branch 'main' into moe

commit e0e39fa0d9

Dockerfile | 16
@@ -161,18 +161,6 @@ COPY server/custom_kernels/ .
# Build specific version of transformers
RUN python setup.py build

# Build vllm CUDA kernels
FROM kernel-builder AS vllm-builder

WORKDIR /usr/src

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

COPY server/Makefile-vllm Makefile

# Build specific version of vllm
RUN make build-vllm-cuda

# Build mamba kernels
FROM kernel-builder AS mamba-builder
WORKDIR /usr/src
@@ -230,8 +218,6 @@ COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86
COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from lorax punica kernels builder
COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from vllm builder
COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
# Copy build artifacts from mamba builder
COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
@@ -247,7 +233,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements_cuda.txt && \
    pip install ".[bnb, accelerate, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install ".[attention, bnb, accelerate, compressed-tensors, marlin, moe, quantize, peft, outlines]" --no-cache-dir && \
    pip install nvidia-nccl-cu12==2.22.3

ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
@@ -296,7 +296,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements_rocm.txt && \
    pip install ".[accelerate, peft, outlines]" --no-cache-dir
    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

# Install benchmarker
COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/local/bin/text-generation-benchmark
@@ -117,7 +117,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \
    make gen-server && \
    pip install -r requirements_intel.txt && \
    pip install ".[accelerate, peft, outlines]" --no-cache-dir
    pip install ".[accelerate, compressed-tensors, peft, outlines]" --no-cache-dir

ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
@@ -180,7 +180,7 @@ async fn prefill(
    let latency = start_time.elapsed();

    // Compute throughput from latency and batch size
    let throughput = batch_size as f64 / latency.as_secs_f64();
    let throughput = (batch_size * sequence_length) as f64 / latency.as_secs_f64();

    // Decode batch cannot be empty
    let decode_batch = decode_batch.expect("decode_batch is None. This is a bug.");
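The changed line above switches the prefill throughput metric from requests per second to prefilled tokens per second. A minimal standalone sketch of the difference, with made-up numbers (batch size, sequence length, and latency here are illustrative assumptions, not benchmark output):

```rust
fn main() {
    // Illustrative values only; the real benchmark measures latency at runtime.
    let batch_size: u32 = 32;
    let sequence_length: u32 = 512;
    let latency_secs: f64 = 0.8;

    // Old metric: requests completed per second.
    let requests_per_sec = batch_size as f64 / latency_secs; // 40 req/s

    // New metric: prefilled tokens per second, crediting every token
    // of every sequence in the batch.
    let tokens_per_sec = (batch_size * sequence_length) as f64 / latency_secs; // 20480 tok/s

    println!("{requests_per_sec} req/s vs {tokens_per_sec} tok/s");
}
```

Counting tokens rather than requests makes prefill numbers comparable across runs with different sequence lengths.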
@@ -36,8 +36,11 @@
"content": {
  "application/json": {
    "schema": {
      "type": "array",
      "items": {
        "$ref": "#/components/schemas/GenerateResponse"
      }
    }
  },
  "text/event-stream": {
    "schema": {
@@ -63,6 +63,7 @@ Options:

  Possible values:
  - awq: 4 bit quantization. Requires a specific AWQ quantized model: <https://hf.co/models?search=awq>. Should replace GPTQ models wherever possible because of the better latency
  - compressed-tensors: Compressed tensors, which can be a mixture of different quantization methods
  - eetq: 8 bit quantization, doesn't require specific model. Should be a drop-in replacement to bitsandbytes with much better performance. Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
  - exl2: Variable bit quantization. Requires a specific EXL2 quantized model: <https://hf.co/models?search=exl2>. Requires exllama2 kernels and does not support tensor parallelism (num_shard > 1)
  - gptq: 4 bit quantization. Requires a specific GPTQ quantized model: <https://hf.co/models?search=gptq>. text-generation-inference will use exllama (faster) kernels wherever possible, and use triton kernel (wider support) when it's not. AWQ has faster kernels
flake.lock | 12
@@ -718,11 +718,11 @@
},
"nixpkgs_6": {
  "locked": {
    "lastModified": 1727675176,
    "narHash": "sha256-xIjBFMYldWvj+g8ahxMPofsj+OqxvKJN6YylNHQ7gn4=",
    "lastModified": 1731562571,
    "narHash": "sha256-9V0C/H6NL2Vk3Y76msqNA8TgwZ6Ge4frOVawTNFJQmM=",
    "owner": "nixos",
    "repo": "nixpkgs",
    "rev": "a6d0207fea9212d28cd3d487efe6bc699663b93a",
    "rev": "19d66fab291f90ce56d0479b128cc7a5271bf666",
    "type": "github"
  },
  "original": {
@@ -978,11 +978,11 @@
  "nixpkgs": "nixpkgs_6"
},
"locked": {
  "lastModified": 1730724647,
  "narHash": "sha256-SVv+50CGaCoU4zZwsg6ZAaOi/D5QJBL1P2SIB+3CEf4=",
  "lastModified": 1731674227,
  "narHash": "sha256-k/ur37KSc+RXcwwz0tgxeamz6wQ5rsOe5hMepzIdD2s=",
  "owner": "huggingface",
  "repo": "text-generation-inference-nix",
  "rev": "1512898a1e5ad9eff025205fa9c4d33a44506cf3",
  "rev": "407b9e22a0b7121bf6e171d67ce0144e3f3e39bf",
  "type": "github"
},
"original": {
@@ -0,0 +1,104 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 128000,
        "logprob": null,
        "text": "<|begin_of_text|>"
      },
      {
        "id": 3923,
        "logprob": -7.609375,
        "text": "What"
      },
      {
        "id": 374,
        "logprob": -0.92529297,
        "text": " is"
      },
      {
        "id": 5655,
        "logprob": -10.0,
        "text": " deep"
      },
      {
        "id": 6975,
        "logprob": -0.94628906,
        "text": " learning"
      },
      {
        "id": 30,
        "logprob": -2.9042969,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 18682,
        "logprob": -0.8769531,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 6975,
        "logprob": -0.0076942444,
        "special": false,
        "text": " learning"
      },
      {
        "id": 374,
        "logprob": -0.25073242,
        "special": false,
        "text": " is"
      },
      {
        "id": 264,
        "logprob": -0.097595215,
        "special": false,
        "text": " a"
      },
      {
        "id": 955,
        "logprob": -0.921875,
        "special": false,
        "text": " type"
      },
      {
        "id": 315,
        "logprob": -0.00027918816,
        "special": false,
        "text": " of"
      },
      {
        "id": 21075,
        "logprob": -0.5527344,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 11478,
        "logprob": -0.042541504,
        "special": false,
        "text": " intelligence"
      },
      {
        "id": 320,
        "logprob": -0.38891602,
        "special": false,
        "text": " ("
      },
      {
        "id": 15836,
        "logprob": -0.0011043549,
        "special": false,
        "text": "AI"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " Deep learning is a type of artificial intelligence (AI"
}
@@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 128000,
        "logprob": null,
        "text": "<|begin_of_text|>"
      },
      {
        "id": 3923,
        "logprob": -7.609375,
        "text": "What"
      },
      {
        "id": 374,
        "logprob": -0.92529297,
        "text": " is"
      },
      {
        "id": 5655,
        "logprob": -10.0,
        "text": " deep"
      },
      {
        "id": 6975,
        "logprob": -0.94628906,
        "text": " learning"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 5380,
        "logprob": -0.23840332,
        "special": false,
        "text": "?\n"
      },
      {
        "id": 34564,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6975,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      },
      {
        "id": 11,
        "logprob": 0.0,
        "special": false,
        "text": ","
      },
      {
        "id": 1101,
        "logprob": -1.2011719,
        "special": false,
        "text": " also"
      },
      {
        "id": 3967,
        "logprob": 0.0,
        "special": false,
        "text": " known"
      },
      {
        "id": 439,
        "logprob": 0.0,
        "special": false,
        "text": " as"
      },
      {
        "id": 30828,
        "logprob": 0.0,
        "special": false,
        "text": " neural"
      },
      {
        "id": 4009,
        "logprob": -0.6777344,
        "special": false,
        "text": " network"
      },
      {
        "id": 477,
        "logprob": 0.0,
        "special": false,
        "text": " or"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\nDeep learning, also known as neural network or"
}
@@ -0,0 +1,418 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.609375,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92529297,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94628906,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.9042969,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.8769531,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.0076942444,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25146484,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.097595215,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.9248047,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027513504,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.043151855,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011043549,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.6054688,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92089844,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94433594,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.90625,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.6054688,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92089844,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94433594,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.90625,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 128000,
          "logprob": null,
          "text": "<|begin_of_text|>"
        },
        {
          "id": 3923,
          "logprob": -7.6054688,
          "text": "What"
        },
        {
          "id": 374,
          "logprob": -0.92089844,
          "text": " is"
        },
        {
          "id": 5655,
          "logprob": -10.0,
          "text": " deep"
        },
        {
          "id": 6975,
          "logprob": -0.94433594,
          "text": " learning"
        },
        {
          "id": 30,
          "logprob": -2.90625,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 18682,
          "logprob": -0.875,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 6975,
          "logprob": -0.007698059,
          "special": false,
          "text": " learning"
        },
        {
          "id": 374,
          "logprob": -0.25268555,
          "special": false,
          "text": " is"
        },
        {
          "id": 264,
          "logprob": -0.09753418,
          "special": false,
          "text": " a"
        },
        {
          "id": 955,
          "logprob": -0.92529297,
          "special": false,
          "text": " type"
        },
        {
          "id": 315,
          "logprob": -0.00027942657,
          "special": false,
          "text": " of"
        },
        {
          "id": 21075,
          "logprob": -0.5527344,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11478,
          "logprob": -0.042541504,
          "special": false,
          "text": " intelligence"
        },
        {
          "id": 320,
          "logprob": -0.3840332,
          "special": false,
          "text": " ("
        },
        {
          "id": 15836,
          "logprob": -0.0011053085,
          "special": false,
          "text": "AI"
        }
      ],
      "top_tokens": null
    },
    "generated_text": " Deep learning is a type of artificial intelligence (AI"
  }
]
@@ -0,0 +1,104 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 2,
        "logprob": null,
        "text": "<bos>"
      },
      {
        "id": 1841,
        "logprob": -5.46875,
        "text": "What"
      },
      {
        "id": 603,
        "logprob": -0.69140625,
        "text": " is"
      },
      {
        "id": 5271,
        "logprob": -12.0,
        "text": " deep"
      },
      {
        "id": 6044,
        "logprob": -0.32226562,
        "text": " learning"
      },
      {
        "id": 235336,
        "logprob": -0.33203125,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 109,
        "logprob": -0.24707031,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 26843,
        "logprob": -0.14550781,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 6044,
        "logprob": -0.038330078,
        "special": false,
        "text": " learning"
      },
      {
        "id": 603,
        "logprob": -0.029907227,
        "special": false,
        "text": " is"
      },
      {
        "id": 476,
        "logprob": -0.020996094,
        "special": false,
        "text": " a"
      },
      {
        "id": 38397,
        "logprob": -0.828125,
        "special": false,
        "text": " subset"
      },
      {
        "id": 576,
        "logprob": -0.00049209595,
        "special": false,
        "text": " of"
      },
      {
        "id": 6479,
        "logprob": -0.057373047,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6044,
        "logprob": -0.000207901,
        "special": false,
        "text": " learning"
      },
      {
        "id": 674,
        "logprob": -0.15429688,
        "special": false,
        "text": " that"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "\n\nDeep learning is a subset of machine learning that"
}
@@ -0,0 +1,99 @@
{
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 10,
    "prefill": [
      {
        "id": 2,
        "logprob": null,
        "text": "<bos>"
      },
      {
        "id": 1841,
        "logprob": -5.46875,
        "text": "What"
      },
      {
        "id": 603,
        "logprob": -0.69140625,
        "text": " is"
      },
      {
        "id": 5271,
        "logprob": -12.0,
        "text": " deep"
      },
      {
        "id": 6044,
        "logprob": -0.32226562,
        "text": " learning"
      }
    ],
    "seed": 0,
    "tokens": [
      {
        "id": 235336,
        "logprob": 0.0,
        "special": false,
        "text": "?"
      },
      {
        "id": 109,
        "logprob": 0.0,
        "special": false,
        "text": "\n\n"
      },
      {
        "id": 26843,
        "logprob": 0.0,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 14715,
        "logprob": -0.38671875,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 603,
        "logprob": 0.0,
        "special": false,
        "text": " is"
      },
      {
        "id": 476,
        "logprob": 0.0,
        "special": false,
        "text": " a"
      },
      {
        "id": 38397,
        "logprob": -0.12695312,
        "special": false,
        "text": " subset"
      },
      {
        "id": 576,
        "logprob": 0.0,
        "special": false,
        "text": " of"
      },
      {
        "id": 6479,
        "logprob": 0.0,
        "special": false,
        "text": " machine"
      },
      {
        "id": 6044,
        "logprob": 0.0,
        "special": false,
        "text": " learning"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "What is deep learning?\n\nDeep Learning is a subset of machine learning"
}
@@ -0,0 +1,418 @@
[
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.69140625,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.32226562,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.33203125,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.24707031,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.03857422,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.828125,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.00051498413,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05883789,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020694733,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.15820312,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.71484375,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.30859375,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.3359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.71484375,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.30859375,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.3359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 10,
      "prefill": [
        {
          "id": 2,
          "logprob": null,
          "text": "<bos>"
        },
        {
          "id": 1841,
          "logprob": -5.46875,
          "text": "What"
        },
        {
          "id": 603,
          "logprob": -0.71484375,
          "text": " is"
        },
        {
          "id": 5271,
          "logprob": -12.0,
          "text": " deep"
        },
        {
          "id": 6044,
          "logprob": -0.30859375,
          "text": " learning"
        },
        {
          "id": 235336,
          "logprob": -0.3359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 109,
          "logprob": -0.23828125,
          "special": false,
          "text": "\n\n"
        },
        {
          "id": 26843,
          "logprob": -0.14550781,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 6044,
          "logprob": -0.038330078,
          "special": false,
          "text": " learning"
        },
        {
          "id": 603,
          "logprob": -0.030883789,
          "special": false,
          "text": " is"
        },
        {
          "id": 476,
          "logprob": -0.020996094,
          "special": false,
          "text": " a"
        },
        {
          "id": 38397,
          "logprob": -0.80859375,
          "special": false,
          "text": " subset"
        },
        {
          "id": 576,
          "logprob": -0.0005455017,
          "special": false,
          "text": " of"
        },
        {
          "id": 6479,
          "logprob": -0.05908203,
          "special": false,
          "text": " machine"
        },
        {
          "id": 6044,
          "logprob": -0.00020599365,
          "special": false,
          "text": " learning"
        },
        {
          "id": 674,
          "logprob": -0.17285156,
          "special": false,
          "text": " that"
        }
      ],
      "top_tokens": null
    },
    "generated_text": "\n\nDeep learning is a subset of machine learning that"
  }
]
@@ -0,0 +1,86 @@
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_w8an_handle(launcher):
    with launcher(
        "neuralmagic/Llama-3.2-1B-Instruct-FP8",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_w8an(compressed_tensors_w8an_handle):
    await compressed_tensors_w8an_handle.health(300)
    return compressed_tensors_w8an_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an(compressed_tensors_w8an, response_snapshot):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_w8an_all_params(
    compressed_tensors_w8an, response_snapshot
):
    response = await compressed_tensors_w8an.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\nDeep learning, also known as neural network or"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_w8an_load(
    compressed_tensors_w8an, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_w8an,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == " Deep learning is a type of artificial intelligence (AI"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
@@ -0,0 +1,86 @@
import pytest


@pytest.fixture(scope="module")
def compressed_tensors_wna16_handle(launcher):
    with launcher(
        "neuralmagic/gemma-2-2b-it-quantized.w4a16",
        num_shard=2,
        quantize="compressed-tensors",
    ) as handle:
        yield handle


@pytest.fixture(scope="module")
async def compressed_tensors_wna16(compressed_tensors_wna16_handle):
    await compressed_tensors_wna16_handle.health(300)
    return compressed_tensors_wna16_handle.client


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16(compressed_tensors_wna16, response_snapshot):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning?",
        max_new_tokens=10,
        decoder_input_details=True,
    )

    assert (
        response.generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert response.details.generated_tokens == 10
    assert response == response_snapshot


@pytest.mark.asyncio
async def test_compressed_tensors_wna16_all_params(
    compressed_tensors_wna16, response_snapshot
):
    response = await compressed_tensors_wna16.generate(
        "What is deep learning",
        max_new_tokens=10,
        repetition_penalty=1.2,
        return_full_text=True,
        stop_sequences=["test"],
        temperature=0.5,
        top_p=0.9,
        top_k=10,
        truncate=5,
        typical_p=0.9,
        watermark=True,
        decoder_input_details=True,
        seed=0,
    )

    assert response.details.generated_tokens == 10
    assert (
        response.generated_text
        == "What is deep learning?\n\nDeep Learning is a subset of machine learning"
    )
    assert response == response_snapshot


@pytest.mark.release
@pytest.mark.asyncio
@pytest.mark.private
async def test_compressed_tensors_wna16_load(
    compressed_tensors_wna16, generate_load, response_snapshot
):
    responses = await generate_load(
        compressed_tensors_wna16,
        "What is deep learning?",
        max_new_tokens=10,
        n=4,
    )

    assert (
        responses[0].generated_text
        == "\n\nDeep learning is a subset of machine learning that"
    )
    assert len(responses) == 4
    assert all([r.generated_text == responses[0].generated_text for r in responses])

    assert responses == response_snapshot
@@ -212,6 +212,8 @@ enum Quantization {
    /// <https://hf.co/models?search=awq>.
    /// Should replace GPTQ models wherever possible because of the better latency
    Awq,
    /// Compressed tensors, which can be a mixture of different quantization methods.
    CompressedTensors,
    /// 8 bit quantization, doesn't require specific model.
    /// Should be a drop-in replacement to bitsandbytes with much better performance.
    /// Kernels are from <https://github.com/NetEase-FuXi/EETQ.git>
@@ -274,6 +276,9 @@ impl std::fmt::Display for Quantization {
            Quantization::Awq => {
                write!(f, "awq")
            }
            Quantization::CompressedTensors => {
                write!(f, "compressed-tensors")
            }
            Quantization::Eetq => {
                write!(f, "eetq")
            }
@@ -3,8 +3,10 @@
  buildPythonPackage,
  poetry-core,
  mypy-protobuf,
  attention-kernels,
  awq-inference-engine,
  causal-conv1d,
  compressed-tensors,
  eetq,
  einops,
  exllamav2,
@@ -26,15 +28,18 @@
  opentelemetry-exporter-otlp,
  opentelemetry-instrumentation-grpc,
  opentelemetry-semantic-conventions,
  outlines,
  peft,
  prometheus-client,
  punica-kernels,
  py-cpuinfo,
  pydantic,
  safetensors,
  tokenizers,
  torch,
  sentencepiece,
  transformers,
  typer,
  vllm,
}:

let
@@ -71,9 +76,11 @@ buildPythonPackage {
  pythonRemoveDeps = [ "scipy" ];

  dependencies = [
    attention-kernels
    awq-inference-engine
    eetq
    causal-conv1d
    compressed-tensors
    einops
    exllamav2
    flashinfer
@@ -93,14 +100,17 @@ buildPythonPackage {
    opentelemetry-exporter-otlp
    opentelemetry-instrumentation-grpc
    opentelemetry-semantic-conventions
    outlines
    peft
    prometheus-client
    punica-kernels
    py-cpuinfo
    pydantic
    safetensors
    sentencepiece
    tokenizers
    transformers
    typer
    vllm
  ];

  prePatch = ''
@@ -10,10 +10,12 @@ use crate::{
};
use async_stream::stream;
use async_trait::async_trait;
use axum::response::sse::Event;
use chat_template::ChatTemplate;
use futures::future::try_join_all;
use futures::Stream;
use minijinja::ErrorKind;
use serde::Serialize;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use thiserror::Error;
@@ -373,4 +375,26 @@ impl InferError {
            InferError::StreamSerializationError(_) => "stream_serialization_error",
        }
    }

    pub(crate) fn into_openai_event(self) -> Event {
        Event::default()
            .json_data(OpenaiErrorEvent {
                error: APIError {
                    message: self.to_string(),
                    http_status_code: 422,
                },
            })
            .unwrap()
    }
}

#[derive(Serialize)]
pub struct APIError {
    message: String,
    http_status_code: usize,
}

#[derive(Serialize)]
pub struct OpenaiErrorEvent {
    error: APIError,
}
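For context on the `into_openai_event` helper added above, a minimal standalone sketch (assuming `serde` with the derive feature and `serde_json` as dependencies) of the JSON shape it serializes into the SSE stream; the error message below is a placeholder:

```rust
use serde::Serialize;

#[derive(Serialize)]
struct APIError {
    message: String,
    http_status_code: usize,
}

#[derive(Serialize)]
struct OpenaiErrorEvent {
    error: APIError,
}

fn main() {
    // Mirrors the structs in the hunk above; prints:
    // {"error":{"message":"Input validation error","http_status_code":422}}
    let event = OpenaiErrorEvent {
        error: APIError {
            message: "Input validation error".to_string(),
            http_status_code: 422,
        },
    };
    println!("{}", serde_json::to_string(&event).unwrap());
}
```

Wrapping errors in an `{"error": ...}` object follows the OpenAI streaming convention, so OpenAI-compatible clients can surface mid-stream failures instead of the stream dying silently.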
@@ -22,11 +22,13 @@ use tracing::warn;
use utoipa::ToSchema;
use validation::Validation;

#[allow(clippy::large_enum_variant)]
#[derive(Clone)]
pub enum Tokenizer {
    Python {
        tokenizer_name: String,
        revision: Option<String>,
        trust_remote_code: bool,
    },
    Rust(tokenizers::Tokenizer),
}
@@ -38,15 +40,20 @@ impl<'a> PyTokenizer<'a> {
        py: Python<'a>,
        tokenizer_name: String,
        revision: Option<String>,
        trust_remote_code: bool,
    ) -> PyResult<PyTokenizer<'a>> {
        let transformers = py.import_bound("transformers")?;
        let auto = transformers.getattr("AutoTokenizer")?;
        let from_pretrained = auto.getattr("from_pretrained")?;
        let args = (tokenizer_name,);
        let kwargs = if let Some(rev) = &revision {
            [("revision", rev.to_string())].into_py_dict_bound(py)
            [
                ("revision", rev.to_string().into_py(py)),
                ("trust_remote_code", trust_remote_code.into_py(py)),
            ]
            .into_py_dict_bound(py)
        } else {
            pyo3::types::PyDict::new_bound(py)
            [("trust_remote_code", trust_remote_code.into_py(py))].into_py_dict_bound(py)
        };
        let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
        tracing::info!("Loaded a python tokenizer");
@@ -109,7 +109,7 @@ request_body = CompatGenerateRequest,
responses(
(status = 200, description = "Generated Text",
content(
("application/json" = GenerateResponse),
("application/json" = Vec<GenerateResponse>),
("text/event-stream" = StreamResponse),
)),
(status = 424, description = "Generation Error", body = ErrorResponse,
@@ -866,7 +866,7 @@ pub(crate) async fn completions(

                    yield Ok(event);
                }
                Err(err) => yield Ok(Event::from(err)),
                Err(err) => yield Ok(err.into_openai_event()),
            }
        }
    };
@@ -1274,7 +1274,8 @@ pub(crate) async fn chat_completions(
        };
        let mut response_as_tool = using_tools;
        while let Some(result) = response_stream.next().await {
            if let Ok(stream_token) = result {
            match result {
                Ok(stream_token) => {
                    let token_text = &stream_token.token.text.clone();
                    match state {
                        StreamState::Buffering => {
@@ -1368,6 +1369,8 @@ pub(crate) async fn chat_completions(
                    }
                }
            }
            Err(err) => yield Ok(err.into_openai_event())
        }
    }
    yield Ok::<Event, Infallible>(Event::default().data("[DONE]"));
};
@@ -1829,6 +1832,7 @@ pub async fn run(
            Tokenizer::Python {
                tokenizer_name: tokenizer_name.clone(),
                revision: revision.clone(),
                trust_remote_code,
            }
        }
    };
@@ -439,9 +439,11 @@ fn tokenizer_worker(
        Tokenizer::Python {
            tokenizer_name,
            revision,
            trust_remote_code,
        } => {
            pyo3::Python::with_gil(|py| -> pyo3::PyResult<()> {
                let tokenizer = PyTokenizer::from_py(py, tokenizer_name, revision)?;
                let tokenizer =
                    PyTokenizer::from_py(py, tokenizer_name, revision, trust_remote_code)?;
                // Loop over requests
                while let Some(((inputs, add_special_tokens, truncate), response_tx, parent_span)) =
                    receiver.blocking_recv()
@@ -23,14 +23,14 @@ gen-server:
install-server: gen-server
	pip install pip --upgrade
	pip install -r requirements_cuda.txt
	pip install -e ".[accelerate, quantize, peft, outlines]"
	pip install -e ".[accelerate, compressed-tensors, quantize, peft, outlines]"

install: install-cuda
	echo "Installed server"

install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention
	pip install -e ".[bnb,marlin,moe]"
install-cuda: install-server install-flash-attention-v2-cuda install-flash-attention
	pip install -e ".[attention,bnb,marlin,moe]"
	pip install nvidia-nccl-cu12==2.22.3

install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm
@@ -1,14 +1,4 @@
commit_cuda := d243e9dc7e2c9c2e36a4150ec8e64809cb55c01b
commit_rocm := 4e0929e6e4fa0a3d09d358715c288020ea9dc247
build-vllm-cuda:
	if [ ! -d 'vllm' ]; then \
		pip install -U ninja packaging --no-cache-dir && \
		git clone https://github.com/Narsil/vllm.git vllm; \
	fi
	cd vllm && git fetch origin && git checkout $(commit_cuda) && python setup.py build

install-vllm-cuda: build-vllm-cuda
	cd vllm && git fetch origin && git checkout $(commit_cuda) && pip install -e .

build-vllm-rocm:
	if [ ! -d 'vllm' ]; then \
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.

[[package]]
name = "accelerate"
@@ -167,6 +167,17 @@ files = [
[package.dependencies]
frozenlist = ">=1.1.0"

[[package]]
name = "airportsdata"
version = "20241001"
description = "Extensive database of location and timezone data for nearly every airport and landing strip in the world."
optional = true
python-versions = ">=3.9"
files = [
    {file = "airportsdata-20241001-py3-none-any.whl", hash = "sha256:67d71cf2c5378cc17ff66b62b1e11aa2444043949c894543ac8fd8dafce192fd"},
    {file = "airportsdata-20241001.tar.gz", hash = "sha256:fa0bd143b4f4be3557cb892fa0612ef210fd91a92bd720b4d8221de576a4fa00"},
]

[[package]]
name = "annotated-types"
version = "0.7.0"
@@ -189,6 +200,74 @@ files = [
    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
]

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:812851d4ce0f54ca764ff3815a731b15f0cb110115d0aa2d0997cd7794d808bb"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl"

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:614c402621b11dd1f5741a016b9fd27cb6a68814471f2048bc05206923516268"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl"

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:6b2ca7c98997431d5f6c4af7553dce6b1bff8dfdec374c97c6ffba71325a02b7"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl"

[[package]]
name = "attention-kernels"
version = "0.1.1"
description = "Attention kernels"
optional = true
python-versions = ">=3.8"
files = [
    {file = "attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:a56710c5626e461d6f628ae14b74ffc89833578ebd59c3c0c47f5d6f07461fbf"},
]

[package.dependencies]
torch = "*"

[package.source]
type = "url"
url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl"

[[package]]
name = "attrs"
version = "24.2.0"
@@ -388,6 +467,26 @@ files = [
    {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]

[[package]]
name = "compressed-tensors"
version = "0.7.1"
description = "Library for utilization of compressed safetensors of neural network models"
optional = true
python-versions = "*"
files = [
    {file = "compressed-tensors-0.7.1.tar.gz", hash = "sha256:3c7865ebfe4ea76ae94d7c674bcf93aedd2064571f682c09a377a219d5ebb3a0"},
    {file = "compressed_tensors-0.7.1-py3-none-any.whl", hash = "sha256:22d11558a70f655ae647db9c8e9fb14a5e9d6983ca5aec3f267518625fd6dd0e"},
]

[package.dependencies]
pydantic = ">=2.0"
torch = ">=1.7.0"
transformers = "*"

[package.extras]
accelerate = ["accelerate"]
dev = ["black (==22.12.0)", "flake8 (>=3.8.3)", "isort (==5.8.0)", "nbconvert (>=7.16.3)", "pytest (>=6.0.0)", "wheel (>=0.36.2)"]

[[package]]
name = "datasets"
version = "2.21.0"
@@ -1023,17 +1122,6 @@ MarkupSafe = ">=2.0"
[package.extras]
i18n = ["Babel (>=2.7)"]

[[package]]
name = "joblib"
version = "1.4.2"
description = "Lightweight pipelining with Python functions"
optional = true
python-versions = ">=3.8"
files = [
    {file = "joblib-1.4.2-py3-none-any.whl", hash = "sha256:06d478d5674cbc267e7496a410ee875abd68e4340feff4490bcb7afb88060ae6"},
    {file = "joblib-1.4.2.tar.gz", hash = "sha256:2382c5816b2636fbd20a09e0f4e9dad4736765fdfb7dca582943b9c1366b3f0e"},
]

[[package]]
name = "jsonschema"
version = "4.23.0"
@@ -1086,36 +1174,6 @@ interegular = ["interegular (>=0.3.1,<0.4.0)"]
nearley = ["js2py"]
regex = ["regex"]

[[package]]
name = "llvmlite"
version = "0.43.0"
description = "lightweight wrapper around basic LLVM functionality"
optional = true
python-versions = ">=3.9"
files = [
    {file = "llvmlite-0.43.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a289af9a1687c6cf463478f0fa8e8aa3b6fb813317b0d70bf1ed0759eab6f761"},
    {file = "llvmlite-0.43.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6d4fd101f571a31acb1559ae1af30f30b1dc4b3186669f92ad780e17c81e91bc"},
    {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d434ec7e2ce3cc8f452d1cd9a28591745de022f931d67be688a737320dfcead"},
    {file = "llvmlite-0.43.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6912a87782acdff6eb8bf01675ed01d60ca1f2551f8176a300a886f09e836a6a"},
    {file = "llvmlite-0.43.0-cp310-cp310-win_amd64.whl", hash = "sha256:14f0e4bf2fd2d9a75a3534111e8ebeb08eda2f33e9bdd6dfa13282afacdde0ed"},
    {file = "llvmlite-0.43.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3e8d0618cb9bfe40ac38a9633f2493d4d4e9fcc2f438d39a4e854f39cc0f5f98"},
    {file = "llvmlite-0.43.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e0a9a1a39d4bf3517f2af9d23d479b4175ead205c592ceeb8b89af48a327ea57"},
    {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1da416ab53e4f7f3bc8d4eeba36d801cc1894b9fbfbf2022b29b6bad34a7df2"},
    {file = "llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977525a1e5f4059316b183fb4fd34fa858c9eade31f165427a3977c95e3ee749"},
    {file = "llvmlite-0.43.0-cp311-cp311-win_amd64.whl", hash = "sha256:d5bd550001d26450bd90777736c69d68c487d17bf371438f975229b2b8241a91"},
    {file = "llvmlite-0.43.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f99b600aa7f65235a5a05d0b9a9f31150c390f31261f2a0ba678e26823ec38f7"},
    {file = "llvmlite-0.43.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:35d80d61d0cda2d767f72de99450766250560399edc309da16937b93d3b676e7"},
    {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eccce86bba940bae0d8d48ed925f21dbb813519169246e2ab292b5092aba121f"},
    {file = "llvmlite-0.43.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6509e1507ca0760787a199d19439cc887bfd82226f5af746d6977bd9f66844"},
    {file = "llvmlite-0.43.0-cp312-cp312-win_amd64.whl", hash = "sha256:7a2872ee80dcf6b5dbdc838763d26554c2a18aa833d31a2635bff16aafefb9c9"},
    {file = "llvmlite-0.43.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9cd2a7376f7b3367019b664c21f0c61766219faa3b03731113ead75107f3b66c"},
    {file = "llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:18e9953c748b105668487b7c81a3e97b046d8abf95c4ddc0cd3c94f4e4651ae8"},
    {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:74937acd22dc11b33946b67dca7680e6d103d6e90eeaaaf932603bec6fe7b03a"},
    {file = "llvmlite-0.43.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9efc739cc6ed760f795806f67889923f7274276f0eb45092a1473e40d9b867"},
    {file = "llvmlite-0.43.0-cp39-cp39-win_amd64.whl", hash = "sha256:47e147cdda9037f94b399bf03bfd8a6b6b1f2f90be94a454e3386f006455a9b4"},
    {file = "llvmlite-0.43.0.tar.gz", hash = "sha256:ae2b5b5c3ef67354824fb75517c8db5fbe93bc02cd9671f3c62271626bc041d5"},
]

[[package]]
name = "loguru"
version = "0.6.0"
@@ -1557,40 +1615,6 @@ doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"]
test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]

[[package]]
name = "numba"
version = "0.60.0"
description = "compiling Python code using LLVM"
optional = true
python-versions = ">=3.9"
files = [
    {file = "numba-0.60.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5d761de835cd38fb400d2c26bb103a2726f548dc30368853121d66201672e651"},
    {file = "numba-0.60.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:159e618ef213fba758837f9837fb402bbe65326e60ba0633dbe6c7f274d42c1b"},
    {file = "numba-0.60.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1527dc578b95c7c4ff248792ec33d097ba6bef9eda466c948b68dfc995c25781"},
    {file = "numba-0.60.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fe0b28abb8d70f8160798f4de9d486143200f34458d34c4a214114e445d7124e"},
    {file = "numba-0.60.0-cp310-cp310-win_amd64.whl", hash = "sha256:19407ced081d7e2e4b8d8c36aa57b7452e0283871c296e12d798852bc7d7f198"},
    {file = "numba-0.60.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a17b70fc9e380ee29c42717e8cc0bfaa5556c416d94f9aa96ba13acb41bdece8"},
    {file = "numba-0.60.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3fb02b344a2a80efa6f677aa5c40cd5dd452e1b35f8d1c2af0dfd9ada9978e4b"},
    {file = "numba-0.60.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5f4fde652ea604ea3c86508a3fb31556a6157b2c76c8b51b1d45eb40c8598703"},
    {file = "numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4142d7ac0210cc86432b818338a2bc368dc773a2f5cf1e32ff7c5b378bd63ee8"},
    {file = "numba-0.60.0-cp311-cp311-win_amd64.whl", hash = "sha256:cac02c041e9b5bc8cf8f2034ff6f0dbafccd1ae9590dc146b3a02a45e53af4e2"},
    {file = "numba-0.60.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d7da4098db31182fc5ffe4bc42c6f24cd7d1cb8a14b59fd755bfee32e34b8404"},
    {file = "numba-0.60.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:38d6ea4c1f56417076ecf8fc327c831ae793282e0ff51080c5094cb726507b1c"},
    {file = "numba-0.60.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:62908d29fb6a3229c242e981ca27e32a6e606cc253fc9e8faeb0e48760de241e"},
    {file = "numba-0.60.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0ebaa91538e996f708f1ab30ef4d3ddc344b64b5227b67a57aa74f401bb68b9d"},
    {file = "numba-0.60.0-cp312-cp312-win_amd64.whl", hash = "sha256:f75262e8fe7fa96db1dca93d53a194a38c46da28b112b8a4aca168f0df860347"},
    {file = "numba-0.60.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:01ef4cd7d83abe087d644eaa3d95831b777aa21d441a23703d649e06b8e06b74"},
    {file = "numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:819a3dfd4630d95fd574036f99e47212a1af41cbcb019bf8afac63ff56834449"},
    {file = "numba-0.60.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b983bd6ad82fe868493012487f34eae8bf7dd94654951404114f23c3466d34b"},
    {file = "numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c151748cd269ddeab66334bd754817ffc0cabd9433acb0f551697e5151917d25"},
    {file = "numba-0.60.0-cp39-cp39-win_amd64.whl", hash = "sha256:3031547a015710140e8c87226b4cfe927cac199835e5bf7d4fe5cb64e814e3ab"},
    {file = "numba-0.60.0.tar.gz", hash = "sha256:5df6158e5584eece5fc83294b949fd30b9f1125df7708862205217e068aabf16"},
]

[package.dependencies]
llvmlite = "==0.43.*"
numpy = ">=1.22,<2.1"

[[package]]
name = "numpy"
version = "1.26.4"
@@ -1968,36 +1992,83 @@ opentelemetry-api = "1.25.0"
[[package]]
name = "outlines"
version = "0.0.34"
version = "0.1.3"
description = "Probabilistic Generative Model Programming"
optional = true
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
    {file = "outlines-0.0.34-py3-none-any.whl", hash = "sha256:911588a7e64a4f193b97fb4c501d98ccfd4e95a98f6a3ada67a280bf0c373c50"},
    {file = "outlines-0.0.34.tar.gz", hash = "sha256:594e7204c770b47a62eb5c2ba7d25ea0ab2e16882b5f04556712a0228d3d3309"},
    {file = "outlines-0.1.3-py3-none-any.whl", hash = "sha256:afcf6012b7cabbaae4a58975d03190c0bbc3d402b0b2a37538e05f335d73a247"},
    {file = "outlines-0.1.3.tar.gz", hash = "sha256:5a48ad00d3bdd8eccaa7574821eb5aaa27ab9f61fde9c3fba52f352dc00197e4"},
]

[package.dependencies]
airportsdata = "*"
cloudpickle = "*"
datasets = "*"
diskcache = "*"
interegular = "*"
jinja2 = "*"
joblib = "*"
jsonschema = "*"
lark = "*"
nest-asyncio = "*"
numba = "*"
numpy = "*"
numpy = "<2.0.0"
outlines-core = "0.1.14"
pycountry = "*"
pydantic = ">=2.0"
referencing = "*"
requests = "*"
scipy = "*"
torch = ">=2.1.0"
transformers = "*"
torch = "*"
tqdm = "*"
typing-extensions = "*"

[package.extras]
serve = ["fastapi", "pydantic (>=2.0)", "ray (==2.9.0)", "uvicorn", "vllm (>=0.3.0)"]
test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "llama-cpp-python (>=0.2.42)", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers"]
serve = ["fastapi", "pydantic (>=2.0)", "uvicorn", "vllm (>=0.3.0)"]
test = ["accelerate", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "diff-cover", "exllamav2", "huggingface-hub", "llama-cpp-python", "mlx-lm", "openai (>=1.0.0)", "pillow", "pre-commit", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "responses", "transformers", "vllm"]

[[package]]
name = "outlines-core"
version = "0.1.14"
description = "Structured Text Generation in Rust"
optional = true
python-versions = ">=3.8"
files = [
    {file = "outlines_core-0.1.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:291c6d9d348cb5562cd28ce44d80822d77238f1cd7c30d890b5b20488e71608d"},
    {file = "outlines_core-0.1.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3a50e2f6745e0c34cc857d1bd5590e2966ad06e8ce10802976e9e6c116c7533d"},
    {file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7dfe64b590a6a88dcc5e59f0a399fff0458cdcf97d68de07f08e1bd3bf8ac1d"},
    {file = "outlines_core-0.1.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:100de068ce52893bec316481e65db8f1c734a0f25f540c29dafd7a8afec0a29d"},
    {file = "outlines_core-0.1.14-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e06cb724770fd0fe1c8444382c4a6e79901bba33720f70fe6c8437f58eceb92e"},
    {file = "outlines_core-0.1.14-cp310-cp310-win32.whl", hash = "sha256:6d41da3d8a087fd54133cf910c2d5759da55490bbd0e3bc6c1e7907b54248415"},
    {file = "outlines_core-0.1.14-cp310-cp310-win_amd64.whl", hash = "sha256:646fd1073feed393bc77f9605a2fa27a54551ab04f85867ce789af1dee6326fa"},
    {file = "outlines_core-0.1.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:60f3a947fe09106f7668cf832c28b9269b8f0fc109f081608acfce9262213359"},
    {file = "outlines_core-0.1.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5e273a100c922f794d8e077a8161d0985d3005887066b4af3ae7afd3742fe9b8"},
    {file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:622e547f11a869fc67be40abc4cbcda89ae6f46f9eb46a1ec0666bd6807e0c67"},
    {file = "outlines_core-0.1.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:60c9933a9faaa51b39aea3518f1822b0d3ec2c9a13b16849caca3955e29e320d"},
    {file = "outlines_core-0.1.14-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4a8c616ce103ef9102dbf4326f67b03e1e0f46aa19351e57f4beb37588c00428"},
    {file = "outlines_core-0.1.14-cp311-cp311-win32.whl", hash = "sha256:1c77aaa4556cbb6e93cc42be0a6e262f175e0754b7694d702d642ff03df67f2c"},
    {file = "outlines_core-0.1.14-cp311-cp311-win_amd64.whl", hash = "sha256:eb6ffe410866f65dbe17e95b0aabd70d990f058a2dc4e8b74f9583b07248cd36"},
    {file = "outlines_core-0.1.14-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b0e408b033618f23e9bb928a47b33b1bd4c9d04a3dbec680a20977de3b4f590d"},
    {file = "outlines_core-0.1.14-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:21d1393a6da5d3320e8c8247e9deeb851c5c862fd6ea5c779bd29797e8987155"},
    {file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5829c568db76673d36caaf0f86e96748b491b4a209deb9be87617372394a5fb9"},
    {file = "outlines_core-0.1.14-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e855ec99bce1099c0755bcbfa44568adf7ae0083905ba04f58a17614ddf0fe7"},
    {file = "outlines_core-0.1.14-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b897cfbf9c2719aa011d9b439b4c6751d9c7df5683b2169617972d4b4a914403"},
    {file = "outlines_core-0.1.14-cp38-cp38-win32.whl", hash = "sha256:4c9d908004b31bcd432156d60f4895bf5e1b51ca8c8eed82b12f1bb57d5bf7fd"},
    {file = "outlines_core-0.1.14-cp38-cp38-win_amd64.whl", hash = "sha256:6668a930d928216d0b319ad84947903f1e27556f604a9743051f795b11008b64"},
    {file = "outlines_core-0.1.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b745aa469cf3fb347b79a257804d75d1324e01691158664c1e413a816ce6b98d"},
    {file = "outlines_core-0.1.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:27504c8360467429d6223ebc49180d6956d7418bfc3d324f6ad10f069e1813ad"},
    {file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd8f1e1d91a206a520d1c577ce00136de2beb1d200ef93759fd4c9f45abe24d3"},
    {file = "outlines_core-0.1.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f30c8acb42895b624c504b85678331c5f9376fa4b8069ce06a27cf80f5881e27"},
    {file = "outlines_core-0.1.14-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:0e6cd0e7d995a7b04d90139a695279ab4a9eb7f492618b2c037a85eaf5f9fc59"},
    {file = "outlines_core-0.1.14-cp39-cp39-win32.whl", hash = "sha256:3104af4084da0e7c3d4b8538b43c725581d66bb68d426bc389680f06c3667476"},
    {file = "outlines_core-0.1.14-cp39-cp39-win_amd64.whl", hash = "sha256:45c6b9baded0337c4dcfa156af05ec4efd2b25c4d976e77be28146e4037b991f"},
    {file = "outlines_core-0.1.14.tar.gz", hash = "sha256:6db033e4f8e48381164e36cc716746640ad5022f0d86e4c88af15c75886b93a4"},
]

[package.dependencies]
interegular = "*"
jsonschema = "*"

[package.extras]
test = ["accelerate", "asv", "beartype (<0.16.0)", "coverage[toml] (>=5.1)", "datasets", "diff-cover", "huggingface-hub", "pillow", "pre-commit", "pydantic", "pytest", "pytest-benchmark", "pytest-cov", "pytest-mock", "setuptools-rust", "torch", "transformers"]

[[package]]
name = "packaging"

@@ -2470,6 +2541,17 @@ numpy = ">=1.16.6"
[package.extras]
test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"]

[[package]]
name = "pycountry"
version = "24.6.1"
description = "ISO country, subdivision, language, currency and script definitions and their translations"
optional = true
python-versions = ">=3.8"
files = [
    {file = "pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f"},
    {file = "pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221"},
]

[[package]]
name = "pydantic"
version = "2.9.2"

@@ -3971,7 +4053,9 @@ type = ["pytest-mypy"]

[extras]
accelerate = ["accelerate"]
attention = ["attention-kernels", "attention-kernels", "attention-kernels", "attention-kernels"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels", "marlin-kernels", "marlin-kernels", "marlin-kernels"]
moe = ["moe-kernels", "moe-kernels", "moe-kernels", "moe-kernels"]
outlines = ["outlines"]

@@ -3982,4 +4066,4 @@ torch = ["torch"]

[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.13"
content-hash = "b39033e573f50a0f046787aebf1702d86673aad0b2fcee818404fcea7f644b81"
content-hash = "05add88628d836faceae1a26fde4092651a6eca74555ae38ebff879a7895be7e"

@@ -9,7 +9,7 @@ text-generation-server = 'text_generation_server.cli:app'

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
protobuf = "^4.25.3"
protobuf = ">=4.25.3,<6"
grpcio = "^1.51.1"
grpcio-status = "^1.51.1"
grpcio-reflection = "^1.51.1"

@@ -34,12 +34,19 @@ peft = { version = "^0.10", optional = true }
torch = { version = "^2.4.0", optional = true }
scipy = "^1.11.1"
pillow = "^10.0.0"
outlines = { version = "^0.0.34", optional = true }
prometheus-client = "^0.20.0"
outlines = { version = "^0.1.1", optional = true }
prometheus-client = ">=0.20.0,<0.22"
py-cpuinfo = "^9.0.0"
compressed-tensors = { version = "^0.7.1", optional = true }
# Remove later, temporary workaround for outlines.
numpy = "^1.26"

attention-kernels = [
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
    { url = "https://github.com/danieldk/attention-kernels/releases/download/v0.1.1/attention_kernels-0.1.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
]
marlin-kernels = [
    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
    { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.1/marlin_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },

@@ -57,7 +64,9 @@ rich = "^13.7.1"

[tool.poetry.extras]
torch = ["torch"]
accelerate = ["accelerate"]
attention = ["attention-kernels"]
bnb = ["bitsandbytes"]
compressed-tensors = ["compressed-tensors"]
marlin = ["marlin-kernels"]
moe = ["moe-kernels"]
peft = ["peft"]

@@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@@ -45,7 +45,7 @@ sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13"
setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13"
tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13"
tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13"
transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13"
typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13"
typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13"
urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13"

@@ -19,6 +19,7 @@ class Quantization(str, Enum):
    bitsandbytes_fp4 = "bitsandbytes-fp4"
    gptq = "gptq"
    awq = "awq"
    compressed_tensors = "compressed-tensors"
    eetq = "eetq"
    exl2 = "exl2"
    fp8 = "fp8"

@@ -108,7 +108,7 @@ def paged_attention(
    if softcap is not None:
        raise RuntimeError("Paged attention doesn't support softcapping")
    input_lengths = seqlen.input_lengths + seqlen.cache_lengths
    from vllm._C import ops
    import attention_kernels

    out = torch.empty_like(query)

@@ -116,7 +116,7 @@ def paged_attention(
        max_num_partitions == 1 or num_seqs * num_heads > 512
    )
    if use_v1:
        ops.paged_attention_v1(
        attention_kernels.paged_attention_v1(
            out,
            query,
            kv_cache.key,

@@ -146,7 +146,7 @@ def paged_attention(
        )
        max_logits = torch.empty_like(exp_sums)

        ops.paged_attention_v2(
        attention_kernels.paged_attention_v2(
            out,
            exp_sums,
            max_logits,

@@ -200,12 +200,12 @@ def paged_reshape_and_cache(
):
    if SYSTEM == "cuda":
        try:
            from vllm._C import cache_ops
            import attention_kernels
        except Exception as e:
            raise ImportError(
                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
                f"Could not import attention_kernels. Make sure your installation is correct. Complete error: {e}"
            )
        cache_ops.reshape_and_cache(
        attention_kernels.reshape_and_cache(
            key, value, key_cache, value_cache, slots, "auto", 1.0
        )
    elif SYSTEM == "rocm":
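
The hunks above swap vLLM's private `vllm._C` extension for the standalone `attention_kernels` wheel, keeping the same kernel entry points (`paged_attention_v1`, `paged_attention_v2`, `reshape_and_cache`). A minimal sketch of the resulting import guard; the helper name is illustrative and not part of the diff:

```python
def import_attention_kernels():
    """Import the standalone attention kernels, failing with a clear message."""
    try:
        import attention_kernels
    except Exception as e:
        raise ImportError(
            "Could not import attention_kernels. Make sure your installation "
            f"is correct. Complete error: {e}"
        ) from e
    return attention_kernels
```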
@@ -0,0 +1,3 @@
from .loader import CompressedTensorsLoader

__all__ = ["CompressedTensorsLoader"]
@@ -0,0 +1,174 @@
from typing import Any, Dict, List, Union

from compressed_tensors import QuantizationConfig, QuantizationStatus
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (
    QuantizationScheme,
    QuantizationType,
    find_name_or_class_matches,
)
from loguru import logger
from pydantic import ValidationError
from torch import nn

from text_generation_server.layers.compressed_tensors.w8an_fp import W8ANFpLoader
from text_generation_server.layers.compressed_tensors.wna16_int import WNA16Loader
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import (
    DefaultWeightsLoader,
    UnquantizedWeight,
    Weights,
    WeightsLoader,
)

# compressed-tensors can match modules as quantization targets. However,
# they need to be objects rather than classes or class names. Since we
# need to match `Linear` targets, make an instance that can be re-used.
_EMPTY_LINEAR: nn.Module = nn.Linear(0, 0)


class CompressedTensorsLoader(WeightsLoader):
    """Loader for checkpoints stored in the compressed-tensors format."""

    def __init__(self, config: Dict[str, Any]):
        quantization_config_raw = config.get("quantization_config")
        if quantization_config_raw is None:
            # `compression_config` was renamed to `quantization_config`; support
            # retained for backward compatibility.
            quantization_config_raw = config.get("compression_config")
        if quantization_config_raw is None:
            raise ValueError(
                "Checkpoint does not have compressed-tensors configuration"
            )

        try:
            quantization_config = QuantizationConfig.model_validate(
                quantization_config_raw
            )
        except ValidationError as e:
            raise ValueError("Cannot parse compressed-tensors configuration") from e

        if quantization_config.quantization_status not in (
            QuantizationStatus.COMPRESSED,
            QuantizationStatus.FROZEN,
        ):
            raise ValueError(
                f"Model quantization was not finished, status was: {quantization_config.quantization_status}"
            )

        self.ignore = (
            quantization_config.ignore if quantization_config.ignore is not None else []
        )
        self.loaders = self._get_target_loaders(quantization_config)

        for target, loader in self.loaders.items():
            log_once(
                logger.info,
                f"Using {loader} for compressed-tensors target '{target}'",
            )

    def get_weights(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights(weights, prefix)

    def get_weights_col_packed(
        self,
        weights: "Weights",
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_col_packed(weights, prefix, block_sizes)

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        loader = self._lookup_loader(prefixes[0])
        return loader.get_multi_weights_col(weights, prefixes, dim)

    def get_weights_row(self, weights: Weights, prefix: str):
        loader = self._lookup_loader(prefix)
        return loader.get_weights_row(weights, prefix)

    def _get_target_loaders(
        self, quantization_config: QuantizationConfig
    ) -> Dict[str, WeightsLoader]:
        """
        A compressed-tensors checkpoint can use different quantizations
        for different targets. This method returns a dictionary with a
        loader per target.
        """

        loaders: Dict[str, WeightsLoader] = {}

        format = quantization_config.format

        for group_name, group in quantization_config.config_groups.items():
            # The group configuration can be a string, but does that ever
            # happen in a serialized quantization config?
            assert isinstance(group, QuantizationScheme)

            loader = self._create_loader_for_group(format, group_name, group)

            # A quantized parameter group can have multiple targets, add the
            # loader for all the targets.
            for target in group.targets:
                if target in loaders:
                    raise ValueError(
                        f"Target '{target}' has multiple configured loaders"
                    )
                loaders[target] = loader

        return loaders

    def _create_loader_for_group(
        self, format: str, group_name: str, group: QuantizationScheme
    ) -> WeightsLoader:
        """
        Find and create a loader for the group with the given quantization
        scheme.
        """
        # NOTE: we ignore group.output_activations because we don't support
        # output quantization yet.

        input_activations = group.input_activations
        weights = group.weights
        if (
            format
            in {
                CompressionFormat.float_quantized.value,
                CompressionFormat.naive_quantized.value,
            }
            and weights is not None
            and weights.type == QuantizationType.FLOAT
            and weights.num_bits == 8
        ):
            # FP W8A8 or W8A16.
            return W8ANFpLoader(input_activations=input_activations, weights=weights)
        elif (
            format == CompressionFormat.pack_quantized.value
            and weights is not None
            and weights.type == QuantizationType.INT
            and weights.num_bits in (4, 8)
        ):
            # INT W4A16 or W8A16 (GPTQ/AWQ-like).
            return WNA16Loader(weights)
        else:
            raise ValueError(
                f"Group '{group_name}' has unsupported compressed-tensors configuration"
            )

    def _lookup_loader(self, prefix: str) -> WeightsLoader:
        """
        Look up the loader to use for a given parameter name (prefix).
        """

        if len(find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.ignore)) > 0:
            return DefaultWeightsLoader(UnquantizedWeight)

        # We currently only handle linear layers, so unconditionally pass
        # a `Linear` instance.
        targets = find_name_or_class_matches(prefix, _EMPTY_LINEAR, self.loaders.keys())
        if len(targets) == 0:
            raise ValueError(
                f"Cannot find compressed-tensors target for prefix: {prefix}"
            )
        return self.loaders[targets[0]]
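
To see how `_lookup_loader` resolves a parameter prefix, here is a hedged, standalone sketch of the same `find_name_or_class_matches` calls the new file makes; the prefixes and target lists are invented for illustration:

```python
from compressed_tensors.quantization import find_name_or_class_matches
from torch import nn

dummy_linear = nn.Linear(0, 0)  # stands in for the module, like _EMPTY_LINEAR above
targets = ["Linear"]            # "Linear" matches any nn.Linear instance
ignore = ["lm_head"]            # name (or "re:" regex) matches stay unquantized

for prefix in ["model.layers.0.self_attn.q_proj", "lm_head"]:
    if find_name_or_class_matches(prefix, dummy_linear, ignore):
        print(f"{prefix}: unquantized (ignored)")
    elif find_name_or_class_matches(prefix, dummy_linear, targets):
        print(f"{prefix}: handled by the quantized-target loader")
```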
@@ -0,0 +1,174 @@
from typing import List, Optional, Union

import torch
from compressed_tensors.quantization import QuantizationArgs, QuantizationType

from text_generation_server.layers.fp8 import Fp8Weight, _load_scalar_or_matrix_scale
from text_generation_server.utils.weights import Weights, WeightsLoader


class W8ANFpLoader(WeightsLoader):
    """
    Loader for W8A8/W8A16 FP compressed-tensors parameters.
    """

    def __init__(
        self,
        *,
        input_activations: Optional[QuantizationArgs],
        weights: QuantizationArgs,
    ):
        assert weights.type == QuantizationType.FLOAT and weights.num_bits == 8

        # We ignore the `strategy` option which sets the scales to be
        # per-tensor, per-channel or per-token. What scales are supported
        # is dependent on the kernels used (e.g. cutlass can do tokenwise,
        # Torch cannot, and FP8-Marlin does not quantize inputs at all).
        # So, instead we try to use the best-possible configuration.

        self.load_weight_scale = not weights.dynamic
        self.load_input_scale = (
            input_activations is not None and not input_activations.dynamic
        )
        self.force_w8a16 = (
            input_activations is not None and input_activations.num_bits == 16
        )

    def __str__(self) -> str:
        def scale_to_str(scale):
            return "static" if scale else "dynamic"

        quantization_type = f"W8A{16 if self.force_w8a16 else 8}"

        return f"{self.__class__.__name__} ({quantization_type}, weight: {scale_to_str(self.load_weight_scale)}, input: {scale_to_str(self.load_input_scale)})"

    def get_weights(self, weights: "Weights", prefix: str):
        w = weights.get_tensor(f"{prefix}.weight")

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = (
                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(w.shape[0])
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        w = weights.get_packed_sharded(
            f"{prefix}.weight", dim=0, block_sizes=block_sizes
        )

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
            if weight_scale.numel() > 1:
                weight_scale = weights.get_packed_sharded(
                    f"{prefix}.weight_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            weight_scale = weight_scale.reshape(-1).expand(w.shape[0])

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(f"{prefix}.input_scale", to_dtype=False)
            if input_scale.numel() > 1:
                input_scale = weights.get_packed_sharded(
                    f"{prefix}.input_scale",
                    dim=0,
                    block_sizes=block_sizes,
                    to_dtype=False,
                )
            input_scale = input_scale.reshape(-1).max()

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: int):
        # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
        w = [
            weights.get_sharded(f"{p}.weight", dim=0, to_device=False) for p in prefixes
        ]
        shapes = [x.shape for x in w]

        # Concat then send to the device
        w = torch.cat(w, dim=dim).to(weights.device)

        weight_scale = None
        if self.load_weight_scale:
            weight_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.weight_scale", shape)
                for p, shape in zip(prefixes, shapes)
            ]
            weight_scale = torch.cat(weight_scale, dim=0).reshape(-1)

        input_scale = None
        if self.load_input_scale:
            input_scale = [
                _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape)
                for p, shape in zip(prefixes, shapes)
                if weights.has_tensor(f"{p}.input_scale")
            ]
            assert len(input_scale) == 0 or len(input_scale) == len(prefixes)
            input_scale = (
                torch.cat(input_scale, dim=0).reshape(-1).max()
                if len(input_scale) != 0
                else None
            )

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )

    def get_weights_row(self, weights: "Weights", prefix: str):
        w = weights.get_sharded(f"{prefix}.weight", dim=1)
        weight_scale = None
        if self.load_weight_scale:
            weight_scale = (
                weights.get_tensor(f"{prefix}.weight_scale", to_dtype=False)
                .reshape(-1)
                .expand(w.shape[0])
            )

        input_scale = None
        if self.load_input_scale:
            input_scale = weights.get_tensor(
                f"{prefix}.input_scale", to_dtype=False
            ).reshape(-1)

        return Fp8Weight(
            weight=w,
            weight_scale=weight_scale,
            input_scale=input_scale,
            dtype=weights.dtype,
            force_w8a16=self.force_w8a16,
        )
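
The per-tensor/per-channel handling above leans on a small broadcasting trick: a scalar `weight_scale` is expanded to one entry per output row, so later code never has to branch on the scale strategy. A self-contained illustration (the shapes and scale value are arbitrary):

```python
import torch

w = torch.zeros((4, 8), dtype=torch.float8_e4m3fn)  # stand-in FP8 weight
scalar_scale = torch.tensor([0.02])                 # per-tensor scale from a checkpoint

# Same pattern as W8ANFpLoader.get_weights: flatten to 1-D, then expand
# to one scale per output row (a view, no copy is made).
per_row_scale = scalar_scale.reshape(-1).expand(w.shape[0])
print(per_row_scale.shape)  # torch.Size([4])
```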
@@ -0,0 +1,188 @@
from typing import List, Union

import torch
from compressed_tensors.quantization import ActivationOrdering, QuantizationArgs
from loguru import logger

from text_generation_server.layers.marlin.gptq import repack_gptq_for_marlin
from text_generation_server.utils.log import log_once
from text_generation_server.utils.weights import Weights, WeightsLoader


class WNA16Loader(WeightsLoader):
    """
    Loader for W4A16/W8A16 INT compressed-tensors parameters.
    """

    def __init__(self, weights: QuantizationArgs):
        self.weights = weights
        self.desc_act = self.weights.actorder == ActivationOrdering.GROUP
        self.groupsize = (
            -1 if self.weights.group_size is None else self.weights.group_size
        )

    def __str__(self) -> str:
        quantization_type = f"W{self.weights.num_bits}A16"

        return f"{self.__class__.__name__} ({quantization_type})"

    def get_weights(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_tensor(f"{prefix}.weight_packed").t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.weight_g_idx")

        scales = weights.get_tensor(f"{prefix}.weight_scale").t()

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_weights_col_packed(
        self,
        weights: Weights,
        prefix: str,
        block_sizes: Union[int, List[int]],
    ):
        try:
            weight_packed = weights.get_packed_sharded(
                f"{prefix}.weight_packed", dim=0, block_sizes=block_sizes
            ).t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )
        scales = weights.get_packed_sharded(
            f"{prefix}.weight_scale", dim=0, block_sizes=block_sizes
        ).t()
        scales = scales.to(dtype=weights.dtype)

        zero_point = None
        if not self.weights.symmetric:
            zero_point = weights.get_packed_sharded(
                f"{prefix}.qzeros", dim=0, block_sizes=block_sizes
            ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_tensor(f"{prefix}.g_idx")

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_multi_weights_col(self, weights: Weights, prefixes: List[str], dim: int):
        try:
            weight_packed = torch.cat(
                [
                    weights.get_sharded(f"{p}.weight_packed", dim=0).t()
                    for p in prefixes
                ],
                dim=1,
            )
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        scales = torch.cat(
            [weights.get_sharded(f"{p}.weight_scale", dim=0).t() for p in prefixes],
            dim=1,
        )

        zero_point = None
        if not self.weights.symmetric:
            zero_point = torch.cat(
                [weights.get_sharded(f"{p}.qzeros", dim=0).t() for p in prefixes], dim=1
            ).t()

        g_idx = None
        if self.desc_act:
            w = [weights.get_tensor(f"{p}.g_idx") for p in prefixes]
            for w2 in w[1:]:
                torch.testing.assert_close(w2, w[0])
            g_idx = w[0]

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=False,
        )

    def get_weights_row(self, weights: Weights, prefix: str):
        log_once(logger.info, "Using GPTQ-Marlin kernels")
        try:
            weight_packed = weights.get_sharded(f"{prefix}.weight_packed", dim=1).t()
        except RuntimeError:
            raise RuntimeError(
                f"Cannot load w{self.weights.num_bits}a16 weight, make sure the model is already quantized"
            )

        zero_point = None
        if not self.weights.symmetric:
            if self.desc_act or self.groupsize == -1:
                zero_point = weights.get_tensor(f"{prefix}.weight_zero_point").t()
            else:
                zero_point = weights.get_sharded(
                    f"{prefix}.weight_zero_point", dim=1
                ).t()

        g_idx = None
        if self.desc_act:
            g_idx = weights.get_sharded(f"{prefix}.g_idx", dim=0)

        if self.desc_act or self.groupsize == -1:
            scales = weights.get_tensor(f"{prefix}.weight_scale").t()
        else:
            scales = weights.get_sharded(f"{prefix}.weight_scale", dim=1).t()

        sharded_in_features = weights.process_group.size() > 1

        return repack_gptq_for_marlin(
            qweight=weight_packed.contiguous(),
            scales=scales,
            qzeros=zero_point,
            g_idx=g_idx,
            bits=self.weights.num_bits,
            desc_act=self.desc_act,
            groupsize=self.groupsize,
            quant_method="compressed-tensors",
            sym=self.weights.symmetric,
            sharded_infeatures=sharded_in_features,
        )
@@ -29,7 +29,7 @@ else:
    CUTLASS_FP8_AVAILABLE = False


def get_fp8_linear() -> Type[torch.nn.Module]:
def get_fp8_linear(force_w8a16: bool = False) -> Type[torch.nn.Module]:
    """
    Return an FP8 linear `Module` that is compatible with the current system.
    """

@@ -37,7 +37,14 @@ def get_fp8_linear() -> Type[torch.nn.Module]:
    if SYSTEM == "cuda":

        major, _ = torch.cuda.get_device_capability()
        if major == 8 and os.getenv("USE_CUTLASS_W8A8", "0") != "1":
        # Marlin is W8A16, use it when:
        #
        # - On capability 8.x where x < 8: W8A8 FP8 GEMM is not supported.
        # - On capability 8.9: W8A8 FP8 GEMM is supported, but Marlin-FP8 is faster.
        # - On capability 9.x when force_w8a16: cutlass kernels do not support W8A16.
        if (major == 8 or (major == 9 and force_w8a16)) and os.getenv(
            "USE_CUTLASS_W8A8", "0"
        ) != "1":
            # NOTE: Capability 8.9 is supported by cutlass kernels, but FP8-Marlin
            # gives better decoding throughput on L4 and L40.
            from text_generation_server.layers.marlin import GPTQMarlinFP8Linear
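
Condensed into a standalone predicate, the new kernel-selection rule reads as follows; this is a sketch of the logic above, not code from the diff:

```python
import os


def prefers_marlin_fp8(major: int, force_w8a16: bool) -> bool:
    """True when the W8A16 Marlin kernel should back the FP8 linear layer."""
    cutlass_requested = os.getenv("USE_CUTLASS_W8A8", "0") == "1"
    # All capability-8.x GPUs, plus 9.x when the checkpoint forces W8A16,
    # unless the user explicitly opts into the cutlass W8A8 path.
    return (major == 8 or (major == 9 and force_w8a16)) and not cutlass_requested
```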
@@ -283,14 +290,17 @@ class Fp8Weight(Weight):
    weight_scale: Optional[torch.Tensor] = None
    input_scale: Optional[torch.Tensor] = None
    activation_scale_ub: Optional[float] = None
    force_w8a16: bool = False

    def get_linear(self, bias: torch.Tensor):
        if self.weight_scale is None:
            return get_fp8_linear().from_unquant(self.weight, bias, self.dtype)
            return get_fp8_linear(force_w8a16=self.force_w8a16).from_unquant(
                self.weight, bias, self.dtype
            )
        # This is not checked by the fbgemm kernels, but they require contiguous
        # memory. Can be non-contiguous when we e.g. expand from scalars.
        self.weight_scale = self.weight_scale.contiguous()
        return get_fp8_linear().from_fp8(
        return get_fp8_linear(force_w8a16=self.force_w8a16).from_fp8(
            weight=self.weight,
            scale=self.weight_scale,
            dtype=self.dtype,
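
The `.contiguous()` call above matters because expanding a scalar scale, as the loaders do, yields a non-contiguous view that some kernels reject. A quick demonstration:

```python
import torch

scale = torch.tensor([0.5]).expand(1024)   # view produced by expand(), stride 0
print(scale.is_contiguous())               # False
print(scale.contiguous().is_contiguous())  # True
```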
@@ -261,7 +261,7 @@ class GPTQMarlinWeight(Weight):

    def __post_init__(self):
        assert self.qweight.dtype == torch.int32
        assert self.scales.dtype == torch.float16
        assert self.scales.dtype in (torch.float16, torch.bfloat16)
        assert self.g_idx.dtype == torch.int32
        assert self.perm.dtype == torch.int32

@@ -300,7 +300,7 @@ def repack_gptq_for_marlin(
        raise RuntimeError(
            f"Repacking GPTQ weights with group size {groupsize} as Marlin is not supported, must be one of: {supported_sizes}"
        )
    if not (sym or quant_method == "awq"):
    if not (sym or quant_method == "awq" or quant_method == "compressed-tensors"):
        raise RuntimeError(
            "Repacking GPTQ weights with asymmetric quantization as Marlin is not supported."
        )

@@ -370,46 +370,23 @@ def get_model(
    compression_config = config_dict.get("compression_config", None)
    if quantization_config is not None and quantize is None:
        method = quantization_config.get("quant_method", None)
        config_groups = quantization_config.get("config_groups", None)
        if method in {"gptq", "awq", "exl2"}:
            log_master(logger.info, f"Auto selecting quantization method {method}")
            quantize = method
        elif method == "fbgemm_fp8" or method == "fp8":
            log_master(logger.info, "Auto selecting quantization method fp8")
            quantize = "fp8"
        elif config_groups is not None:
            # TODO: at some point we should probably fully parse the compression
            # configuration to know which parameters are compressed.
            for _, group in config_groups.items():
                weights_config = group.get("weights")
                if weights_config is not None:
                    if (
                        weights_config["type"] == "float"
                        and weights_config["num_bits"] == 8
                    ):
        elif method == "compressed-tensors":
            log_master(
                logger.info, "Auto selecting quantization method fp8"
                logger.info, "Auto selecting quantization method compressed-tensors"
            )
            quantize = "fp8"
            break
            quantize = "compressed-tensors"
        else:
            log_master(logger.warning, f"Unknown quantization method {method}")
    elif compression_config is not None:
        # `compression_config` renamed to `quantization_config`; support retained for backward compatibility.
        config_groups = compression_config.get("config_groups")
        if config_groups is not None:
            for _, group in config_groups.items():
                weights_config = group.get("weights")
                if weights_config is not None:
                    if (
                        weights_config["type"] == "float"
                        and weights_config["num_bits"] == 8
                    ):
                        log_master(
                            logger.info, "Auto selecting quantization method fp8"
                        )
                        quantize = "fp8"
                        break
                        log_master(logger.info, "Auto selecting quantization method compressed-tensors")
                        quantize = "compressed-tensors"

    if dtype is None:
        if quantize in ["awq", "exl2", "gptq", "marlin"]:
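
For reference, a `quantization_config` that now auto-selects `compressed-tensors` has roughly the following shape; the concrete values are illustrative and not taken from the diff:

```python
quantization_config = {
    "quant_method": "compressed-tensors",
    "format": "float-quantized",
    "config_groups": {
        "group_0": {
            "targets": ["Linear"],
            "weights": {"type": "float", "num_bits": 8, "dynamic": False},
            "input_activations": {"type": "float", "num_bits": 8, "dynamic": True},
        }
    },
    "ignore": ["lm_head"],
}

method = quantization_config.get("quant_method", None)
assert method == "compressed-tensors"  # triggers the new branch above
```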
@@ -559,7 +536,7 @@ def get_model(
        # TODO: fix how we determine model type for Mamba
        if "ssm_cfg" in config_dict:
            # *only happens in Mamba case
            model_type = "ssm"
            model_type = "mamba"
        else:
            raise RuntimeError(
                f"Could not determine model type for {model_id} revision {revision}"

@@ -23,8 +23,10 @@ from typing import Optional, List, Tuple, Any
from text_generation_server.layers.attention.kv_cache import get_kv_scales
from text_generation_server.utils.import_utils import SYSTEM

if SYSTEM != "ipex":
if SYSTEM == "rocm":
    from vllm.model_executor.layers.fused_moe import fused_moe
elif SYSTEM != "ipex":
    from moe_kernels.fused_moe import fused_moe
else:
    from intel_extension_for_pytorch.llm.modules import GatedMLPMOE

@@ -212,7 +212,7 @@ class MambaModel(nn.Module):
        try:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
        except RuntimeError:
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embeddings", weights)
            self.lm_head = SpeculativeHead.load(config, f"{prefix}.embedding", weights)
        self.config = config

    def forward(

@@ -5,7 +5,7 @@ from loguru import logger
from typing import Dict, Union
from text_generation_server.pb.generate_pb2 import GrammarType

from outlines.fsm.fsm import RegexFSM
from outlines.fsm.guide import RegexGuide
from outlines.fsm.json_schema import build_regex_from_schema
from functools import lru_cache
from typing import List, Optional, DefaultDict

@@ -482,7 +482,7 @@ class HeterogeneousProcessorWrapper(LogitsProcessor):

class GrammarLogitProcessor(LogitsProcessor):
    fsm_state: DefaultDict[int, int]
    fsm: RegexFSM
    fsm: RegexGuide

    def __init__(self, tokenizer, device, grammar, grammar_type):
        self.device = device

@@ -498,8 +499,9 @@ class GrammarLogitProcessor(LogitsProcessor):
    ):
        if fsm_grammar_state == -1 or self.fsm is None:
            return logits
        allowed_tokens = self.fsm.allowed_token_ids(fsm_grammar_state)
        allowed_tokens = self.fsm.get_next_instruction(fsm_grammar_state).tokens
        mask = torch.full_like(logits, -math.inf)
        if allowed_tokens is not None:
            mask[:, allowed_tokens] = 0
        biased_scores = logits + mask
        return biased_scores

@@ -513,7 +514,7 @@ class GrammarLogitProcessor(LogitsProcessor):
    def _advance(next_token_id, fsm_grammar_state, fsm):
        if fsm_grammar_state == -1:
            return fsm_grammar_state
        return fsm.next_state(fsm_grammar_state, next_token_id)
        return fsm.get_next_state(fsm_grammar_state, next_token_id)

    # TODO: move grammar compilation into the router
    @staticmethod

@@ -530,7 +531,7 @@ class GrammarLogitProcessor(LogitsProcessor):
            schema = "(.*?)"
        elif grammar_type == GrammarType.GRAMMAR_TYPE_REGEX:
            pass  # schema is already a regex just here for clarity
        fsm = RegexFSM(schema, tokenizer)
        fsm = RegexGuide.from_regex(schema, tokenizer)
        logger.debug(f"Compiled FSM in {time.time() - start_time:.2f}s")
        return fsm

@@ -588,7 +589,8 @@ class HeterogeneousGrammarLogitProcessor(LogitsProcessor):
            fsm = self.fsms[i]
            if fsm_grammar_states[i] == -1 or fsm is None:
                continue
            allowed_tokens = fsm.allowed_token_ids(fsm_grammar_states[i])
            allowed_tokens = fsm.get_next_instruction(fsm_grammar_states[i]).tokens
            if allowed_tokens is not None:
                mask[i, allowed_tokens] = 0
            logits[i] += mask[i]
        return logits
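
Taken together, these hunks track the outlines 0.0.x to 0.1.x API rename: `RegexFSM` becomes `RegexGuide.from_regex`, `allowed_token_ids` becomes `get_next_instruction(...).tokens`, and `next_state` becomes `get_next_state`. A hedged side-by-side sketch; the regex is arbitrary and `tokenizer` is assumed to be an outlines-adapted tokenizer:

```python
from outlines.fsm.guide import RegexGuide


def first_transition(tokenizer):
    # was: fsm = RegexFSM(r"yes|no", tokenizer)
    guide = RegexGuide.from_regex(r"yes|no", tokenizer)
    state = 0
    # was: allowed = fsm.allowed_token_ids(state)
    allowed = guide.get_next_instruction(state).tokens
    # was: state = fsm.next_state(state, token_id)
    return guide.get_next_state(state, allowed[0])
```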
@@ -27,7 +27,20 @@ class _FP8QuantizerConfig:
    activation_scale_ub: float


# We should probably do this with Pytantic JSON deserialization,
def _get_config_json(model_id: str, revision: Optional[str], filename: str):
    if os.path.exists(os.path.join(model_id, filename)):
        filename = os.path.join(model_id, filename)
    else:
        filename = hf_hub_download(model_id, filename=filename, revision=revision)
    with open(filename, "r") as f:
        return json.load(f)


# We should probably do this with Pydantic JSON deserialization,
# but for now we'll stay close to the old _set_gptq_params.
def _get_quantizer_config(model_id, revision):
    bits = 4

@@ -39,12 +52,7 @@ def _get_quantizer_config(model_id, revision):

    filename = "config.json"
    try:
        if os.path.exists(os.path.join(model_id, filename)):
            filename = os.path.join(model_id, filename)
        else:
            filename = hf_hub_download(model_id, filename=filename, revision=revision)
        with open(filename, "r") as f:
            data = json.load(f)
        data = _get_config_json(model_id, revision, filename)

        # FP8 config
        if data["quantization_config"]["quant_method"] == "fbgemm_fp8":

@@ -67,14 +75,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quantize_config.json"
        try:
            if os.path.exists(os.path.join(model_id, filename)):
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(
                    model_id, filename=filename, revision=revision
                )
            with open(filename, "r") as f:
                data = json.load(f)
            data = _get_config_json(model_id, revision, filename)
            bits = data["bits"]
            groupsize = data["group_size"]

@@ -90,14 +91,7 @@ def _get_quantizer_config(model_id, revision):
    except Exception:
        filename = "quant_config.json"
        try:
            if os.path.exists(os.path.join(model_id, filename)):
                filename = os.path.join(model_id, filename)
            else:
                filename = hf_hub_download(
                    model_id, filename=filename, revision=revision
                )
            with open(filename, "r") as f:
                data = json.load(f)
            data = _get_config_json(model_id, revision, filename)
            bits = data["w_bit"]
            groupsize = data["q_group_size"]
            desc_act = data["desc_act"]

@@ -119,6 +113,14 @@ def _get_quantizer_config(model_id, revision):
def get_loader(
    quantize: Optional[str], model_id: str, revision: Optional[str]
) -> WeightsLoader:
    if quantize == "compressed-tensors":
        config = _get_config_json(model_id, revision, "config.json")
        from text_generation_server.layers.compressed_tensors import (
            CompressedTensorsLoader,
        )

        return CompressedTensorsLoader(config)

    quantizer_config = _get_quantizer_config(model_id, revision)
    if quantize in {"awq", "gptq"}:
        from text_generation_server.layers.gptq import GPTQWeightsLoader
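
End to end, the new `compressed-tensors` path in `get_loader` amounts to reading `config.json` and handing the parsed dictionary to the loader. A minimal sketch for a local checkpoint; the directory path and helper name are illustrative:

```python
import json


def loader_for_local_checkpoint(model_dir: str):
    """Mirrors the quantize == "compressed-tensors" branch in get_loader."""
    from text_generation_server.layers.compressed_tensors import (
        CompressedTensorsLoader,
    )

    with open(f"{model_dir}/config.json") as f:
        config = json.load(f)
    return CompressedTensorsLoader(config)
```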