add intel xpu support for TGI (#1475)
# What does this PR do? <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> --------- Signed-off-by: Wang, Yi A <yi.a.wang@intel.com> Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com> Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
parent
f9cf345625
commit
45ecf9d040
|
@ -274,12 +274,105 @@ jobs:
|
|||
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
|
||||
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-rocm,mode=min
|
||||
|
||||
build-and-push-image-intel:
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-build-and-push-image-intel-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
needs:
|
||||
- start-runner
|
||||
- build-and-push-image # Wait for the main docker image to be built
|
||||
- integration-tests # Wait for the main integration-tests
|
||||
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
# This is used to complete the identity challenge
|
||||
# with sigstore/fulcio when running outside of PRs.
|
||||
id-token: write
|
||||
security-events: write
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
- name: Initialize Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2.0.0
|
||||
with:
|
||||
install: true
|
||||
- name: Inject slug/short variables
|
||||
uses: rlespinasse/github-slug-action@v4.4.1
|
||||
- name: Tailscale
|
||||
uses: tailscale/github-action@7bd8039bf25c23c4ab1b8d6e2cc2da2280601966
|
||||
with:
|
||||
authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
|
||||
- name: Login to GitHub Container Registry
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Login to internal Container Registry
|
||||
uses: docker/login-action@v2.1.0
|
||||
with:
|
||||
username: ${{ secrets.TAILSCALE_DOCKER_USERNAME }}
|
||||
password: ${{ secrets.TAILSCALE_DOCKER_PASSWORD }}
|
||||
registry: registry.internal.huggingface.tech
|
||||
- name: Login to Azure Container Registry
|
||||
if: github.event_name != 'pull_request'
|
||||
uses: docker/login-action@v2.1.0
|
||||
with:
|
||||
username: ${{ secrets.AZURE_DOCKER_USERNAME }}
|
||||
password: ${{ secrets.AZURE_DOCKER_PASSWORD }}
|
||||
registry: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io
|
||||
# If pull request
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
if: ${{ github.event_name == 'pull_request' }}
|
||||
id: meta-pr
|
||||
uses: docker/metadata-action@v4.3.0
|
||||
with:
|
||||
images: |
|
||||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
|
||||
tags: |
|
||||
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
|
||||
# If main, release or tag
|
||||
- name: Extract metadata (tags, labels) for Docker
|
||||
if: ${{ github.event_name != 'pull_request' }}
|
||||
id: meta
|
||||
uses: docker/metadata-action@v4.3.0
|
||||
with:
|
||||
flavor: |
|
||||
latest=false
|
||||
images: |
|
||||
registry.internal.huggingface.tech/api-inference/community/text-generation-inference
|
||||
ghcr.io/huggingface/text-generation-inference
|
||||
db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference
|
||||
tags: |
|
||||
type=semver,pattern={{version}}-intel
|
||||
type=semver,pattern={{major}}.{{minor}}-intel
|
||||
type=raw,value=latest-intel,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
|
||||
type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}-intel
|
||||
- name: Build and push Docker image
|
||||
id: build-and-push
|
||||
uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile_intel
|
||||
push: true
|
||||
platforms: 'linux/amd64'
|
||||
build-args: |
|
||||
GIT_SHA=${{ env.GITHUB_SHA }}
|
||||
DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}-intel
|
||||
tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
|
||||
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
|
||||
cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache-intel,mode=min
|
||||
|
||||
stop-runner:
|
||||
name: Stop self-hosted EC2 runner
|
||||
needs:
|
||||
- start-runner
|
||||
- build-and-push-image
|
||||
- build-and-push-image-rocm
|
||||
- build-and-push-image-intel
|
||||
- integration-tests
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
FROM lukemathwalker/cargo-chef:latest-rust-1.75 AS chef
|
||||
WORKDIR /usr/src
|
||||
|
||||
ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
|
||||
|
||||
FROM chef as planner
|
||||
COPY Cargo.toml Cargo.toml
|
||||
COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY launcher launcher
|
||||
RUN cargo chef prepare --recipe-path recipe.json
|
||||
|
||||
FROM chef AS builder
|
||||
|
||||
ARG GIT_SHA
|
||||
ARG DOCKER_LABEL
|
||||
|
||||
RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
|
||||
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
|
||||
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
|
||||
unzip -o $PROTOC_ZIP -d /usr/local 'include/*' && \
|
||||
rm -f $PROTOC_ZIP
|
||||
|
||||
COPY --from=planner /usr/src/recipe.json recipe.json
|
||||
RUN cargo chef cook --release --recipe-path recipe.json
|
||||
|
||||
COPY Cargo.toml Cargo.toml
|
||||
COPY rust-toolchain.toml rust-toolchain.toml
|
||||
COPY proto proto
|
||||
COPY benchmark benchmark
|
||||
COPY router router
|
||||
COPY launcher launcher
|
||||
RUN cargo build --release
|
||||
|
||||
|
||||
# Text Generation Inference base image for Intel
|
||||
FROM intel/intel-extension-for-pytorch:2.1.10-xpu as base
|
||||
|
||||
USER root
|
||||
# libssl.so.1.1 is not installed on Ubuntu 22.04 by default, install it
|
||||
RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
|
||||
dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
|
||||
|
||||
|
||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
|
||||
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
|
||||
|
||||
RUN apt-get update && apt install -y intel-basekit xpu-smi cmake python3-dev ninja-build
|
||||
|
||||
# Text Generation Inference base env
|
||||
ENV HUGGINGFACE_HUB_CACHE=/data \
|
||||
HF_HUB_ENABLE_HF_TRANSFER=1 \
|
||||
PORT=80
|
||||
|
||||
|
||||
WORKDIR /usr/src
|
||||
# Build pytorch and ipex
|
||||
RUN git clone https://github.com/intel/intel-extension-for-pytorch && cd intel-extension-for-pytorch && git checkout -b xpu_main origin/xpu-main
|
||||
RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && git checkout 209f2fa8ff86652f67d75c2f19bf9cb9942fd018 && git apply /usr/src/intel-extension-for-pytorch/torch_patches/00*.patch
|
||||
|
||||
# Install server
|
||||
COPY proto proto
|
||||
COPY server server
|
||||
COPY server/Makefile server/Makefile
|
||||
RUN cd server && \
|
||||
make gen-server && \
|
||||
pip install -r requirements_cuda.txt && \
|
||||
pip install ".[accelerate, peft, outlines]" --no-cache-dir
|
||||
|
||||
ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
|
||||
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
|
||||
ENV FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric
|
||||
ENV DIAGUTIL_PATH=/opt/intel/oneapi/compiler/latest/etc/compiler/sys_check/sys_check.sh
|
||||
ENV CCL_CONFIGURATION=cpu_gpu_dpcpp
|
||||
ENV MANPATH=/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/mpi/latest/share/man:/opt/intel/oneapi/compiler/latest/share/man
|
||||
ENV CMAKE_PREFIX_PATH=/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/compiler/latest
|
||||
ENV CMPLR_ROOT=/opt/intel/oneapi/compiler/latest
|
||||
ENV LIBRARY_PATH=/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/compiler/latest/lib
|
||||
ENV OCL_ICD_FILENAMES=libintelocl_emu.so:libalteracl.so:/opt/intel/oneapi/compiler/latest/lib/libintelocl.so
|
||||
ENV CLASSPATH=/opt/intel/oneapi/mpi/latest/share/java/mpi.jar:/opt/intel/oneapi/mpi/latest/share/java/mpi.jar
|
||||
ENV LD_LIBRARY_PATH=/opt/intel/oneapi/ccl/latest/lib/:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/lib:/opt/intel/oneapi/lib/intel64:
|
||||
ENV MKLROOT=/opt/intel/oneapi/mkl/latest
|
||||
ENV NLSPATH=/opt/intel/oneapi/mkl/latest/share/locale/%l_%t/%N:/opt/intel/oneapi/compiler/latest/lib/locale/%l_%t/%N
|
||||
ENV PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/compiler/latest/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
ENV CPATH=/opt/intel/oneapi/mpi/latest/include:/opt/intel/oneapi/ccl/latest/include:/opt/intel/oneapi/mkl/latest/include
|
||||
ENV CCL_ZE_IPC_EXCHANGE=sockets
|
||||
|
||||
|
||||
RUN pip uninstall -y torch && cd pytorch && git submodule update --init --recursive && python setup.py install
|
||||
RUN pip uninstall -y intel-extension-for-pytorch && cd intel-extension-for-pytorch && git submodule update --init --recursive && USE_AOT_DEVLIST='pvc' BUILD_SEPARATE_OPS=ON BUILD_WITH_CPU=ON USE_XETLA=ON python setup.py install
|
||||
|
||||
# Install benchmarker
|
||||
COPY --from=builder /usr/src/target/release/text-generation-benchmark /usr/local/bin/text-generation-benchmark
|
||||
# Install router
|
||||
COPY --from=builder /usr/src/target/release/text-generation-router /usr/local/bin/text-generation-router
|
||||
# Install launcher
|
||||
COPY --from=builder /usr/src/target/release/text-generation-launcher /usr/local/bin/text-generation-launcher
|
||||
|
||||
# Final image
|
||||
FROM base
|
||||
|
||||
ENTRYPOINT ["text-generation-launcher"]
|
||||
CMD ["--json-output"]
|
|
@ -7,14 +7,17 @@ pub(crate) struct Env {
|
|||
git_sha: &'static str,
|
||||
docker_label: &'static str,
|
||||
nvidia_env: String,
|
||||
xpu_env: String,
|
||||
}
|
||||
|
||||
impl Env {
|
||||
pub fn new() -> Self {
|
||||
let nvidia_env = nvidia_smi();
|
||||
let xpu_env = xpu_smi();
|
||||
|
||||
Self {
|
||||
nvidia_env: nvidia_env.unwrap_or("N/A".to_string()),
|
||||
xpu_env: xpu_env.unwrap_or("N/A".to_string()),
|
||||
cargo_target: env!("VERGEN_CARGO_TARGET_TRIPLE"),
|
||||
cargo_version: env!("VERGEN_RUSTC_SEMVER"),
|
||||
git_sha: option_env!("VERGEN_GIT_SHA").unwrap_or("N/A"),
|
||||
|
@ -31,7 +34,8 @@ impl fmt::Display for Env {
|
|||
writeln!(f, "Cargo version: {}", self.cargo_version)?;
|
||||
writeln!(f, "Commit sha: {}", self.git_sha)?;
|
||||
writeln!(f, "Docker label: {}", self.docker_label)?;
|
||||
write!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
|
||||
writeln!(f, "nvidia-smi:\n{}", self.nvidia_env)?;
|
||||
write!(f, "xpu-smi:\n{}", self.xpu_env)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -43,3 +47,10 @@ fn nvidia_smi() -> Option<String> {
|
|||
let output = nvidia_smi.replace('\n', "\n ");
|
||||
Some(output.trim().to_string())
|
||||
}
|
||||
|
||||
fn xpu_smi() -> Option<String> {
|
||||
let output = Command::new("xpu-smi").arg("discovery").output().ok()?;
|
||||
let xpu_smi = String::from_utf8(output.stdout).ok()?;
|
||||
let output = xpu_smi.replace('\n', "\n ");
|
||||
Some(output.trim().to_string())
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ import math
|
|||
import torch
|
||||
|
||||
from typing import Optional, List, Tuple
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
|
||||
BLOCK_SIZE: int = 16
|
||||
# Will be set in warmup
|
||||
|
@ -24,7 +25,10 @@ class CacheManager:
|
|||
self.repeat_slots = repeat_slots
|
||||
|
||||
element_size = torch.tensor([], dtype=dtype).element_size()
|
||||
x = self.block_size // element_size
|
||||
if IS_XPU_SYSTEM:
|
||||
x = 1
|
||||
else:
|
||||
x = self.block_size // element_size
|
||||
|
||||
self.kv_cache = [
|
||||
(
|
||||
|
|
|
@ -21,8 +21,10 @@ from transformers.activations import ACT2FN
|
|||
from transformers.configuration_utils import PretrainedConfig
|
||||
from typing import Optional, List, Tuple, Any
|
||||
from loguru import logger
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
if not IS_XPU_SYSTEM:
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from text_generation_server.utils import paged_attention, flash_attn
|
||||
from text_generation_server.utils.layers import (
|
||||
FastLinear,
|
||||
|
|
|
@ -24,7 +24,10 @@ import torch.distributed
|
|||
import numpy as np
|
||||
|
||||
from torch import nn
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
|
||||
if not IS_XPU_SYSTEM:
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from typing import Optional, List, Tuple
|
||||
|
|
|
@ -33,7 +33,7 @@ from text_generation_server.utils import StoppingCriteria, HeterogeneousNextToke
|
|||
from text_generation_server.utils.dist import MEMORY_FRACTION
|
||||
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM, IS_XPU_SYSTEM
|
||||
|
||||
@dataclass
|
||||
class FlashCausalLMBatch(Batch):
|
||||
|
@ -752,7 +752,10 @@ class FlashCausalLM(Model):
|
|||
|
||||
def warmup(self, batch: FlashCausalLMBatch):
|
||||
# The warmup batch is the biggest batch we could ever receive
|
||||
torch.cuda.empty_cache()
|
||||
if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
|
||||
torch.cuda.empty_cache()
|
||||
elif IS_XPU_SYSTEM:
|
||||
torch.xpu.empty_cache()
|
||||
try:
|
||||
cache_manager = set_cache_manager(
|
||||
batch.blocks,
|
||||
|
@ -772,7 +775,10 @@ class FlashCausalLM(Model):
|
|||
f"You need to decrease `--max-batch-prefill-tokens`"
|
||||
) from e
|
||||
|
||||
torch.cuda.synchronize(self.device)
|
||||
if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
|
||||
torch.cuda.synchronize(self.device)
|
||||
elif IS_XPU_SYSTEM:
|
||||
torch.xpu.synchronize(self.device)
|
||||
|
||||
# Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
|
||||
# Calculate the number of blocks that can be allocated with the free memory
|
||||
|
@ -780,12 +786,18 @@ class FlashCausalLM(Model):
|
|||
cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
|
||||
total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
|
||||
|
||||
total_free_memory, _ = torch.cuda.mem_get_info(self.device)
|
||||
total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
|
||||
if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
|
||||
total_free_memory, _ = torch.cuda.mem_get_info(self.device)
|
||||
total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
|
||||
|
||||
free_memory = max(
|
||||
0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
|
||||
)
|
||||
free_memory = max(
|
||||
0, total_free_memory - (1 - MEMORY_FRACTION) * total_gpu_memory
|
||||
)
|
||||
elif IS_XPU_SYSTEM:
|
||||
total_gpu_memory = torch.xpu.get_device_properties(self.device).total_memory
|
||||
free_memory = int(total_gpu_memory *0.5)
|
||||
else:
|
||||
raise NotImplementedError("FlashModel is only available on GPU")
|
||||
|
||||
num_blocks = (
|
||||
# Leave 5% for some wiggle room
|
||||
|
|
|
@ -18,6 +18,7 @@ from text_generation_server.utils import (
|
|||
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
|
||||
class FlashLlama(FlashCausalLM):
|
||||
def __init__(
|
||||
|
@ -33,6 +34,9 @@ class FlashLlama(FlashCausalLM):
|
|||
if torch.cuda.is_available():
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
elif IS_XPU_SYSTEM:
|
||||
device = torch.device(f"xpu:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
else:
|
||||
raise NotImplementedError("FlashLlama is only available on GPU")
|
||||
|
||||
|
|
|
@ -33,8 +33,9 @@ tracer = trace.get_tracer(__name__)
|
|||
# Will be set in init
|
||||
SLIDING_WINDOW: Optional[int] = None
|
||||
SLIDING_WINDOW_BLOCKS: Optional[int] = None
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
|
||||
MEM_POOL = torch.cuda.graph_pool_handle()
|
||||
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
||||
|
||||
|
||||
def set_sliding_window(sliding_window: int, sliding_window_blocks: int):
|
||||
|
@ -316,6 +317,9 @@ class BaseFlashMistral(FlashCausalLM):
|
|||
if torch.cuda.is_available():
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
elif IS_XPU_SYSTEM:
|
||||
device = torch.device(f"xpu:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
else:
|
||||
raise NotImplementedError("FlashMistral is only available on GPU")
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ from text_generation_server.utils import (
|
|||
weight_files,
|
||||
Weights,
|
||||
)
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
|
||||
|
@ -32,6 +32,9 @@ class FlashNeoXSharded(FlashCausalLM):
|
|||
if torch.cuda.is_available():
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
elif IS_XPU_SYSTEM:
|
||||
device = torch.device(f"xpu:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
else:
|
||||
raise NotImplementedError("FlashNeoX is only available on GPU")
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ from text_generation_server.utils import (
|
|||
weight_files,
|
||||
Weights,
|
||||
)
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
|
||||
|
@ -33,6 +33,9 @@ class FlashRWSharded(FlashCausalLM):
|
|||
if torch.cuda.is_available():
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
elif IS_XPU_SYSTEM:
|
||||
device = torch.device(f"xpu:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
else:
|
||||
raise NotImplementedError("FlashRW is only available on GPU")
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ from text_generation_server.utils import (
|
|||
Weights,
|
||||
)
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_XPU_SYSTEM
|
||||
tracer = trace.get_tracer(__name__)
|
||||
|
||||
|
||||
|
@ -35,6 +36,9 @@ class FlashSantacoderSharded(FlashCausalLM):
|
|||
if torch.cuda.is_available():
|
||||
device = torch.device(f"cuda:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
elif IS_XPU_SYSTEM:
|
||||
device = torch.device(f"xpu:{rank}")
|
||||
dtype = torch.float16 if dtype is None else dtype
|
||||
else:
|
||||
raise NotImplementedError("FlashSantacoderSharded is only available on GPU")
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
import torch
|
||||
import os
|
||||
|
||||
MEM_POOL = torch.cuda.graph_pool_handle()
|
||||
MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
|
||||
# This is overridden by the cli
|
||||
cuda_graphs = os.getenv("CUDA_GRAPHS")
|
||||
if cuda_graphs is not None and cuda_graphs != "0":
|
||||
if torch.cuda.is_available() and cuda_graphs is not None and cuda_graphs != "0":
|
||||
try:
|
||||
cuda_graphs = [int(item) for item in cuda_graphs.split(",")]
|
||||
except Exception as e:
|
||||
|
|
|
@ -57,7 +57,14 @@ def initialize_torch_distributed():
|
|||
options.is_high_priority_stream = True
|
||||
options._timeout = timedelta(seconds=60)
|
||||
else:
|
||||
backend = "gloo"
|
||||
try:
|
||||
import oneccl_bindings_for_pytorch
|
||||
|
||||
backend = "ccl"
|
||||
if os.getenv("CCL_WORKER_COUNT", None) is None:
|
||||
os.environ["CCL_WORKER_COUNT"] = str(1)
|
||||
except ImportError:
|
||||
backend = "gloo"
|
||||
options = None
|
||||
|
||||
if WORLD_SIZE == 1:
|
||||
|
|
|
@ -2,69 +2,81 @@ import os
|
|||
import torch
|
||||
|
||||
from loguru import logger
|
||||
import math
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM
|
||||
from text_generation_server.utils.import_utils import (
|
||||
IS_CUDA_SYSTEM,
|
||||
IS_ROCM_SYSTEM,
|
||||
IS_XPU_SYSTEM,
|
||||
)
|
||||
|
||||
if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
|
||||
raise ImportError("`USE_FLASH_ATTENTION` is false.")
|
||||
|
||||
if not torch.cuda.is_available():
|
||||
raise ImportError("CUDA is not available")
|
||||
|
||||
major, minor = torch.cuda.get_device_capability()
|
||||
is_sm75 = major == 7 and minor == 5
|
||||
is_sm8x = major == 8 and minor >= 0
|
||||
is_sm90 = major == 9 and minor == 0
|
||||
|
||||
HAS_FLASH_ATTN = False
|
||||
HAS_FLASH_ATTN = True
|
||||
HAS_FLASH_ATTN_V2_CUDA = False
|
||||
HAS_FLASH_ATTN_V2_ROCM = False
|
||||
try:
|
||||
|
||||
if IS_XPU_SYSTEM:
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM:
|
||||
if not torch.cuda.is_available():
|
||||
raise ImportError("CUDA is not available")
|
||||
|
||||
major, minor = torch.cuda.get_device_capability()
|
||||
is_sm75 = major == 7 and minor == 5
|
||||
is_sm8x = major == 8 and minor >= 0
|
||||
is_sm90 = major == 9 and minor == 0
|
||||
|
||||
HAS_FLASH_ATTN = False
|
||||
HAS_FLASH_ATTN_V2_CUDA = False
|
||||
HAS_FLASH_ATTN_V2_ROCM = False
|
||||
try:
|
||||
import flash_attn_2_cuda
|
||||
except ImportError:
|
||||
architecture_suffix = ""
|
||||
if IS_CUDA_SYSTEM:
|
||||
architecture_suffix = "-cuda"
|
||||
try:
|
||||
import flash_attn_2_cuda
|
||||
except ImportError:
|
||||
architecture_suffix = ""
|
||||
if IS_CUDA_SYSTEM:
|
||||
architecture_suffix = "-cuda"
|
||||
elif IS_ROCM_SYSTEM:
|
||||
architecture_suffix = "-rocm"
|
||||
raise ImportError(
|
||||
"Flash Attention V2 is not installed.\n"
|
||||
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
|
||||
f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
|
||||
)
|
||||
if not (is_sm8x or is_sm90):
|
||||
raise ImportError(
|
||||
f"GPU with CUDA capability {major} {minor} is not supported for "
|
||||
"Flash Attention V2"
|
||||
)
|
||||
HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM
|
||||
HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM
|
||||
except ImportError as e:
|
||||
try:
|
||||
import flash_attn_cuda
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Flash Attention is not installed.\n"
|
||||
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
|
||||
"or install flash attention with `cd server && make install install-flash-attention`"
|
||||
) from e
|
||||
|
||||
if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90):
|
||||
raise ImportError(
|
||||
f"GPU with CUDA capability {major} {minor} is not supported"
|
||||
) from e
|
||||
elif IS_ROCM_SYSTEM:
|
||||
architecture_suffix = "-rocm"
|
||||
raise ImportError(
|
||||
"Flash Attention V2 is not installed.\n"
|
||||
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
|
||||
f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
|
||||
)
|
||||
if not (is_sm8x or is_sm90):
|
||||
raise ImportError(
|
||||
f"GPU with CUDA capability {major} {minor} is not supported for "
|
||||
"Flash Attention V2"
|
||||
)
|
||||
HAS_FLASH_ATTN_V2_CUDA = IS_CUDA_SYSTEM
|
||||
HAS_FLASH_ATTN_V2_ROCM = IS_ROCM_SYSTEM
|
||||
except ImportError as e:
|
||||
try:
|
||||
import flash_attn_cuda
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Flash Attention is not installed.\n"
|
||||
"Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
|
||||
"or install flash attention with `cd server && make install install-flash-attention`"
|
||||
) from e
|
||||
for idx in range(torch.cuda.device_count()):
|
||||
if "MI210" not in torch.cuda.get_device_name(
|
||||
idx
|
||||
) and "MI250" not in torch.cuda.get_device_name(idx):
|
||||
raise ImportError(
|
||||
f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
|
||||
)
|
||||
|
||||
if IS_CUDA_SYSTEM and not (is_sm75 or is_sm8x or is_sm90):
|
||||
raise ImportError(
|
||||
f"GPU with CUDA capability {major} {minor} is not supported"
|
||||
) from e
|
||||
elif IS_ROCM_SYSTEM:
|
||||
for idx in range(torch.cuda.device_count()):
|
||||
if "MI210" not in torch.cuda.get_device_name(
|
||||
idx
|
||||
) and "MI250" not in torch.cuda.get_device_name(idx):
|
||||
raise ImportError(
|
||||
f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
|
||||
)
|
||||
|
||||
logger.warning(f"Unable to use Flash Attention V2: {e}")
|
||||
HAS_FLASH_ATTN = True
|
||||
logger.warning(f"Unable to use Flash Attention V2: {e}")
|
||||
HAS_FLASH_ATTN = True
|
||||
|
||||
|
||||
def attention(
|
||||
|
@ -80,6 +92,28 @@ def attention(
|
|||
if window_size_left <= 0 and window_size_left != -1:
|
||||
raise ValueError("`window_size_left` must be > 0 or -1")
|
||||
|
||||
if IS_XPU_SYSTEM:
|
||||
if window_size_left != -1:
|
||||
raise ValueError(
|
||||
f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
|
||||
)
|
||||
return ipex.llm.functional.varlen_attention(
|
||||
q,
|
||||
k,
|
||||
v,
|
||||
out,
|
||||
cu_seqlens,
|
||||
cu_seqlens,
|
||||
max_s,
|
||||
max_s,
|
||||
0.0,
|
||||
softmax_scale,
|
||||
False,
|
||||
True,
|
||||
False,
|
||||
None,
|
||||
)
|
||||
|
||||
if HAS_FLASH_ATTN_V2_CUDA:
|
||||
return flash_attn_2_cuda.varlen_fwd(
|
||||
q,
|
||||
|
|
|
@ -1,4 +1,13 @@
|
|||
import torch
|
||||
|
||||
def is_xpu_available():
|
||||
try:
|
||||
import intel_extension_for_pytorch
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
return hasattr(torch, "xpu") and torch.xpu.is_available()
|
||||
|
||||
IS_ROCM_SYSTEM = torch.version.hip is not None
|
||||
IS_CUDA_SYSTEM = torch.version.cuda is not None
|
||||
IS_XPU_SYSTEM = is_xpu_available()
|
||||
|
|
|
@ -18,7 +18,14 @@ except ImportError:
|
|||
from accelerate import init_empty_weights
|
||||
|
||||
from text_generation_server.utils.gptq.quant_linear import QuantLinear
|
||||
from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM
|
||||
from text_generation_server.utils.import_utils import (
|
||||
IS_CUDA_SYSTEM,
|
||||
IS_ROCM_SYSTEM,
|
||||
IS_XPU_SYSTEM,
|
||||
)
|
||||
|
||||
if IS_XPU_SYSTEM:
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
HAS_AWQ = True
|
||||
try:
|
||||
|
@ -812,7 +819,15 @@ try:
|
|||
|
||||
class FastLayerNorm(nn.LayerNorm):
|
||||
def forward(self, hidden_states, residual=None):
|
||||
if hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
|
||||
if IS_XPU_SYSTEM:
|
||||
res_out = hidden_states
|
||||
out = ipex.llm.functional.add_layer_norm(
|
||||
residual, hidden_states, self.weight, self.bias, self.eps, True
|
||||
)
|
||||
if residual is not None:
|
||||
res_out = residual
|
||||
return out, res_out
|
||||
elif hidden_states.shape[-1] > 8192 or IS_ROCM_SYSTEM:
|
||||
if residual is not None:
|
||||
hidden_states += residual
|
||||
residual = hidden_states
|
||||
|
@ -858,7 +873,20 @@ try:
|
|||
return cls(weight, eps)
|
||||
|
||||
def forward(self, hidden_states, residual=None):
|
||||
if hidden_states.shape[-1] > 8192:
|
||||
if IS_XPU_SYSTEM:
|
||||
residual_out = hidden_states
|
||||
out = ipex.llm.functional.add_rms_norm(
|
||||
residual,
|
||||
hidden_states,
|
||||
self.weight,
|
||||
None,
|
||||
self.variance_epsilon,
|
||||
True,
|
||||
)
|
||||
if residual is not None:
|
||||
residual_out = residual
|
||||
return out, residual_out
|
||||
elif hidden_states.shape[-1] > 8192:
|
||||
if residual is not None:
|
||||
hidden_states += residual
|
||||
residual = hidden_states
|
||||
|
@ -984,6 +1012,10 @@ try:
|
|||
|
||||
# Inplace operation, updating query and key.
|
||||
pos_encoding_ops.rotary_embedding(query, key, head_size, cos, sin, True)
|
||||
elif IS_XPU_SYSTEM:
|
||||
ipex.llm.functional.rotary_embedding(
|
||||
query, key, sin, cos, query.size(-1), True
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Your system seem to be not supported. Please check your install or open an issue at https://github.com/huggingface/text-generation-inference/issues with a clear reproduction."
|
||||
|
@ -1103,6 +1135,7 @@ try:
|
|||
|
||||
cos = torch.index_select(self._cos_cached, 0, position_ids)
|
||||
sin = torch.index_select(self._sin_cached, 0, position_ids)
|
||||
|
||||
# Note: this unsqueeze is not necessary on RoCm + VLLM ROPE implementation, but we leave it as is to avoid yet an other controlflow.
|
||||
return cos.unsqueeze(1), sin.unsqueeze(1)
|
||||
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
import torch
|
||||
|
||||
from text_generation_server.utils.import_utils import IS_CUDA_SYSTEM, IS_ROCM_SYSTEM
|
||||
from text_generation_server.utils.import_utils import (
|
||||
IS_CUDA_SYSTEM,
|
||||
IS_ROCM_SYSTEM,
|
||||
IS_XPU_SYSTEM,
|
||||
)
|
||||
|
||||
_PARTITION_SIZE = 512
|
||||
|
||||
if IS_XPU_SYSTEM:
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
|
||||
def reshape_and_cache(
|
||||
key: torch.Tensor,
|
||||
|
@ -22,6 +28,10 @@ def reshape_and_cache(
|
|||
from vllm import cache_ops
|
||||
|
||||
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots)
|
||||
elif IS_XPU_SYSTEM:
|
||||
ipex.llm.modules.PagedAttention.reshape_and_cache(
|
||||
key, value, key_cache, value_cache, slots
|
||||
)
|
||||
else:
|
||||
raise ValueError("vllm is not supported on your system")
|
||||
|
||||
|
@ -58,6 +68,22 @@ def attention(
|
|||
block_size = value_cache.shape[3]
|
||||
num_seqs, num_heads, head_size = query.shape
|
||||
max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
|
||||
if IS_XPU_SYSTEM:
|
||||
query = query.contiguous()
|
||||
return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
|
||||
out,
|
||||
query,
|
||||
key_cache,
|
||||
value_cache,
|
||||
kv_head_mapping,
|
||||
softmax_scale,
|
||||
block_tables,
|
||||
input_lengths,
|
||||
block_size,
|
||||
max_s,
|
||||
None,
|
||||
)
|
||||
|
||||
# NOTE(woosuk): We use a simple heuristic to decide whether to use
|
||||
# PagedAttention V1 or V2. If the number of partitions is 1, we use
|
||||
# V1 to avoid the overhead of reduction. Also, if the number of
|
||||
|
|
Loading…
Reference in New Issue