From f41d644a903d179915e122896aba6bc77821795a Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Thu, 23 May 2024 20:11:08 +0800
Subject: [PATCH] reenable xpu for tgi (#1939)

# What does this PR do?

Fixes # (issue)

## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other
      checks if that's the case).
- [ ] Did you read the [contributor
      guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the
      [forum](https://discuss.huggingface.co/)? Please add a link to it if
      that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed.
Feel free to tag members/contributors who may be interested in your PR.

Signed-off-by: Wang, Yi A
---
 Dockerfile_intel                              |  1 +
 .../text_generation_server/layers/rotary.py  |  2 +
 .../custom_modeling/idefics_modeling.py      |  2 +-
 .../utils/flash_attn.py                      | 79 ++++++++++---------
 .../utils/import_utils.py                    |  2 +-
 5 files changed, 45 insertions(+), 41 deletions(-)

diff --git a/Dockerfile_intel b/Dockerfile_intel
index 5bc39d64..809992e1 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -43,6 +43,7 @@ USER root
 RUN wget http://nz2.archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb && \
     dpkg -i ./libssl1.1_1.1.1f-1ubuntu2_amd64.deb
 
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
     | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
 
diff --git a/server/text_generation_server/layers/rotary.py b/server/text_generation_server/layers/rotary.py
index 198e5d8d..648d28ab 100644
--- a/server/text_generation_server/layers/rotary.py
+++ b/server/text_generation_server/layers/rotary.py
@@ -9,6 +9,8 @@ if SYSTEM == "cuda":
     import rotary_emb
 elif SYSTEM == "rocm":
     from vllm._C import ops
+elif SYSTEM == "xpu":
+    import intel_extension_for_pytorch as ipex
 
 
 def _create_inv_freq(dim, base, device):
diff --git a/server/text_generation_server/models/custom_modeling/idefics_modeling.py b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
index d0c84308..786ef559 100644
--- a/server/text_generation_server/models/custom_modeling/idefics_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/idefics_modeling.py
@@ -62,7 +62,7 @@ if SYSTEM == "cuda":
 elif SYSTEM == "rocm":
     from vllm._C import ops
 else:
-    raise RuntimeError(f"Unsupported system {SYSTEM}")
+    dropout_layer_norm = None
 
 
 @dataclass
diff --git a/server/text_generation_server/utils/flash_attn.py b/server/text_generation_server/utils/flash_attn.py
index 9ac5655c..4f5cf10b 100644
--- a/server/text_generation_server/utils/flash_attn.py
+++ b/server/text_generation_server/utils/flash_attn.py
@@ -5,7 +5,9 @@ from loguru import logger
 import math
 
 from text_generation_server.utils.import_utils import SYSTEM
-from text_generation_server.utils.flash_attn_triton import triton_attention
+
+if SYSTEM != "xpu":
+    from text_generation_server.utils.flash_attn_triton import triton_attention
 
 if os.getenv("USE_FLASH_ATTENTION", "").lower() == "false":
     raise ImportError("`USE_FLASH_ATTENTION` is false.")
@@ -15,43 +17,6 @@ HAS_FLASH_ATTN_V2_ROCM = False
 ROCM_USE_FLASH_ATTN_V2_CK = False
 ROCM_USE_FLASH_ATTN_V2_TRITON = False
 
-if SYSTEM == "xpu":
-    import intel_extension_for_pytorch as ipex
-
-    def attention(
-        q,
-        k,
-        v,
-        out,
-        cu_seqlens,
-        max_s,
-        softmax_scale,
-        window_size_left=-1,
-    ):
-        if window_size_left <= 0 and window_size_left != -1:
-            raise ValueError("`window_size_left` must be > 0 or -1")
-
-        if window_size_left != -1:
-            raise ValueError(
-                f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
-            )
-        return ipex.llm.functional.varlen_attention(
-            q,
-            k,
-            v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0.0,
-            softmax_scale,
-            False,
-            True,
-            False,
-            None,
-        )
-
 
 if SYSTEM in {"cuda", "rocm"}:
     if not torch.cuda.is_available():
@@ -124,8 +89,44 @@ if SYSTEM in {"cuda", "rocm"}:
             logger.warning(f"Unable to use Flash Attention V2: {e}")
             HAS_FLASH_ATTN = True
 
+if SYSTEM == "xpu":
+    import intel_extension_for_pytorch as ipex
 
-if HAS_FLASH_ATTN_V2_CUDA:
+    def attention(
+        q,
+        k,
+        v,
+        out,
+        cu_seqlens,
+        max_s,
+        softmax_scale,
+        window_size_left=-1,
+    ):
+        if window_size_left <= 0 and window_size_left != -1:
+            raise ValueError("`window_size_left` must be > 0 or -1")
+
+        if window_size_left != -1:
+            raise ValueError(
+                f"XPU version of Flash Attention does not support window attention (window_size_left != -1, got window_size_left={window_size_left})."
+            )
+        return ipex.llm.functional.varlen_attention(
+            q,
+            k,
+            v,
+            out,
+            cu_seqlens,
+            cu_seqlens,
+            max_s,
+            max_s,
+            0.0,
+            softmax_scale,
+            False,
+            True,
+            False,
+            None,
+        )
+
+elif HAS_FLASH_ATTN_V2_CUDA:
 
     def attention(
         q,
diff --git a/server/text_generation_server/utils/import_utils.py b/server/text_generation_server/utils/import_utils.py
index f54987eb..40e57646 100644
--- a/server/text_generation_server/utils/import_utils.py
+++ b/server/text_generation_server/utils/import_utils.py
@@ -17,7 +17,7 @@ def get_cuda_free_memory(device, memory_fraction):
     return free_memory
 
 
-def get_xpu_free_memory(device):
+def get_xpu_free_memory(device, memory_fraction):
     total_gpu_memory = torch.xpu.get_device_properties(device).total_memory
     free_memory = int(total_gpu_memory * 0.5)
     return free_memory
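
For reviewers who want to see the resulting control flow in isolation, below is a minimal sketch (not part of the patch) of the dispatch that `flash_attn.py` ends up with: the xpu branch is checked first and calls `ipex.llm.functional.varlen_attention` with the same positional arguments as in the diff, while other systems fall through to the existing CUDA/ROCm paths. The `SYSTEM` detection here is a simplified, assumed stand-in for `text_generation_server.utils.import_utils`.

```python
import torch

# Simplified stand-in for text_generation_server.utils.import_utils.SYSTEM.
SYSTEM = "xpu" if hasattr(torch, "xpu") and torch.xpu.is_available() else "cuda"

if SYSTEM == "xpu":
    # Only importable in an Intel Extension for PyTorch environment.
    import intel_extension_for_pytorch as ipex

    def attention(q, k, v, out, cu_seqlens, max_s, softmax_scale, window_size_left=-1):
        # Same window checks as in the diff: the xpu kernel has no sliding-window support.
        if window_size_left <= 0 and window_size_left != -1:
            raise ValueError("`window_size_left` must be > 0 or -1")
        if window_size_left != -1:
            raise ValueError("XPU version of Flash Attention does not support window attention")
        # Same positional arguments as the ipex call in the diff.
        return ipex.llm.functional.varlen_attention(
            q, k, v, out,
            cu_seqlens, cu_seqlens, max_s, max_s,
            0.0, softmax_scale, False, True, False, None,
        )
else:
    # CUDA/ROCm paths are untouched by this patch; omitted in this sketch.
    def attention(*args, **kwargs):
        raise NotImplementedError("see flash_attn.py for the CUDA/ROCm implementations")
```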