Fix issues

2024-08-06 10:29:46 +00:00 · 2024-08-06 10:29:46 +00:00 · 5788c942a5
parent 55e6059eb1
commit 5788c942a5
3 changed files with 19 additions and 20 deletions
--- a/35
+++ b/35
@@ -98,26 +98,25 @@ RUN pip uninstall -y triton && \
    cd triton/python && \
    pip install .

-# RUN git clone --depth 1 --recursive --single-branch --branch 2.3-patched https://github.com/fxmarty/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir
+RUN git clone --depth 1 --recursive --single-branch --branch main https://github.com/pytorch/pytorch.git pytorch && cd pytorch && pip install -r requirements.txt --no-cache-dir && \
+    git checkout da320214e66b5af0f7db8fd18a64dbb519d17b27

-# ARG _GLIBCXX_USE_CXX11_ABI="1"
-# ARG CMAKE_PREFIX_PATH="/opt/conda"
-# ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-# ARG BUILD_CAFFE2="0" \
-#     BUILD_CAFFE2_OPS="0" \
-#     USE_CUDA="0" \
-#     USE_ROCM="1" \
-#     BUILD_TEST="0" \
-#     USE_FBGEMM="0" \
-#     USE_NNPACK="0" \
-#     USE_QNNPACK="0" \
-#     USE_XNNPACK="0" \
-#     USE_FLASH_ATTENTION="1" \
-#     USE_MEM_EFF_ATTENTION="0"
+ARG _GLIBCXX_USE_CXX11_ABI="1"
+ARG CMAKE_PREFIX_PATH="/opt/conda"
+ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ARG BUILD_CAFFE2="0" \
+    BUILD_CAFFE2_OPS="0" \
+    USE_CUDA="0" \
+    USE_ROCM="1" \
+    BUILD_TEST="0" \
+    USE_FBGEMM="0" \
+    USE_NNPACK="0" \
+    USE_QNNPACK="0" \
+    USE_XNNPACK="0" \
+    USE_FLASH_ATTENTION="1" \
+    USE_MEM_EFF_ATTENTION="0"

-# RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install
-
-RUN pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.1
+RUN cd pytorch && python tools/amd_build/build_amd.py && python setup.py install

 # Set AS recommended: https://github.com/ROCm/triton/wiki/A-script-to-set-program-execution-environment-in-ROCm
 ENV HIP_FORCE_DEV_KERNARG=1
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@@ -208,7 +208,7 @@ if ENGINE == "ck":
            softcap,
            False,
            None,
-        )
+        )[0]

 elif ENGINE == "triton":
    from .flash_attn_triton import triton_attention
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -1159,7 +1159,7 @@ class FlashCausalLM(Model):

                log_master(
                    logger.info,
-                    f"PyTorch TunableOp (https://github.com/fxmarty/pytorch/tree/2.3-patched/aten/src/ATen/cuda/tunable) is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
+                    f"PyTorch TunableOp is enabled. The warmup may take several minutes, picking the ROCm optimal matrix multiplication kernel for the target lengths {', '.join([str(seqlen) for seqlen in tuning_sequences])}, with typical 5-8% latency improvement for small sequence lengths. The picked GEMMs are saved in the file {tunableop_filepath}. To disable TunableOp, please launch TGI with `PYTORCH_TUNABLEOP_ENABLED=0`.",
                )

                if os.path.isfile(tunableop_filepath):