chore: update to torch 2.4 (#2259)

* chore: update to torch 2.4

* remove unnecessary patch

* fix
This commit is contained in:
OlivierDehaene 2024-07-23 20:39:43 +00:00 committed by GitHub
parent bc9593a5b1
commit e7e3aa6cac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 5 additions and 330 deletions

View File

@ -41,7 +41,7 @@ RUN cargo build --profile release-opt
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install
# NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099 # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
ARG PYTORCH_VERSION=2.3.0 ARG PYTORCH_VERSION=2.4.0
ARG PYTHON_VERSION=3.10 ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml # Keep in sync with `server/pyproject.toml
@ -167,8 +167,6 @@ FROM kernel-builder AS fbgemm-builder
WORKDIR /usr/src WORKDIR /usr/src
COPY server/Makefile-fbgemm Makefile COPY server/Makefile-fbgemm Makefile
COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch
COPY server/fix_torch90a.sh fix_torch90a.sh
RUN make build-fbgemm RUN make build-fbgemm
@ -254,10 +252,7 @@ COPY server/Makefile server/Makefile
RUN cd server && \ RUN cd server && \
make gen-server && \ make gen-server && \
pip install -r requirements_cuda.txt && \ pip install -r requirements_cuda.txt && \
pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \ pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
pip install nvidia-nccl-cu12==2.22.3
ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
# Deps before the binaries # Deps before the binaries
# The binaries change on every build given we burn the SHA into them # The binaries change on every build given we burn the SHA into them

View File

@ -30,7 +30,6 @@ install: install-cuda
install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm
pip install -e ".[bnb]" pip install -e ".[bnb]"
pip install nvidia-nccl-cu12==2.22.3
install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm

View File

@ -1,10 +1,8 @@
fbgemm_commit := 9cf0429b726931cfab72b8264730bea682f32fca fbgemm_commit := ddac8dd9fc0bee70a3f456df68b8aac38576c856
build-fbgemm: build-fbgemm:
chmod +x fix_torch90a.sh && ./fix_torch90a.sh && \
git clone https://github.com/pytorch/FBGEMM.git fbgemm && \ git clone https://github.com/pytorch/FBGEMM.git fbgemm && \
cp fbgemm_remove_unused.patch fbgemm && \ cd fbgemm && git fetch && git checkout $(fbgemm_commit) && \
cd fbgemm && git fetch && git checkout $(fbgemm_commit) && git apply fbgemm_remove_unused.patch && \
git submodule update --init --recursive && \ git submodule update --init --recursive && \
cd fbgemm_gpu && \ cd fbgemm_gpu && \
pip install -r requirements.txt && \ pip install -r requirements.txt && \

View File

@ -1,306 +0,0 @@
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 2244ea6f..96265a48 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -94,14 +94,14 @@ endif()
# Build Experimental Modules
################################################################################
-if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
- # TODO: Figure out NCCL/RCCL integration with ROCm
- add_subdirectory(experimental/example)
-endif()
-
-if(NOT FBGEMM_CPU_ONLY)
- add_subdirectory(experimental/gemm)
-endif()
+# if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
+# # TODO: Figure out NCCL/RCCL integration with ROCm
+# add_subdirectory(experimental/example)
+# endif()
+
+# if(NOT FBGEMM_CPU_ONLY)
+# add_subdirectory(experimental/gemm)
+# endif()
if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
# CUTLASS currently doesn't build on ROCm and CK hasnt yet been added:
diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake
index c56773fe..0c0d349e 100644
--- a/fbgemm_gpu/FbgemmGpu.cmake
+++ b/fbgemm_gpu/FbgemmGpu.cmake
@@ -446,53 +446,55 @@ set_source_files_properties(${fbgemm_sources}
################################################################################
set(fbgemm_gpu_sources_static_cpu
- codegen/training/forward/embedding_forward_split_cpu.cpp
- codegen/inference/embedding_forward_quantized_host_cpu.cpp
- codegen/training/backward/embedding_backward_dense_host_cpu.cpp
- codegen/utils/embedding_bounds_check_host_cpu.cpp
- src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
- src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
- src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
- src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
- src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
- src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
- src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
- src/input_combine_ops/input_combine_cpu.cpp
- src/layout_transform_ops/layout_transform_ops_cpu.cpp
+ # codegen/training/forward/embedding_forward_split_cpu.cpp
+ # codegen/inference/embedding_forward_quantized_host_cpu.cpp
+ # codegen/training/backward/embedding_backward_dense_host_cpu.cpp
+ # codegen/utils/embedding_bounds_check_host_cpu.cpp
+ # src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
+ # src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
+ # src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
+ # src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
+ # src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
+ # src/input_combine_ops/input_combine_cpu.cpp
+ # src/layout_transform_ops/layout_transform_ops_cpu.cpp
src/quantize_ops/quantize_ops_cpu.cpp
src/quantize_ops/quantize_ops_meta.cpp
- src/sparse_ops/sparse_ops_cpu.cpp
- src/sparse_ops/sparse_ops_meta.cpp
- src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
- src/split_embeddings_cache/linearize_cache_indices.cpp
- src/split_embeddings_cache/lfu_cache_populate_byte.cpp
- src/split_embeddings_cache/lru_cache_populate_byte.cpp
- src/split_embeddings_cache/lxu_cache.cpp
- src/split_embeddings_cache/split_embeddings_cache_ops.cpp
- codegen/training/index_select/batch_index_select_dim0_ops.cpp
- codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp)
+ # src/sparse_ops/sparse_ops_cpu.cpp
+ # src/sparse_ops/sparse_ops_meta.cpp
+ # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
+ # src/split_embeddings_cache/linearize_cache_indices.cpp
+ # src/split_embeddings_cache/lfu_cache_populate_byte.cpp
+ # src/split_embeddings_cache/lru_cache_populate_byte.cpp
+ # src/split_embeddings_cache/lxu_cache.cpp
+ # src/split_embeddings_cache/split_embeddings_cache_ops.cpp
+ # codegen/training/index_select/batch_index_select_dim0_ops.cpp
+ # codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp)
+)
if(NOT FBGEMM_CPU_ONLY)
list(APPEND fbgemm_gpu_sources_static_cpu
- codegen/inference/embedding_forward_quantized_host.cpp
- codegen/utils/embedding_bounds_check_host.cpp
- src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
- src/layout_transform_ops/layout_transform_ops_gpu.cpp
- src/memory_utils/memory_utils.cpp
- src/memory_utils/memory_utils_ops.cpp
- src/memory_utils/memory_utils_ops_cpu.cpp
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
+ # codegen/inference/embedding_forward_quantized_host.cpp
+ # codegen/utils/embedding_bounds_check_host.cpp
+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
+ # src/layout_transform_ops/layout_transform_ops_gpu.cpp
+ # src/memory_utils/memory_utils.cpp
+ # src/memory_utils/memory_utils_ops.cpp
+ # src/memory_utils/memory_utils_ops_cpu.cpp
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
src/quantize_ops/quantize_ops_gpu.cpp
- src/sparse_ops/sparse_ops_gpu.cpp
- src/split_embeddings_utils/split_embeddings_utils.cpp
- src/split_embeddings_cache/split_embeddings_cache_ops.cu
- src/metric_ops/metric_ops_host.cpp
- src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
- src/input_combine_ops/input_combine_gpu.cpp
- codegen/training/index_select/batch_index_select_dim0_host.cpp)
+ # src/sparse_ops/sparse_ops_gpu.cpp
+ # src/split_embeddings_utils/split_embeddings_utils.cpp
+ # src/split_embeddings_cache/split_embeddings_cache_ops.cu
+ # src/metric_ops/metric_ops_host.cpp
+ # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
+ # src/input_combine_ops/input_combine_gpu.cpp
+ # codegen/training/index_select/batch_index_select_dim0_host.cpp)
+ )
if(NVML_LIB_PATH OR USE_ROCM)
message(STATUS "Adding merge_pooled_embeddings sources")
@@ -516,36 +518,36 @@ endif()
if(NOT FBGEMM_CPU_ONLY)
set(fbgemm_gpu_sources_static_gpu
- codegen/utils/embedding_bounds_check.cu
- codegen/inference/embedding_forward_quantized_split_lookup.cu
- src/embedding_inplace_ops/embedding_inplace_update.cu
- src/histogram_binning_calibration_ops.cu
- src/input_combine_ops/input_combine.cu
- src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
- src/memory_utils/memory_utils.cu
- src/memory_utils/memory_utils_ops.cu
- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
- src/jagged_tensor_ops/dense_to_jagged_forward.cu
- src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
- src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu
- src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu
- src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu
- src/jagged_tensor_ops/jagged_index_add_2d_forward.cu
- src/jagged_tensor_ops/jagged_index_select_2d_forward.cu
- src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
- src/jagged_tensor_ops/jagged_softmax_backward.cu
- src/jagged_tensor_ops/jagged_softmax_forward.cu
- src/jagged_tensor_ops/jagged_tensor_ops.cu
- src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu
- src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
- src/jagged_tensor_ops/jagged_unique_indices.cu
- src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
- src/layout_transform_ops/layout_transform_ops.cu
- src/metric_ops/metric_ops.cu
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
- src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu
+ # codegen/utils/embedding_bounds_check.cu
+ # codegen/inference/embedding_forward_quantized_split_lookup.cu
+ # src/embedding_inplace_ops/embedding_inplace_update.cu
+ # src/histogram_binning_calibration_ops.cu
+ # src/input_combine_ops/input_combine.cu
+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
+ # src/memory_utils/memory_utils.cu
+ # src/memory_utils/memory_utils_ops.cu
+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
+ # src/jagged_tensor_ops/dense_to_jagged_forward.cu
+ # src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
+ # src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu
+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu
+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu
+ # src/jagged_tensor_ops/jagged_index_add_2d_forward.cu
+ # src/jagged_tensor_ops/jagged_index_select_2d_forward.cu
+ # src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
+ # src/jagged_tensor_ops/jagged_softmax_backward.cu
+ # src/jagged_tensor_ops/jagged_softmax_forward.cu
+ # src/jagged_tensor_ops/jagged_tensor_ops.cu
+ # src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu
+ # src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
+ # src/jagged_tensor_ops/jagged_unique_indices.cu
+ # src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
+ # src/layout_transform_ops/layout_transform_ops.cu
+ # src/metric_ops/metric_ops.cu
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu
src/quantize_ops/quantize_bfloat16.cu
src/quantize_ops/quantize_fp8_rowwise.cu
src/quantize_ops/quantize_fused_8bit_rowwise.cu
@@ -554,39 +556,40 @@ if(NOT FBGEMM_CPU_ONLY)
src/quantize_ops/quantize_msfp.cu
src/quantize_ops/quantize_padded_fp8_rowwise.cu
src/quantize_ops/quantize_mx.cu
- src/sparse_ops/sparse_async_cumsum.cu
- src/sparse_ops/sparse_block_bucketize_features.cu
- src/sparse_ops/sparse_bucketize_features.cu
- src/sparse_ops/sparse_batched_unary_embeddings.cu
- src/sparse_ops/sparse_compute_frequency_sequence.cu
- src/sparse_ops/sparse_expand_into_jagged_permute.cu
- src/sparse_ops/sparse_group_index.cu
- src/sparse_ops/sparse_index_add.cu
- src/sparse_ops/sparse_index_select.cu
- src/sparse_ops/sparse_invert_permute.cu
- src/sparse_ops/sparse_pack_segments_backward.cu
- src/sparse_ops/sparse_pack_segments_forward.cu
- src/sparse_ops/sparse_permute_1d.cu
- src/sparse_ops/sparse_permute_2d.cu
- src/sparse_ops/sparse_permute102.cu
- src/sparse_ops/sparse_permute_embeddings.cu
- src/sparse_ops/sparse_range.cu
- src/sparse_ops/sparse_reorder_batched_ad.cu
- src/sparse_ops/sparse_segment_sum_csr.cu
- src/sparse_ops/sparse_zipf.cu
- src/split_embeddings_cache/lfu_cache_find.cu
- src/split_embeddings_cache/lfu_cache_populate.cu
- src/split_embeddings_cache/lfu_cache_populate_byte.cu
- src/split_embeddings_cache/lru_cache_find.cu
- src/split_embeddings_cache/lru_cache_populate.cu
- src/split_embeddings_cache/lru_cache_populate_byte.cu
- src/split_embeddings_cache/lxu_cache.cu
- src/split_embeddings_cache/linearize_cache_indices.cu
- src/split_embeddings_cache/reset_weight_momentum.cu
- src/split_embeddings_utils/generate_vbe_metadata.cu
- src/split_embeddings_utils/get_infos_metadata.cu
- src/split_embeddings_utils/radix_sort_pairs.cu
- src/split_embeddings_utils/transpose_embedding_input.cu)
+ # src/sparse_ops/sparse_async_cumsum.cu
+ # src/sparse_ops/sparse_block_bucketize_features.cu
+ # src/sparse_ops/sparse_bucketize_features.cu
+ # src/sparse_ops/sparse_batched_unary_embeddings.cu
+ # src/sparse_ops/sparse_compute_frequency_sequence.cu
+ # src/sparse_ops/sparse_expand_into_jagged_permute.cu
+ # src/sparse_ops/sparse_group_index.cu
+ # src/sparse_ops/sparse_index_add.cu
+ # src/sparse_ops/sparse_index_select.cu
+ # src/sparse_ops/sparse_invert_permute.cu
+ # src/sparse_ops/sparse_pack_segments_backward.cu
+ # src/sparse_ops/sparse_pack_segments_forward.cu
+ # src/sparse_ops/sparse_permute_1d.cu
+ # src/sparse_ops/sparse_permute_2d.cu
+ # src/sparse_ops/sparse_permute102.cu
+ # src/sparse_ops/sparse_permute_embeddings.cu
+ # src/sparse_ops/sparse_range.cu
+ # src/sparse_ops/sparse_reorder_batched_ad.cu
+ # src/sparse_ops/sparse_segment_sum_csr.cu
+ # src/sparse_ops/sparse_zipf.cu
+ # src/split_embeddings_cache/lfu_cache_find.cu
+ # src/split_embeddings_cache/lfu_cache_populate.cu
+ # src/split_embeddings_cache/lfu_cache_populate_byte.cu
+ # src/split_embeddings_cache/lru_cache_find.cu
+ # src/split_embeddings_cache/lru_cache_populate.cu
+ # src/split_embeddings_cache/lru_cache_populate_byte.cu
+ # src/split_embeddings_cache/lxu_cache.cu
+ # src/split_embeddings_cache/linearize_cache_indices.cu
+ # src/split_embeddings_cache/reset_weight_momentum.cu
+ # src/split_embeddings_utils/generate_vbe_metadata.cu
+ # src/split_embeddings_utils/get_infos_metadata.cu
+ # src/split_embeddings_utils/radix_sort_pairs.cu
+ # src/split_embeddings_utils/transpose_embedding_input.cu)
+ )
set_source_files_properties(${fbgemm_gpu_sources_static_gpu}
PROPERTIES COMPILE_OPTIONS
diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
index 01f1d6ab..a6b8d7a8 100644
--- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
+++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
@@ -25,23 +25,24 @@ set(fbgemm_sources_include_directories
${THIRDPARTY}/json/include
${NCCL_INCLUDE_DIRS})
-set(attention_ops_sources
- src/attention/attention.cpp
- src/attention/gqa_attn_splitk.cu)
+# set(attention_ops_sources
+# src/attention/attention.cpp
+# src/attention/gqa_attn_splitk.cu)
set(quantize_ops_sources
src/quantize/cutlass_extensions.cu
src/quantize/quantize.cu
src/quantize/quantize.cpp)
-set(comm_ops_sources
- src/comm/car.cu
- src/comm/car.cpp)
+# set(comm_ops_sources
+# src/comm/car.cu
+# src/comm/car.cpp)
set(experimental_gen_ai_cpp_source_files
- ${attention_ops_sources}
+ # ${attention_ops_sources}
${quantize_ops_sources}
- ${comm_ops_sources})
+ # ${comm_ops_sources}
+)
set_source_files_properties(${experimental_gen_ai_cpp_source_files}
PROPERTIES INCLUDE_DIRECTORIES

View File

@ -1,11 +0,0 @@
#!/bin/bash
# This script is required to patch torch < 2.4
# It adds the 90a cuda target (H100)
# This target is required to build FBGEMM kernels
#
# How it works: the script locates `select_compute_arch.cmake` inside the
# installed torch package and rewrites three regular expressions in that file,
# in place, so that each one also accepts an optional trailing `a`
# (such as `9.0a`).
#
# NOTE: every edit is addressed by an absolute line number (189, 245, 246).
# If the installed file does not carry the expected pattern on exactly those
# lines, the `s` command matches nothing and sed leaves the file unchanged
# without reporting an error.

# Build the path to the CMake module: print the path of torch's `__init__.py`,
# strip the `/__init__.py` suffix to get the package directory, then append
# the module's location relative to that directory.
torch_cuda_arch=$(python -c "import torch; print(torch.__file__)" | sed 's/\/__init__.py//; s|$|/share/cmake/Caffe2/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake|')
# Line 189: insert `a?` between the `[0-9]\\.[0-9]` pattern and the `(` that
# follows it.
sed -i '189s/\[0-9]\\\\\.\[0-9](/[0-9]\\\\.[0-9]a?(/' $torch_cuda_arch
# Line 245: insert `a?` between the `[0-9()]+` pattern and its closing quote.
sed -i '245s/\[0-9()]+\+"/[0-9()]+a?"/' $torch_cuda_arch
# Line 246: insert `a?` between the `[0-9]+` pattern and its closing quote.
sed -i '246s/\[0-9]+\+"/[0-9]+a?"/' $torch_cuda_arch

View File

@ -31,7 +31,7 @@ einops = "^0.6.1"
texttable = { version = "^1.6.7", optional = true } texttable = { version = "^1.6.7", optional = true }
datasets = { version = "^2.14.0", optional = true } datasets = { version = "^2.14.0", optional = true }
peft = { version = "^0.10", optional = true } peft = { version = "^0.10", optional = true }
torch = { version = "^2.3.0", optional = true } torch = { version = "^2.4.0", optional = true }
scipy = "^1.11.1" scipy = "^1.11.1"
pillow = "^10.0.0" pillow = "^10.0.0"
outlines= { version = "^0.0.34", optional = true } outlines= { version = "^0.0.34", optional = true }