diff --git a/Dockerfile b/Dockerfile index 54ddd5ef..aee4a172 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,7 +41,7 @@ RUN cargo build --profile release-opt FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS pytorch-install # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099 -ARG PYTORCH_VERSION=2.3.0 +ARG PYTORCH_VERSION=2.4.0 ARG PYTHON_VERSION=3.10 # Keep in sync with `server/pyproject.toml @@ -167,8 +167,6 @@ FROM kernel-builder AS fbgemm-builder WORKDIR /usr/src COPY server/Makefile-fbgemm Makefile -COPY server/fbgemm_remove_unused.patch fbgemm_remove_unused.patch -COPY server/fix_torch90a.sh fix_torch90a.sh RUN make build-fbgemm @@ -254,10 +252,7 @@ COPY server/Makefile server/Makefile RUN cd server && \ make gen-server && \ pip install -r requirements_cuda.txt && \ - pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir && \ - pip install nvidia-nccl-cu12==2.22.3 - -ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2 + pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir # Deps before the binaries # The binaries change on every build given we burn the SHA into them diff --git a/server/Makefile b/server/Makefile index 209fc44e..af5ffa59 100644 --- a/server/Makefile +++ b/server/Makefile @@ -30,7 +30,6 @@ install: install-cuda install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm pip install -e ".[bnb]" - pip install nvidia-nccl-cu12==2.22.3 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm diff --git a/server/Makefile-fbgemm b/server/Makefile-fbgemm index 38f8f31f..57526577 100644 --- a/server/Makefile-fbgemm +++ b/server/Makefile-fbgemm @@ -1,10 +1,8 @@ -fbgemm_commit := 9cf0429b726931cfab72b8264730bea682f32fca +fbgemm_commit := ddac8dd9fc0bee70a3f456df68b8aac38576c856 build-fbgemm: - chmod +x fix_torch90a.sh && ./fix_torch90a.sh && \ git clone https://github.com/pytorch/FBGEMM.git fbgemm && \ - cp fbgemm_remove_unused.patch fbgemm && \ - cd fbgemm && git fetch && git checkout $(fbgemm_commit) && git apply fbgemm_remove_unused.patch && \ + cd fbgemm && git fetch && git checkout $(fbgemm_commit) && \ git submodule update --init --recursive && \ cd fbgemm_gpu && \ pip install -r requirements.txt && \ diff --git a/server/fbgemm_remove_unused.patch b/server/fbgemm_remove_unused.patch deleted file mode 100644 index ad6af811..00000000 --- a/server/fbgemm_remove_unused.patch +++ /dev/null @@ -1,306 +0,0 @@ -diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt -index 2244ea6f..96265a48 100644 ---- a/fbgemm_gpu/CMakeLists.txt -+++ b/fbgemm_gpu/CMakeLists.txt -@@ -94,14 +94,14 @@ endif() - # Build Experimental Modules - ################################################################################ - --if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) -- # TODO: Figure out NCCL/RCCL integration with ROCm -- add_subdirectory(experimental/example) --endif() -- --if(NOT FBGEMM_CPU_ONLY) -- add_subdirectory(experimental/gemm) --endif() -+# if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) -+# # TODO: Figure out NCCL/RCCL integration with ROCm -+# add_subdirectory(experimental/example) -+# endif() -+ -+# if(NOT FBGEMM_CPU_ONLY) -+# add_subdirectory(experimental/gemm) -+# endif() - - if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM) - # CUTLASS currently doesn't build on ROCm and CK hasnt yet been added: -diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake -index c56773fe..0c0d349e 100644 ---- a/fbgemm_gpu/FbgemmGpu.cmake -+++ b/fbgemm_gpu/FbgemmGpu.cmake -@@ -446,53 +446,55 @@ set_source_files_properties(${fbgemm_sources} - ################################################################################ - - set(fbgemm_gpu_sources_static_cpu -- codegen/training/forward/embedding_forward_split_cpu.cpp -- codegen/inference/embedding_forward_quantized_host_cpu.cpp -- codegen/training/backward/embedding_backward_dense_host_cpu.cpp -- codegen/utils/embedding_bounds_check_host_cpu.cpp -- src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp -- src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp -- src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp -- src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp -- src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp -- src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp -- src/input_combine_ops/input_combine_cpu.cpp -- src/layout_transform_ops/layout_transform_ops_cpu.cpp -+ # codegen/training/forward/embedding_forward_split_cpu.cpp -+ # codegen/inference/embedding_forward_quantized_host_cpu.cpp -+ # codegen/training/backward/embedding_backward_dense_host_cpu.cpp -+ # codegen/utils/embedding_bounds_check_host_cpu.cpp -+ # src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp -+ # src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp -+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp -+ # src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp -+ # src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp -+ # src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp -+ # src/input_combine_ops/input_combine_cpu.cpp -+ # src/layout_transform_ops/layout_transform_ops_cpu.cpp - src/quantize_ops/quantize_ops_cpu.cpp - src/quantize_ops/quantize_ops_meta.cpp -- src/sparse_ops/sparse_ops_cpu.cpp -- src/sparse_ops/sparse_ops_meta.cpp -- src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp -- src/split_embeddings_cache/linearize_cache_indices.cpp -- src/split_embeddings_cache/lfu_cache_populate_byte.cpp -- src/split_embeddings_cache/lru_cache_populate_byte.cpp -- src/split_embeddings_cache/lxu_cache.cpp -- src/split_embeddings_cache/split_embeddings_cache_ops.cpp -- codegen/training/index_select/batch_index_select_dim0_ops.cpp -- codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) -+ # src/sparse_ops/sparse_ops_cpu.cpp -+ # src/sparse_ops/sparse_ops_meta.cpp -+ # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp -+ # src/split_embeddings_cache/linearize_cache_indices.cpp -+ # src/split_embeddings_cache/lfu_cache_populate_byte.cpp -+ # src/split_embeddings_cache/lru_cache_populate_byte.cpp -+ # src/split_embeddings_cache/lxu_cache.cpp -+ # src/split_embeddings_cache/split_embeddings_cache_ops.cpp -+ # codegen/training/index_select/batch_index_select_dim0_ops.cpp -+ # codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp) -+) - - if(NOT FBGEMM_CPU_ONLY) - list(APPEND fbgemm_gpu_sources_static_cpu -- codegen/inference/embedding_forward_quantized_host.cpp -- codegen/utils/embedding_bounds_check_host.cpp -- src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp -- src/layout_transform_ops/layout_transform_ops_gpu.cpp -- src/memory_utils/memory_utils.cpp -- src/memory_utils/memory_utils_ops.cpp -- src/memory_utils/memory_utils_ops_cpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp -+ # codegen/inference/embedding_forward_quantized_host.cpp -+ # codegen/utils/embedding_bounds_check_host.cpp -+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp -+ # src/layout_transform_ops/layout_transform_ops_gpu.cpp -+ # src/memory_utils/memory_utils.cpp -+ # src/memory_utils/memory_utils_ops.cpp -+ # src/memory_utils/memory_utils_ops_cpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp - src/quantize_ops/quantize_ops_gpu.cpp -- src/sparse_ops/sparse_ops_gpu.cpp -- src/split_embeddings_utils/split_embeddings_utils.cpp -- src/split_embeddings_cache/split_embeddings_cache_ops.cu -- src/metric_ops/metric_ops_host.cpp -- src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp -- src/input_combine_ops/input_combine_gpu.cpp -- codegen/training/index_select/batch_index_select_dim0_host.cpp) -+ # src/sparse_ops/sparse_ops_gpu.cpp -+ # src/split_embeddings_utils/split_embeddings_utils.cpp -+ # src/split_embeddings_cache/split_embeddings_cache_ops.cu -+ # src/metric_ops/metric_ops_host.cpp -+ # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp -+ # src/input_combine_ops/input_combine_gpu.cpp -+ # codegen/training/index_select/batch_index_select_dim0_host.cpp) -+ ) - - if(NVML_LIB_PATH OR USE_ROCM) - message(STATUS "Adding merge_pooled_embeddings sources") -@@ -516,36 +518,36 @@ endif() - - if(NOT FBGEMM_CPU_ONLY) - set(fbgemm_gpu_sources_static_gpu -- codegen/utils/embedding_bounds_check.cu -- codegen/inference/embedding_forward_quantized_split_lookup.cu -- src/embedding_inplace_ops/embedding_inplace_update.cu -- src/histogram_binning_calibration_ops.cu -- src/input_combine_ops/input_combine.cu -- src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu -- src/memory_utils/memory_utils.cu -- src/memory_utils/memory_utils_ops.cu -- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu -- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu -- src/jagged_tensor_ops/dense_to_jagged_forward.cu -- src/jagged_tensor_ops/jagged_dense_bmm_forward.cu -- src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu -- src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu -- src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu -- src/jagged_tensor_ops/jagged_index_add_2d_forward.cu -- src/jagged_tensor_ops/jagged_index_select_2d_forward.cu -- src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu -- src/jagged_tensor_ops/jagged_softmax_backward.cu -- src/jagged_tensor_ops/jagged_softmax_forward.cu -- src/jagged_tensor_ops/jagged_tensor_ops.cu -- src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu -- src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu -- src/jagged_tensor_ops/jagged_unique_indices.cu -- src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu -- src/layout_transform_ops/layout_transform_ops.cu -- src/metric_ops/metric_ops.cu -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu -- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu -- src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu -+ # codegen/utils/embedding_bounds_check.cu -+ # codegen/inference/embedding_forward_quantized_split_lookup.cu -+ # src/embedding_inplace_ops/embedding_inplace_update.cu -+ # src/histogram_binning_calibration_ops.cu -+ # src/input_combine_ops/input_combine.cu -+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu -+ # src/memory_utils/memory_utils.cu -+ # src/memory_utils/memory_utils_ops.cu -+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu -+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu -+ # src/jagged_tensor_ops/dense_to_jagged_forward.cu -+ # src/jagged_tensor_ops/jagged_dense_bmm_forward.cu -+ # src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu -+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu -+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu -+ # src/jagged_tensor_ops/jagged_index_add_2d_forward.cu -+ # src/jagged_tensor_ops/jagged_index_select_2d_forward.cu -+ # src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu -+ # src/jagged_tensor_ops/jagged_softmax_backward.cu -+ # src/jagged_tensor_ops/jagged_softmax_forward.cu -+ # src/jagged_tensor_ops/jagged_tensor_ops.cu -+ # src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu -+ # src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu -+ # src/jagged_tensor_ops/jagged_unique_indices.cu -+ # src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu -+ # src/layout_transform_ops/layout_transform_ops.cu -+ # src/metric_ops/metric_ops.cu -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu -+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu -+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu - src/quantize_ops/quantize_bfloat16.cu - src/quantize_ops/quantize_fp8_rowwise.cu - src/quantize_ops/quantize_fused_8bit_rowwise.cu -@@ -554,39 +556,40 @@ if(NOT FBGEMM_CPU_ONLY) - src/quantize_ops/quantize_msfp.cu - src/quantize_ops/quantize_padded_fp8_rowwise.cu - src/quantize_ops/quantize_mx.cu -- src/sparse_ops/sparse_async_cumsum.cu -- src/sparse_ops/sparse_block_bucketize_features.cu -- src/sparse_ops/sparse_bucketize_features.cu -- src/sparse_ops/sparse_batched_unary_embeddings.cu -- src/sparse_ops/sparse_compute_frequency_sequence.cu -- src/sparse_ops/sparse_expand_into_jagged_permute.cu -- src/sparse_ops/sparse_group_index.cu -- src/sparse_ops/sparse_index_add.cu -- src/sparse_ops/sparse_index_select.cu -- src/sparse_ops/sparse_invert_permute.cu -- src/sparse_ops/sparse_pack_segments_backward.cu -- src/sparse_ops/sparse_pack_segments_forward.cu -- src/sparse_ops/sparse_permute_1d.cu -- src/sparse_ops/sparse_permute_2d.cu -- src/sparse_ops/sparse_permute102.cu -- src/sparse_ops/sparse_permute_embeddings.cu -- src/sparse_ops/sparse_range.cu -- src/sparse_ops/sparse_reorder_batched_ad.cu -- src/sparse_ops/sparse_segment_sum_csr.cu -- src/sparse_ops/sparse_zipf.cu -- src/split_embeddings_cache/lfu_cache_find.cu -- src/split_embeddings_cache/lfu_cache_populate.cu -- src/split_embeddings_cache/lfu_cache_populate_byte.cu -- src/split_embeddings_cache/lru_cache_find.cu -- src/split_embeddings_cache/lru_cache_populate.cu -- src/split_embeddings_cache/lru_cache_populate_byte.cu -- src/split_embeddings_cache/lxu_cache.cu -- src/split_embeddings_cache/linearize_cache_indices.cu -- src/split_embeddings_cache/reset_weight_momentum.cu -- src/split_embeddings_utils/generate_vbe_metadata.cu -- src/split_embeddings_utils/get_infos_metadata.cu -- src/split_embeddings_utils/radix_sort_pairs.cu -- src/split_embeddings_utils/transpose_embedding_input.cu) -+ # src/sparse_ops/sparse_async_cumsum.cu -+ # src/sparse_ops/sparse_block_bucketize_features.cu -+ # src/sparse_ops/sparse_bucketize_features.cu -+ # src/sparse_ops/sparse_batched_unary_embeddings.cu -+ # src/sparse_ops/sparse_compute_frequency_sequence.cu -+ # src/sparse_ops/sparse_expand_into_jagged_permute.cu -+ # src/sparse_ops/sparse_group_index.cu -+ # src/sparse_ops/sparse_index_add.cu -+ # src/sparse_ops/sparse_index_select.cu -+ # src/sparse_ops/sparse_invert_permute.cu -+ # src/sparse_ops/sparse_pack_segments_backward.cu -+ # src/sparse_ops/sparse_pack_segments_forward.cu -+ # src/sparse_ops/sparse_permute_1d.cu -+ # src/sparse_ops/sparse_permute_2d.cu -+ # src/sparse_ops/sparse_permute102.cu -+ # src/sparse_ops/sparse_permute_embeddings.cu -+ # src/sparse_ops/sparse_range.cu -+ # src/sparse_ops/sparse_reorder_batched_ad.cu -+ # src/sparse_ops/sparse_segment_sum_csr.cu -+ # src/sparse_ops/sparse_zipf.cu -+ # src/split_embeddings_cache/lfu_cache_find.cu -+ # src/split_embeddings_cache/lfu_cache_populate.cu -+ # src/split_embeddings_cache/lfu_cache_populate_byte.cu -+ # src/split_embeddings_cache/lru_cache_find.cu -+ # src/split_embeddings_cache/lru_cache_populate.cu -+ # src/split_embeddings_cache/lru_cache_populate_byte.cu -+ # src/split_embeddings_cache/lxu_cache.cu -+ # src/split_embeddings_cache/linearize_cache_indices.cu -+ # src/split_embeddings_cache/reset_weight_momentum.cu -+ # src/split_embeddings_utils/generate_vbe_metadata.cu -+ # src/split_embeddings_utils/get_infos_metadata.cu -+ # src/split_embeddings_utils/radix_sort_pairs.cu -+ # src/split_embeddings_utils/transpose_embedding_input.cu) -+ ) - - set_source_files_properties(${fbgemm_gpu_sources_static_gpu} - PROPERTIES COMPILE_OPTIONS -diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt -index 01f1d6ab..a6b8d7a8 100644 ---- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt -+++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt -@@ -25,23 +25,24 @@ set(fbgemm_sources_include_directories - ${THIRDPARTY}/json/include - ${NCCL_INCLUDE_DIRS}) - --set(attention_ops_sources -- src/attention/attention.cpp -- src/attention/gqa_attn_splitk.cu) -+# set(attention_ops_sources -+# src/attention/attention.cpp -+# src/attention/gqa_attn_splitk.cu) - - set(quantize_ops_sources - src/quantize/cutlass_extensions.cu - src/quantize/quantize.cu - src/quantize/quantize.cpp) - --set(comm_ops_sources -- src/comm/car.cu -- src/comm/car.cpp) -+# set(comm_ops_sources -+# src/comm/car.cu -+# src/comm/car.cpp) - - set(experimental_gen_ai_cpp_source_files -- ${attention_ops_sources} -+ # ${attention_ops_sources} - ${quantize_ops_sources} -- ${comm_ops_sources}) -+ # ${comm_ops_sources} -+) - - set_source_files_properties(${experimental_gen_ai_cpp_source_files} - PROPERTIES INCLUDE_DIRECTORIES diff --git a/server/fix_torch90a.sh b/server/fix_torch90a.sh deleted file mode 100755 index 5e444828..00000000 --- a/server/fix_torch90a.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# This script is required to patch torch < 2.4 -# It adds the 90a cuda target (H100) -# This target is required to build FBGEMM kernels - -torch_cuda_arch=$(python -c "import torch; print(torch.__file__)" | sed 's/\/__init__.py//; s|$|/share/cmake/Caffe2/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake|') - -sed -i '189s/\[0-9]\\\\\.\[0-9](/[0-9]\\\\.[0-9]a?(/' $torch_cuda_arch -sed -i '245s/\[0-9()]+\+"/[0-9()]+a?"/' $torch_cuda_arch -sed -i '246s/\[0-9]+\+"/[0-9]+a?"/' $torch_cuda_arch diff --git a/server/pyproject.toml b/server/pyproject.toml index c201a002..37bf2afd 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -31,7 +31,7 @@ einops = "^0.6.1" texttable = { version = "^1.6.7", optional = true } datasets = { version = "^2.14.0", optional = true } peft = { version = "^0.10", optional = true } -torch = { version = "^2.3.0", optional = true } +torch = { version = "^2.4.0", optional = true } scipy = "^1.11.1" pillow = "^10.0.0" outlines= { version = "^0.0.34", optional = true }