307 lines
15 KiB
Diff
307 lines
15 KiB
Diff
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
|
|
index 2244ea6f..96265a48 100644
|
|
--- a/fbgemm_gpu/CMakeLists.txt
|
|
+++ b/fbgemm_gpu/CMakeLists.txt
|
|
@@ -94,14 +94,14 @@ endif()
|
|
# Build Experimental Modules
|
|
################################################################################
|
|
|
|
-if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
|
|
- # TODO: Figure out NCCL/RCCL integration with ROCm
|
|
- add_subdirectory(experimental/example)
|
|
-endif()
|
|
-
|
|
-if(NOT FBGEMM_CPU_ONLY)
|
|
- add_subdirectory(experimental/gemm)
|
|
-endif()
|
|
+# if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
|
|
+# # TODO: Figure out NCCL/RCCL integration with ROCm
|
|
+# add_subdirectory(experimental/example)
|
|
+# endif()
|
|
+
|
|
+# if(NOT FBGEMM_CPU_ONLY)
|
|
+# add_subdirectory(experimental/gemm)
|
|
+# endif()
|
|
|
|
if(NOT FBGEMM_CPU_ONLY AND NOT USE_ROCM)
|
|
# CUTLASS currently doesn't build on ROCm and CK hasnt yet been added:
|
|
diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake
|
|
index c56773fe..0c0d349e 100644
|
|
--- a/fbgemm_gpu/FbgemmGpu.cmake
|
|
+++ b/fbgemm_gpu/FbgemmGpu.cmake
|
|
@@ -446,53 +446,55 @@ set_source_files_properties(${fbgemm_sources}
|
|
################################################################################
|
|
|
|
set(fbgemm_gpu_sources_static_cpu
|
|
- codegen/training/forward/embedding_forward_split_cpu.cpp
|
|
- codegen/inference/embedding_forward_quantized_host_cpu.cpp
|
|
- codegen/training/backward/embedding_backward_dense_host_cpu.cpp
|
|
- codegen/utils/embedding_bounds_check_host_cpu.cpp
|
|
- src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
|
|
- src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
|
|
- src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
|
|
- src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
|
|
- src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
|
|
- src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
|
|
- src/input_combine_ops/input_combine_cpu.cpp
|
|
- src/layout_transform_ops/layout_transform_ops_cpu.cpp
|
|
+ # codegen/training/forward/embedding_forward_split_cpu.cpp
|
|
+ # codegen/inference/embedding_forward_quantized_host_cpu.cpp
|
|
+ # codegen/training/backward/embedding_backward_dense_host_cpu.cpp
|
|
+ # codegen/utils/embedding_bounds_check_host_cpu.cpp
|
|
+ # src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp
|
|
+ # src/permute_multi_embedding_ops/permute_multi_embedding_function.cpp
|
|
+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops_cpu.cpp
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp
|
|
+ # src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp
|
|
+ # src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp
|
|
+ # src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
|
|
+ # src/input_combine_ops/input_combine_cpu.cpp
|
|
+ # src/layout_transform_ops/layout_transform_ops_cpu.cpp
|
|
src/quantize_ops/quantize_ops_cpu.cpp
|
|
src/quantize_ops/quantize_ops_meta.cpp
|
|
- src/sparse_ops/sparse_ops_cpu.cpp
|
|
- src/sparse_ops/sparse_ops_meta.cpp
|
|
- src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
|
|
- src/split_embeddings_cache/linearize_cache_indices.cpp
|
|
- src/split_embeddings_cache/lfu_cache_populate_byte.cpp
|
|
- src/split_embeddings_cache/lru_cache_populate_byte.cpp
|
|
- src/split_embeddings_cache/lxu_cache.cpp
|
|
- src/split_embeddings_cache/split_embeddings_cache_ops.cpp
|
|
- codegen/training/index_select/batch_index_select_dim0_ops.cpp
|
|
- codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp)
|
|
+ # src/sparse_ops/sparse_ops_cpu.cpp
|
|
+ # src/sparse_ops/sparse_ops_meta.cpp
|
|
+ # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp
|
|
+ # src/split_embeddings_cache/linearize_cache_indices.cpp
|
|
+ # src/split_embeddings_cache/lfu_cache_populate_byte.cpp
|
|
+ # src/split_embeddings_cache/lru_cache_populate_byte.cpp
|
|
+ # src/split_embeddings_cache/lxu_cache.cpp
|
|
+ # src/split_embeddings_cache/split_embeddings_cache_ops.cpp
|
|
+ # codegen/training/index_select/batch_index_select_dim0_ops.cpp
|
|
+ # codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp)
|
|
+)
|
|
|
|
if(NOT FBGEMM_CPU_ONLY)
|
|
list(APPEND fbgemm_gpu_sources_static_cpu
|
|
- codegen/inference/embedding_forward_quantized_host.cpp
|
|
- codegen/utils/embedding_bounds_check_host.cpp
|
|
- src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
|
|
- src/layout_transform_ops/layout_transform_ops_gpu.cpp
|
|
- src/memory_utils/memory_utils.cpp
|
|
- src/memory_utils/memory_utils_ops.cpp
|
|
- src/memory_utils/memory_utils_ops_cpu.cpp
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
|
|
+ # codegen/inference/embedding_forward_quantized_host.cpp
|
|
+ # codegen/utils/embedding_bounds_check_host.cpp
|
|
+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning_gpu.cpp
|
|
+ # src/layout_transform_ops/layout_transform_ops_gpu.cpp
|
|
+ # src/memory_utils/memory_utils.cpp
|
|
+ # src/memory_utils/memory_utils_ops.cpp
|
|
+ # src/memory_utils/memory_utils_ops_cpu.cpp
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp
|
|
src/quantize_ops/quantize_ops_gpu.cpp
|
|
- src/sparse_ops/sparse_ops_gpu.cpp
|
|
- src/split_embeddings_utils/split_embeddings_utils.cpp
|
|
- src/split_embeddings_cache/split_embeddings_cache_ops.cu
|
|
- src/metric_ops/metric_ops_host.cpp
|
|
- src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
|
|
- src/input_combine_ops/input_combine_gpu.cpp
|
|
- codegen/training/index_select/batch_index_select_dim0_host.cpp)
|
|
+ # src/sparse_ops/sparse_ops_gpu.cpp
|
|
+ # src/split_embeddings_utils/split_embeddings_utils.cpp
|
|
+ # src/split_embeddings_cache/split_embeddings_cache_ops.cu
|
|
+ # src/metric_ops/metric_ops_host.cpp
|
|
+ # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp
|
|
+ # src/input_combine_ops/input_combine_gpu.cpp
|
|
+ # codegen/training/index_select/batch_index_select_dim0_host.cpp)
|
|
+ )
|
|
|
|
if(NVML_LIB_PATH OR USE_ROCM)
|
|
message(STATUS "Adding merge_pooled_embeddings sources")
|
|
@@ -516,36 +518,36 @@ endif()
|
|
|
|
if(NOT FBGEMM_CPU_ONLY)
|
|
set(fbgemm_gpu_sources_static_gpu
|
|
- codegen/utils/embedding_bounds_check.cu
|
|
- codegen/inference/embedding_forward_quantized_split_lookup.cu
|
|
- src/embedding_inplace_ops/embedding_inplace_update.cu
|
|
- src/histogram_binning_calibration_ops.cu
|
|
- src/input_combine_ops/input_combine.cu
|
|
- src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
|
|
- src/memory_utils/memory_utils.cu
|
|
- src/memory_utils/memory_utils_ops.cu
|
|
- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
|
|
- src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
|
|
- src/jagged_tensor_ops/dense_to_jagged_forward.cu
|
|
- src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
|
|
- src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu
|
|
- src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu
|
|
- src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu
|
|
- src/jagged_tensor_ops/jagged_index_add_2d_forward.cu
|
|
- src/jagged_tensor_ops/jagged_index_select_2d_forward.cu
|
|
- src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
|
|
- src/jagged_tensor_ops/jagged_softmax_backward.cu
|
|
- src/jagged_tensor_ops/jagged_softmax_forward.cu
|
|
- src/jagged_tensor_ops/jagged_tensor_ops.cu
|
|
- src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu
|
|
- src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
|
|
- src/jagged_tensor_ops/jagged_unique_indices.cu
|
|
- src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
|
|
- src/layout_transform_ops/layout_transform_ops.cu
|
|
- src/metric_ops/metric_ops.cu
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
|
|
- src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
|
|
- src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu
|
|
+ # codegen/utils/embedding_bounds_check.cu
|
|
+ # codegen/inference/embedding_forward_quantized_split_lookup.cu
|
|
+ # src/embedding_inplace_ops/embedding_inplace_update.cu
|
|
+ # src/histogram_binning_calibration_ops.cu
|
|
+ # src/input_combine_ops/input_combine.cu
|
|
+ # src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
|
|
+ # src/memory_utils/memory_utils.cu
|
|
+ # src/memory_utils/memory_utils_ops.cu
|
|
+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
|
|
+ # src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
|
|
+ # src/jagged_tensor_ops/dense_to_jagged_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu
|
|
+ # src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_index_add_2d_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_index_select_2d_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_softmax_backward.cu
|
|
+ # src/jagged_tensor_ops/jagged_softmax_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_tensor_ops.cu
|
|
+ # src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu
|
|
+ # src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu
|
|
+ # src/jagged_tensor_ops/jagged_unique_indices.cu
|
|
+ # src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
|
|
+ # src/layout_transform_ops/layout_transform_ops.cu
|
|
+ # src/metric_ops/metric_ops.cu
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu
|
|
+ # src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu
|
|
+ # src/permute_multi_embedding_ops/permute_multi_embedding_ops.cu
|
|
src/quantize_ops/quantize_bfloat16.cu
|
|
src/quantize_ops/quantize_fp8_rowwise.cu
|
|
src/quantize_ops/quantize_fused_8bit_rowwise.cu
|
|
@@ -554,39 +556,40 @@ if(NOT FBGEMM_CPU_ONLY)
|
|
src/quantize_ops/quantize_msfp.cu
|
|
src/quantize_ops/quantize_padded_fp8_rowwise.cu
|
|
src/quantize_ops/quantize_mx.cu
|
|
- src/sparse_ops/sparse_async_cumsum.cu
|
|
- src/sparse_ops/sparse_block_bucketize_features.cu
|
|
- src/sparse_ops/sparse_bucketize_features.cu
|
|
- src/sparse_ops/sparse_batched_unary_embeddings.cu
|
|
- src/sparse_ops/sparse_compute_frequency_sequence.cu
|
|
- src/sparse_ops/sparse_expand_into_jagged_permute.cu
|
|
- src/sparse_ops/sparse_group_index.cu
|
|
- src/sparse_ops/sparse_index_add.cu
|
|
- src/sparse_ops/sparse_index_select.cu
|
|
- src/sparse_ops/sparse_invert_permute.cu
|
|
- src/sparse_ops/sparse_pack_segments_backward.cu
|
|
- src/sparse_ops/sparse_pack_segments_forward.cu
|
|
- src/sparse_ops/sparse_permute_1d.cu
|
|
- src/sparse_ops/sparse_permute_2d.cu
|
|
- src/sparse_ops/sparse_permute102.cu
|
|
- src/sparse_ops/sparse_permute_embeddings.cu
|
|
- src/sparse_ops/sparse_range.cu
|
|
- src/sparse_ops/sparse_reorder_batched_ad.cu
|
|
- src/sparse_ops/sparse_segment_sum_csr.cu
|
|
- src/sparse_ops/sparse_zipf.cu
|
|
- src/split_embeddings_cache/lfu_cache_find.cu
|
|
- src/split_embeddings_cache/lfu_cache_populate.cu
|
|
- src/split_embeddings_cache/lfu_cache_populate_byte.cu
|
|
- src/split_embeddings_cache/lru_cache_find.cu
|
|
- src/split_embeddings_cache/lru_cache_populate.cu
|
|
- src/split_embeddings_cache/lru_cache_populate_byte.cu
|
|
- src/split_embeddings_cache/lxu_cache.cu
|
|
- src/split_embeddings_cache/linearize_cache_indices.cu
|
|
- src/split_embeddings_cache/reset_weight_momentum.cu
|
|
- src/split_embeddings_utils/generate_vbe_metadata.cu
|
|
- src/split_embeddings_utils/get_infos_metadata.cu
|
|
- src/split_embeddings_utils/radix_sort_pairs.cu
|
|
- src/split_embeddings_utils/transpose_embedding_input.cu)
|
|
+ # src/sparse_ops/sparse_async_cumsum.cu
|
|
+ # src/sparse_ops/sparse_block_bucketize_features.cu
|
|
+ # src/sparse_ops/sparse_bucketize_features.cu
|
|
+ # src/sparse_ops/sparse_batched_unary_embeddings.cu
|
|
+ # src/sparse_ops/sparse_compute_frequency_sequence.cu
|
|
+ # src/sparse_ops/sparse_expand_into_jagged_permute.cu
|
|
+ # src/sparse_ops/sparse_group_index.cu
|
|
+ # src/sparse_ops/sparse_index_add.cu
|
|
+ # src/sparse_ops/sparse_index_select.cu
|
|
+ # src/sparse_ops/sparse_invert_permute.cu
|
|
+ # src/sparse_ops/sparse_pack_segments_backward.cu
|
|
+ # src/sparse_ops/sparse_pack_segments_forward.cu
|
|
+ # src/sparse_ops/sparse_permute_1d.cu
|
|
+ # src/sparse_ops/sparse_permute_2d.cu
|
|
+ # src/sparse_ops/sparse_permute102.cu
|
|
+ # src/sparse_ops/sparse_permute_embeddings.cu
|
|
+ # src/sparse_ops/sparse_range.cu
|
|
+ # src/sparse_ops/sparse_reorder_batched_ad.cu
|
|
+ # src/sparse_ops/sparse_segment_sum_csr.cu
|
|
+ # src/sparse_ops/sparse_zipf.cu
|
|
+ # src/split_embeddings_cache/lfu_cache_find.cu
|
|
+ # src/split_embeddings_cache/lfu_cache_populate.cu
|
|
+ # src/split_embeddings_cache/lfu_cache_populate_byte.cu
|
|
+ # src/split_embeddings_cache/lru_cache_find.cu
|
|
+ # src/split_embeddings_cache/lru_cache_populate.cu
|
|
+ # src/split_embeddings_cache/lru_cache_populate_byte.cu
|
|
+ # src/split_embeddings_cache/lxu_cache.cu
|
|
+ # src/split_embeddings_cache/linearize_cache_indices.cu
|
|
+ # src/split_embeddings_cache/reset_weight_momentum.cu
|
|
+ # src/split_embeddings_utils/generate_vbe_metadata.cu
|
|
+ # src/split_embeddings_utils/get_infos_metadata.cu
|
|
+ # src/split_embeddings_utils/radix_sort_pairs.cu
|
|
+ # src/split_embeddings_utils/transpose_embedding_input.cu)
|
|
+ )
|
|
|
|
set_source_files_properties(${fbgemm_gpu_sources_static_gpu}
|
|
PROPERTIES COMPILE_OPTIONS
|
|
diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
|
|
index 01f1d6ab..a6b8d7a8 100644
|
|
--- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
|
|
+++ b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
|
|
@@ -25,23 +25,24 @@ set(fbgemm_sources_include_directories
|
|
${THIRDPARTY}/json/include
|
|
${NCCL_INCLUDE_DIRS})
|
|
|
|
-set(attention_ops_sources
|
|
- src/attention/attention.cpp
|
|
- src/attention/gqa_attn_splitk.cu)
|
|
+# set(attention_ops_sources
|
|
+# src/attention/attention.cpp
|
|
+# src/attention/gqa_attn_splitk.cu)
|
|
|
|
set(quantize_ops_sources
|
|
src/quantize/cutlass_extensions.cu
|
|
src/quantize/quantize.cu
|
|
src/quantize/quantize.cpp)
|
|
|
|
-set(comm_ops_sources
|
|
- src/comm/car.cu
|
|
- src/comm/car.cpp)
|
|
+# set(comm_ops_sources
|
|
+# src/comm/car.cu
|
|
+# src/comm/car.cpp)
|
|
|
|
set(experimental_gen_ai_cpp_source_files
|
|
- ${attention_ops_sources}
|
|
+ # ${attention_ops_sources}
|
|
${quantize_ops_sources}
|
|
- ${comm_ops_sources})
|
|
+ # ${comm_ops_sources}
|
|
+)
|
|
|
|
set_source_files_properties(${experimental_gen_ai_cpp_source_files}
|
|
PROPERTIES INCLUDE_DIRECTORIES
|