diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp index 86cf11f647..4b946011c2 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp @@ -15,6 +15,9 @@ #include #include #include +#include +#include "ck_tile/ops/epilogue.hpp" +#include namespace ck_tile::reflect::conv { diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index a0def3e5d9..6913889c4f 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -4,6 +4,7 @@ #pragma once #include "instance_traits.hpp" +#include "instance_traits_util.hpp" #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" // Forward declaration to avoid circular dependency diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp new file mode 100644 index 0000000000..f364b37ae5 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp @@ -0,0 +1,140 @@ +// Copyright (C) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +// InstanceTraits specialization for GroupedConvolutionForwardKernel +// +// CRITICAL MAINTENANCE NOTE: +// This InstanceTraits file MUST be kept strictly in sync with the device implementation header: +// ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +// "In sync" means that the template parameter order, names, and types in the declaration below +// MUST EXACTLY MATCH those in the device implementation. If these diverge, you may encounter +// compilation errors, subtle template instantiation mismatches, or silent runtime bugs that are +// difficult to diagnose. Always update both files together and review changes carefully. + +#pragma once + +#include "instance_traits.hpp" +#include "instance_traits_util.hpp" + +// Forward declaration to avoid circular dependency. +namespace ck_tile::device { + +template +struct GroupedConvolutionForwardKernel; + +} // namespace ck_tile::device + +namespace ck_tile { +namespace reflect { + +// Specialization for GroupedConvolutionForwardKernel +template +struct InstanceTraits> +{ + // CK Tile Conv Traits + // Spatial dimension + static constexpr int kSpatialDim = GroupedConvTraitsType_::NDimSpatial; + // Specialization + static constexpr ck_tile::ConvolutionSpecialization ConvSpecialization = + GroupedConvTraitsType_::ConvSpecialization; + // DataType types + using InLayout = typename GroupedConvTraitsType_::InLayout; + using WeiLayout = typename GroupedConvTraitsType_::WeiLayout; + using DsLayout = typename GroupedConvTraitsType_::DsLayout; + using OutLayout = typename GroupedConvTraitsType_::OutLayout; + // Vector size + static constexpr int kVectorSizeA = GroupedConvTraitsType_::VectorSizeA; + static constexpr int kVectorSizeB = GroupedConvTraitsType_::VectorSizeB; + static constexpr int kVectorSizeC = GroupedConvTraitsType_::VectorSizeC; + // Num Groups To Merge + static constexpr int kNumGroupsToMerge = GroupedConvTraitsType_::NumGroupsToMerge; + // Split image (large tensors) + static constexpr bool kEnableSplitImage = GroupedConvTraitsType_::EnableSplitImage; + + // TilePartitioner + // Block configuration + static constexpr int kMPerBlock = TilePartitioner_::MPerBlock; + static constexpr int kNPerBlock = TilePartitioner_::NPerBlock; + static constexpr int kKPerBlock = TilePartitioner_::KPerBlock; + + static constexpr int kMWarp = TilePartitioner_::BlockGemmShape::BlockWarps::at(number<0>{}); + static constexpr int kNWarp = TilePartitioner_::BlockGemmShape::BlockWarps::at(number<1>{}); + static constexpr int kKWarp = TilePartitioner_::BlockGemmShape::BlockWarps::at(number<2>{}); + + static constexpr int kMWarpTile = TilePartitioner_::BlockGemmShape::WarpTile::at(number<0>{}); + static constexpr int kNWarpTile = TilePartitioner_::BlockGemmShape::WarpTile::at(number<1>{}); + static constexpr int kKWarpTile = TilePartitioner_::BlockGemmShape::WarpTile::at(number<2>{}); + + // Data types + using ADataType = typename GemmPipeline_::ADataType; + using BDataType = typename GemmPipeline_::BDataType; + // Gemm Pipeline + using GemmPipeline = GemmPipeline_; + static constexpr ck_tile::GemmPipelineScheduler kPipelineScheduler = GemmPipeline_::Scheduler; + static constexpr bool kDoubleSmemBuffer = GemmPipeline_::DoubleSmemBuffer; + static constexpr int kNumWaveGroups = GemmPipeline_::NumWaveGroups; + + // Epilogue Pipeline + using AccDataType = typename EpiloguePipeline_::AccDataType; + using EDataType = typename EpiloguePipeline_::ODataType; + using DsDataType = typename EpiloguePipeline_::DsDataType; + using CDEElementwiseOperation = typename EpiloguePipeline_::CDElementwise; + + // Static member function to generate instance string + static std::string instance_string() + { + std::ostringstream oss; + + // Kernel type name + oss << "GroupedConvolutionForwardKernel"; + + // Template parameters in exact order matching InstanceTraits member order + oss << "<" << kSpatialDim; // 1. NDimSpatial + oss << "," + << ck_tile::getConvSpecializationString(ConvSpecialization); // 2. ConvSpecialization + oss << "," << detail::layout_name(); // 3. InLayout + oss << "," << detail::layout_name(); // 4. WeiLayout + oss << "," << detail::tuple_name(); // 5. DsLayout + oss << "," << detail::layout_name(); // 6. OutLayout + oss << "," << kVectorSizeA; // 7. VectorSizeA + oss << "," << kVectorSizeB; // 8. VectorSizeB + oss << "," << kVectorSizeC; // 9. VectorSizeC + oss << "," << kNumGroupsToMerge; // 10. NumGroupsToMerge + oss << "," << kEnableSplitImage; // 11. EnableSplitImage + oss << "," << kMPerBlock; // 12. MPerBlock + oss << "," << kNPerBlock; // 13. NPerBlock + oss << "," << kKPerBlock; // 14. KPerBlock + oss << "," << kMWarp; // 15. MWarp + oss << "," << kNWarp; // 16. NWarp + oss << "," << kKWarp; // 17. KWarp + oss << "," << kMWarpTile; // 18. MWarpTile + oss << "," << kNWarpTile; // 19. NWarpTile + oss << "," << kKWarpTile; // 20. KWarpTile + oss << "," << detail::type_name(); // 21. ADataType + oss << "," << detail::type_name(); // 22. BDataType + oss << "," << GemmPipeline::GetPipelineName(); // 23. BlkGemmPipelineVer + oss << "," << detail::pipeline_scheduler_name(kPipelineScheduler); // 24. BlkGemmPipeSched + oss << "," << kDoubleSmemBuffer; // 25. NumWaveGroups + oss << "," << kNumWaveGroups; // 26. NumWaveGroups + oss << "," << detail::type_name(); // 27. AccDataType + oss << "," << detail::type_name(); // 28. EDataType + oss << "," << detail::tuple_name(); // 29. DsDataType + oss << "," + << detail::elementwise_op_name(); // 30. + // CDEElementwiseOperation + oss << ">"; + + return oss.str(); + } +}; + +} // namespace reflect +} // namespace ck_tile diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index e4d154ae10..2e918c5c2d 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -28,6 +28,10 @@ #include #include #include +#include +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" namespace ck_tile::reflect::detail { @@ -38,7 +42,7 @@ namespace impl { template consteval std::string_view type_name_impl() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v || std::is_same_v) return "fp16"; else if constexpr(std::is_same_v) return "fp32"; @@ -50,11 +54,11 @@ consteval std::string_view type_name_impl() return "s8"; else if constexpr(std::is_same_v) return "s32"; - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v || std::is_same_v) return "bf16"; - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v || std::is_same_v) return "fp8"; - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v || std::is_same_v) return "bf8"; else return std::string_view{}; // Return empty for supported types @@ -168,6 +172,17 @@ constexpr std::string_view pipeline_scheduler_name(ck::BlockGemmPipelineSchedule } } +constexpr std::string_view pipeline_scheduler_name(ck_tile::GemmPipelineScheduler sched) +{ + using enum ck_tile::GemmPipelineScheduler; + switch(sched) + { + case Default: return "Default"; + case Intrawave: return "Intrawave"; + case Interwave: return "Interwave"; + } +} + // Convert BlockGemmPipelineVersion enum to string constexpr std::string_view pipeline_version_name(ck::BlockGemmPipelineVersion ver) { @@ -206,6 +221,26 @@ constexpr std::string_view loop_scheduler_name(ck::LoopScheduler sched) } } +// Convert TailNumber enum to string +constexpr std::string_view tail_number_name(ck_tile::TailNumber tail_num) +{ + using enum ck_tile::TailNumber; + switch(tail_num) + { + case Odd: return "Odd"; + case Even: return "Even"; + case One: return "One"; + case Two: return "Two"; + case Three: return "Three"; + case Four: return "Four"; + case Five: return "Five"; + case Six: return "Six"; + case Seven: return "Seven"; + case Empty: return "Empty"; + case Full: return "Full"; + } +} + // Convert std::array to string template inline std::string array_to_string(const std::array& arr) @@ -356,17 +391,53 @@ constexpr std::string tuple_name() }(static_cast(nullptr)); } +template + requires requires { [](ck_tile::tuple*) {}(static_cast(nullptr)); } +constexpr std::string tuple_name() +{ + return [](ck_tile::tuple*) constexpr { + if constexpr(sizeof...(Ts) == 0) + { + return std::string("EmptyTuple"); + } + else if constexpr((IsLayoutType && ...)) + { + // Lambda wrapper for layout_name + auto layout_name_fn = []() { return layout_name(); }; + return detail::build_list_string("tuple", + layout_name_fn); + } + else if constexpr((IsDataType && ...)) + { + // Lambda wrapper for type_name + auto type_name_fn = []() { return type_name(); }; + return detail::build_list_string("tuple", type_name_fn); + } + else + { + static_assert((IsLayoutType && ...) || (IsDataType && ...), + "tuple elements must be all layouts or all data types, not mixed"); + return std::string{}; // unreachable + } + }(static_cast(nullptr)); +} + // Concept to check if a type is a ck::Tuple template concept IsCkTuple = requires { [](ck::Tuple*) {}(static_cast(nullptr)); }; +// Concept to check if a type is a ck_tile::tuple +template +concept IsCkTileTuple = + requires { [](ck_tile::tuple*) {}(static_cast(nullptr)); }; + // Deduces whether to use tuple_name or type_name // Handles both scalar data types and ck::Tuple types template constexpr std::string type_or_type_tuple_name() { - if constexpr(IsCkTuple) + if constexpr(IsCkTuple || IsCkTileTuple) { return tuple_name(); } diff --git a/experimental/builder/test/test_fwd_instance_traits.cpp b/experimental/builder/test/test_fwd_instance_traits.cpp index b57b20eb7d..af950b441c 100644 --- a/experimental/builder/test/test_fwd_instance_traits.cpp +++ b/experimental/builder/test/test_fwd_instance_traits.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace { @@ -720,4 +721,126 @@ TEST(InstanceTraits, DlInstanceStringReturnsCorrectFormat) EXPECT_EQ(instance_str, expected_str); } +TEST(InstanceTraits, TileInstanceStringReturnsCorrectFormat) +{ + using GroupedConvTraitsType = + ck_tile::GroupedConvTraits<2 /*NDimSpatial*/, + ck_tile::ConvolutionSpecialization::Default /*ConvSpec*/, + ck_tile::tensor_layout::convolution::NHWGC /*InLayout*/, + ck_tile::tensor_layout::convolution::GKYXC /*WeiLayout*/, + ck_tile::tuple<> /*DsLayout*/, + ck_tile::tensor_layout::convolution::NHWGK /*OutLayout*/, + 4 /*VectorSizeA*/, + 4 /*VectorSizeB*/, + 4 /*VectorSizeC*/, + 1 /*NumGroupsToMerge*/, + false /*EnableSplitImage*/>; + + using GemmShape = ck_tile::TileGemmShape< + ck_tile::sequence<128 /*M_Tile*/, 128 /*N_Tile*/, 32 /*K_Tile*/>, + ck_tile::sequence<4 /*M_Warp*/, 1 /*N_Warp*/, 1 /*K_Warp*/>, + ck_tile::sequence<16 /*M_Warp_Tile*/, 16 /*N_Warp_Tile*/, 16 /*K_Warp_Tile*/>>; + + using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner< + GemmShape, + GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum, + GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< + GroupedConvTraitsType::FixedGemmParams::kPadM, + GroupedConvTraitsType::FixedGemmParams::kPadN, + GroupedConvTraitsType::FixedGemmParams::kPadK, + false /*DoubleSmemBuffer*/, + typename GroupedConvTraitsType::AsLayoutFwd, + typename GroupedConvTraitsType::BsLayoutFwd, + typename GroupedConvTraitsType::CLayoutFwd, + GroupedConvTraitsType::FixedGemmParams::TransposeC, + GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity, + GroupedConvTraitsType::FixedGemmParams::Persistent, + 1 /*NumWaveGroups*/>; + + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem< + ck_tile::bf16_t /*InDataType*/, + ck_tile::bf16_t /*WeiDataType*/, + float /*AccDataType*/, + GemmShape, + GemmUniversalTraits, + ck_tile::GemmPipelineScheduler::Intrawave /*scheduler*/, + true /*has_hot_loop_v*/, + ck_tile::TailNumber::Full /*tail_number_v*/, + ck_tile::element_wise::PassThrough /*AElementwiseOperation*/, + ck_tile::element_wise::PassThrough /*BElementwiseOperation*/, + ck_tile::bf16_t /*OutDataType*/, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeA, + GroupedConvTraitsType::VectorSizeB>; + + using GemmPipeline = typename ck_tile::GemmPipelineAgBgCrCompV3; + + using ConvEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem /*DsDataType*/, + float /*AccDataType*/, + ck_tile::bf16_t /*OutDataType*/, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + typename GroupedConvTraitsType::FixedGemmParams::ELayout, + ck_tile::element_wise::PassThrough /*CDElementWise*/, + 128 /*MPerBlock*/, + 128 /*NPerBlock*/, + 4 /*M_Warp*/, + 1 /*N_Warp*/, + 16 /*M_Warp_Tile*/, + 16 /*N_Warp_Tile*/, + 16 /*K_Warp_Tile*/, + GroupedConvTraitsType::FixedGemmParams::TransposeC, + ck_tile::memory_operation_enum::set /*memory_operation*/, + 1 /*kNumWaveGroups*/, + GroupedConvTraitsType::FixedGemmParams::FixedVectorSize, + GroupedConvTraitsType::VectorSizeC>>; + + using GroupedConvFwdKernel = + ck_tile::device::GroupedConvolutionForwardKernel; + + std::string instance_str = ck_tile::reflect::instance_string(); + + std::string expected_str = "GroupedConvolutionForwardKernel" + "<2" // NDimSpatial + ",Default" // ConvSpecialization + ",NHWGC" // InLayout + ",GKYXC" // WeiLayout + ",EmptyTuple" // DsLayout + ",NHWGK" // OutLayout + ",4" // VectorSizeA + ",4" // VectorSizeB + ",4" // VectorSizeC + ",1" // NumGroupsToMerge + ",0" // EnableSplitImage + ",128" // MPerBlock + ",128" // NPerBlock + ",32" // KPerBlock + ",4" // MWarp + ",1" // NWarp + ",1" // KWarp + ",16" // MWarpTile + ",16" // NWarpTile + ",16" // KWarpTile + ",bf16" // ADataType + ",bf16" // BDataType + ",COMPUTE_V3" // BlkGemmPipelineVer + ",Intrawave" // BlkGemmPipeSched + ",0" // DoubleSmemBuffer + ",1" // NumWaveGroups + ",fp32" // AccDataType + ",bf16" // EDataType + ",EmptyTuple" // DsDataType + ",PassThrough" // CDEElementwiseOperation + ">"; + + EXPECT_EQ(instance_str, expected_str); +} + } // anonymous namespace diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp old mode 100755 new mode 100644 index 5bf8548470..b66c00e392 --- a/include/ck_tile/core/arch/arch.hpp +++ b/include/ck_tile/core/arch/arch.hpp @@ -299,12 +299,12 @@ CK_TILE_DEVICE void s_nop(index_t cnt = 0) #endif } -#define CK_CONSTANT_ADDRESS_SPACE \ - __attribute__((address_space( \ +#define CK_TILE_CONSTANT_ADDRESS_SPACE \ + __attribute__((address_space( \ static_cast>(address_space_enum::constant)))) template -__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) +__device__ T* cast_pointer_to_generic_address_space(T CK_TILE_CONSTANT_ADDRESS_SPACE* p) { // cast a pointer in "Constant" address space (4) to "Generic" address space (0) // only c-style pointer cast seems be able to be compiled @@ -315,13 +315,13 @@ __device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* } template -__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) +__host__ __device__ T CK_TILE_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) { // cast a pointer in "Generic" address space (0) to "Constant" address space (4) // only c-style pointer cast seems be able to be compiled; #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wold-style-cast" - return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) + return (T CK_TILE_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) #pragma clang diagnostic pop } diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp index 551dc6f50d..a72b1ba544 100644 --- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp @@ -190,7 +190,7 @@ struct GroupedGemmKernel */ CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3 { - using ConstantPointer = const void CK_CONSTANT_ADDRESS_SPACE*; + using ConstantPointer = const void CK_TILE_CONSTANT_ADDRESS_SPACE*; const auto kernel = kentry<1, Kernel, ConstantPointer, index_t>; int occupancy; HIP_CHECK_ERROR( @@ -518,7 +518,7 @@ struct GroupedGemmKernel // For non-persistent kernels template > - CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, + CK_TILE_DEVICE void operator()(const void CK_TILE_CONSTANT_ADDRESS_SPACE* gemm_descs_const, index_t group_count) const { const index_t block_id = ck_tile::get_block_1d_id(); @@ -541,7 +541,7 @@ struct GroupedGemmKernel template , typename = void> // extra template parameter to avoid redefinition - CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, + CK_TILE_DEVICE void operator()(const void CK_TILE_CONSTANT_ADDRESS_SPACE* gemm_descs_const, const index_t group_count) const { const index_t grid_size = ck_tile::get_grid_size(); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp index 91da3cd27b..b293097d89 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp @@ -164,6 +164,13 @@ struct GemmPipelineAgBgCrCompAsync : public BaseGemmPipelineAgBgCrCompAsync{}; static constexpr auto is_b_load_tr_v = bool_constant{}; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "COMPUTE_ASYNC"; + // clang-format on + } + CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize() { return Policy::template GetSmemSize(); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp index aaa04615fd..a1bbcbe990 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp @@ -170,6 +170,13 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3 using Base::PrefetchStages; using Base::UsePersistentKernel; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "COMPUTE_V3"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp index ff1e33bd5d..238b4e2389 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp @@ -172,6 +172,13 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4 static constexpr auto is_a_load_tr_v = bool_constant{}; static constexpr auto is_b_load_tr_v = bool_constant{}; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "COMPUTE_V4"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp index 7263ddd5a1..6343ff9872 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp @@ -99,6 +99,13 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5 static constexpr index_t NumWarps = BlockGemmShape::NumWarps; static constexpr index_t KTileSize = BlockGemmShape::WarpTile::at(I2{}); + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "COMPUTE_V5"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp index 2ae9001098..5b57560f6e 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp @@ -159,6 +159,13 @@ struct GemmPipelineAgBgCrCompV6 : public BaseGemmPipelineAgBgCrCompV6 static constexpr auto is_a_load_tr_v = bool_constant{}; static constexpr auto is_b_load_tr_v = bool_constant{}; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "COMPUTE_V6"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index d363626efd..ba71e3b6cb 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -214,6 +214,13 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem static constexpr auto is_a_load_tr_v = bool_constant{}; static constexpr auto is_b_load_tr_v = bool_constant{}; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "MEMORY"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index eb363d59b8..8a4fb59b51 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -70,6 +70,13 @@ struct GemmPipelineAGmemBGmemCRegV1 static constexpr index_t kLdsAlignmentInBytes = 16; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "BASIC_V1"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp index c309f8908a..32217e0024 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp @@ -70,6 +70,13 @@ struct GemmPipelineAGmemBGmemCRegV2 // For the basic gemm pipelien DoubleSmemBuffer set to be false naturally. static constexpr bool DoubleSmemBuffer = false; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "BASIC_V2"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp index 87f6c753b4..cae2bd0e9f 100644 --- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp +++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp @@ -176,6 +176,13 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2 static constexpr index_t dswrite_mIter = (DsWritePreIssue - 1) % MIterPerWarp; static constexpr index_t dswrite_kIter = (DsWritePreIssue - 1) / MIterPerWarp; + [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName() + { + // clang-format off + return "PRESHUFFLE_V2"; + // clang-format on + } + [[nodiscard]] CK_TILE_HOST static const std::string GetName() { // clang-format off diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp index 75ac1ca6ab..32f1279e93 100644 --- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp +++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp @@ -208,7 +208,7 @@ struct QuantGroupedGemmKernel */ CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3 { - using ConstantPointer = const void CK_CONSTANT_ADDRESS_SPACE*; + using ConstantPointer = const void CK_TILE_CONSTANT_ADDRESS_SPACE*; const auto kernel_func = kentry<1, Kernel, ConstantPointer, index_t>; int occupancy; HIP_CHECK_ERROR( @@ -499,7 +499,7 @@ struct QuantGroupedGemmKernel template , typename = void> // extra template parameter to avoid redefinition - CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, + CK_TILE_DEVICE void operator()(const void CK_TILE_CONSTANT_ADDRESS_SPACE* gemm_descs_const, const index_t group_count) const { const index_t grid_size = ck_tile::get_grid_size(); diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 7e70d2b422..6de331fe6d 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -16,6 +16,10 @@ #include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp" #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#ifdef CK_EXPERIMENTAL_BUILDER +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp" +#endif + namespace ck_tile { /// @brief The Grouped Convolution kernel device arguments. @@ -568,6 +572,19 @@ struct GroupedConvolutionForwardKernel // clang-format on } +#ifdef CK_EXPERIMENTAL_BUILDER + CK_TILE_HOST std::string GetInstanceString() const + { + static_assert(ck_tile::reflect::HasInstanceTraits, + "Specialization of instance_traits not found. Please check that a " + "specialization exists in file " + "ck_tile/builder/reflect/" + "instance_traits_tile_grouped_convolution_forward.hpp " + "for the given template parameters."); + return ck_tile::reflect::instance_string(); + } +#endif + CK_TILE_HOST static auto GridSize(const GroupedConvFwdKernelArgsSpecialized& kargs) { return dim3(