From 6bcd8bea15c2912a557536c44b218266ad9f54a3 Mon Sep 17 00:00:00 2001 From: pciolkosz Date: Thu, 4 Dec 2025 14:42:10 -0800 Subject: [PATCH] Move launch API from cudax to libcu++ (#6667) * Move launch API from cudax to libcu++ * Review feedback and test fixes * More fixes * GCC7 fix * Update libcudacxx/include/cuda/__launch/configuration.h * Fix old GCC --------- Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com> (cherry picked from commit 4ab39a768f23001ae2327093a20b1e303705aa0c) --- cudax/examples/simple_p2p.cu | 2 +- cudax/examples/vector_add.cu | 2 +- .../cuda/experimental/__execution/bulk.cuh | 4 +- .../cuda/experimental/__execution/queries.cuh | 6 +- .../__execution/stream/adaptor.cuh | 2 +- .../cuda/experimental/__launch/launch.cuh | 192 +++++----- cudax/include/cuda/experimental/launch.cuh | 2 +- cudax/test/CMakeLists.txt | 10 +- cudax/test/common/utility.cuh | 6 +- cudax/test/containers/uninitialized_buffer.cu | 4 +- cudax/test/launch/launch_smoke.cu | 32 +- cudax/test/stream/stream_smoke.cu | 2 +- examples/cudax/vector_add/vector_add.cu | 2 +- .../include/cuda/__launch/configuration.h | 182 +++++----- .../include/cuda/__launch/host_launch.h | 22 +- libcudacxx/include/cuda/__launch/launch.h | 334 +++++++++++++++++ libcudacxx/include/cuda/launch | 28 ++ .../cuda/ccclrt}/launch/configuration.cu | 91 +++-- .../ccclrt}/launch/dynamic_shared_memory.cu | 29 +- .../cuda/ccclrt/launch/launch_smoke.cu | 341 ++++++++++++++++++ .../cuda/containers/buffer/transform.cu | 3 +- 21 files changed, 974 insertions(+), 322 deletions(-) rename cudax/include/cuda/experimental/__launch/configuration.cuh => libcudacxx/include/cuda/__launch/configuration.h (84%) create mode 100644 libcudacxx/include/cuda/__launch/launch.h create mode 100644 libcudacxx/include/cuda/launch rename {cudax/test => libcudacxx/test/libcudacxx/cuda/ccclrt}/launch/configuration.cu (66%) rename {cudax/test => libcudacxx/test/libcudacxx/cuda/ccclrt}/launch/dynamic_shared_memory.cu (68%) create mode 100644 libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu index 050be513a69..42767b3972f 100644 --- a/cudax/examples/simple_p2p.cu +++ b/cudax/examples/simple_p2p.cu @@ -130,7 +130,7 @@ void test_cross_device_access_from_kernel( dev1_stream.wait(dev0_stream); // Kernel launch configuration - auto config = cudax::distribute<512>(dev0_buffer.size()); + auto config = cuda::distribute<512>(dev0_buffer.size()); // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing output to the GPU 1 buffer printf("Run kernel on GPU%d, taking source data from GPU%d and writing to " diff --git a/cudax/examples/vector_add.cu b/cudax/examples/vector_add.cu index 7fe938c8b8c..3779c7b7692 100644 --- a/cudax/examples/vector_add.cu +++ b/cudax/examples/vector_add.cu @@ -92,7 +92,7 @@ try // Define the kernel launch parameters constexpr int threadsPerBlock = 256; - auto config = cudax::distribute(numElements); + auto config = cuda::distribute(numElements); // Launch the vectorAdd kernel printf( diff --git a/cudax/include/cuda/experimental/__execution/bulk.cuh b/cudax/include/cuda/experimental/__execution/bulk.cuh index 5e09938a514..e243d88e5ea 100644 --- a/cudax/include/cuda/experimental/__execution/bulk.cuh +++ b/cudax/include/cuda/experimental/__execution/bulk.cuh @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include #include @@ -45,7 +46,6 @@ #include #include #include -#include #include @@ -73,7 +73,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t { constexpr int __block_threads = 256; const int __grid_blocks = ::cuda::ceil_div(static_cast(__shape), __block_threads); - return experimental::make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks)); + return make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks)); } using __launch_config_t = decltype(__get_launch_config(_Shape())); diff --git a/cudax/include/cuda/experimental/__execution/queries.cuh b/cudax/include/cuda/experimental/__execution/queries.cuh index eca7b476efc..c0017cf2b62 100644 --- a/cudax/include/cuda/experimental/__execution/queries.cuh +++ b/cudax/include/cuda/experimental/__execution/queries.cuh @@ -25,6 +25,7 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH #include _CCCL_SUPPRESS_DEPRECATED_POP +#include #include #include #include @@ -39,7 +40,6 @@ _CCCL_SUPPRESS_DEPRECATED_POP #include #include #include -#include #include @@ -314,14 +314,14 @@ _CCCL_GLOBAL_CONSTANT struct get_forward_progress_guarantee_t } get_forward_progress_guarantee{}; // By default, CUDA kernels are launched with a single thread and a single block. -using __single_threaded_config_base_t = decltype(experimental::make_config(grid_dims<1>(), block_dims<1>())); +using __single_threaded_config_base_t = decltype(make_config(grid_dims<1>(), block_dims<1>())); // We hide the complicated type of the default launch configuration so diagnositics are // easier to read. struct __single_threaded_config_t : __single_threaded_config_base_t { _CCCL_HOST_API constexpr __single_threaded_config_t() noexcept - : __single_threaded_config_base_t{experimental::make_config(grid_dims<1>(), block_dims<1>())} + : __single_threaded_config_base_t{make_config(grid_dims<1>(), block_dims<1>())} {} }; diff --git a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh index 00b1ee54b76..7498827acb6 100644 --- a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh +++ b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include @@ -36,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh index e672ac08213..b5b5cfcddeb 100644 --- a/cudax/include/cuda/experimental/__launch/launch.cuh +++ b/cudax/include/cuda/experimental/__launch/launch.cuh @@ -22,6 +22,8 @@ #endif // no system header #include +#include +#include #include #include #include @@ -38,43 +40,21 @@ #include #include #include -#include #include #include -namespace cuda::experimental -{ -template -__global__ static void __kernel_launcher(const _CCCL_GRID_CONSTANT _Config __conf, _Kernel __kernel_fn, _Args... __args) -{ - __kernel_fn(__conf, __args...); -} - -template -__global__ static void __kernel_launcher_no_config(_Kernel __kernel_fn, _Args... __args) -{ - __kernel_fn(__args...); -} +_CCCL_BEGIN_NAMESPACE_CUDA template -[[nodiscard]] _CCCL_HOST_API CUfunction __get_cufunction_of(kernel_ref __kernel) +[[nodiscard]] _CCCL_HOST_API ::CUfunction __get_cufunction_of(experimental::kernel_ref __kernel) { return ::cuda::__driver::__kernelGetFunction(__kernel.get()); } -template -[[nodiscard]] _CCCL_HOST_API ::CUfunction __get_cufunction_of(void (*__kernel)(_Args...)) -{ - ::cudaFunction_t __kernel_cufunction{}; - _CCCL_TRY_CUDA_API( - ::cudaGetFuncBySymbol, "Failed to get function from symbol", &__kernel_cufunction, (const void*) __kernel); - return (CUfunction) __kernel_cufunction; -} - _CCCL_TEMPLATE(typename _GraphInserter) -_CCCL_REQUIRES(graph_inserter<_GraphInserter>) -_CCCL_HOST_API graph_node_ref +_CCCL_REQUIRES(experimental::graph_inserter<_GraphInserter>) +_CCCL_HOST_API experimental::graph_node_ref __do_launch(_GraphInserter&& __inserter, ::CUlaunchConfig& __config, ::CUfunction __kernel, void** __args_ptrs) { ::CUDA_KERNEL_NODE_PARAMS __node_params{}; @@ -101,36 +81,44 @@ __do_launch(_GraphInserter&& __inserter, ::CUlaunchConfig& __config, ::CUfunctio // TODO skip the update if called on rvalue? __inserter.__clear_and_set_dependency_node(__node); - return graph_node_ref{__node, __inserter.get_graph().get()}; + return experimental::graph_node_ref{__node, __inserter.get_graph().get()}; } -_CCCL_HOST_API void inline __do_launch( - ::cuda::stream_ref __stream, ::CUlaunchConfig& __config, ::CUfunction __kernel, void** __args_ptrs) +_CCCL_TEMPLATE(typename _GraphInserter) +_CCCL_REQUIRES(experimental::graph_inserter<_GraphInserter>) +_CCCL_HOST_API ::cuda::stream_ref __stream_or_invalid([[maybe_unused]] const _GraphInserter& __inserter) +{ + return ::cuda::stream_ref{::cuda::invalid_stream}; +} + +_CCCL_TEMPLATE(typename _GraphInserter) +_CCCL_REQUIRES(experimental::graph_inserter<_GraphInserter>) +_CCCL_HOST_API _GraphInserter&& __forward_or_cast_to_stream_ref(_GraphInserter&& __inserter) { - __config.hStream = __stream.get(); -#if defined(_CUDAX_LAUNCH_CONFIG_TEST) - test_launch_kernel_replacement(__config, __kernel, __args_ptrs); -#else // ^^^ _CUDAX_LAUNCH_CONFIG_TEST ^^^ / vvv !_CUDAX_LAUNCH_CONFIG_TEST vvv - ::cuda::__driver::__launchKernel(__config, __kernel, __args_ptrs); -#endif // ^^^ !_CUDAX_LAUNCH_CONFIG_TEST ^^^ + return ::cuda::std::forward<_GraphInserter>(__inserter); } +_CCCL_END_NAMESPACE_CUDA + +namespace cuda::experimental +{ template _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __kernel, _ExpTypes... __args) { static_assert(!::cuda::std::is_same_v, "Can't launch a configuration without hierarchy dimensions"); ::CUlaunchConfig __config{}; - constexpr bool __has_cluster_level = has_level; - constexpr unsigned int __num_attrs_needed = __detail::kernel_config_count_attr_space(__conf) + __has_cluster_level; + constexpr bool __has_cluster_level = has_level; + constexpr unsigned int __num_attrs_needed = + ::cuda::__detail::kernel_config_count_attr_space(__conf) + __has_cluster_level; ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 1 : __num_attrs_needed]; __config.attrs = &__attrs[0]; __config.numAttrs = 0; - ::cudaError_t __status = __detail::apply_kernel_config(__conf, __config, __kernel); + ::cudaError_t __status = cuda::__detail::apply_kernel_config(__conf, __config, __kernel); if (__status != ::cudaSuccess) { - __throw_cuda_error(__status, "Failed to prepare a launch configuration"); + ::cuda::__throw_cuda_error(__status, "Failed to prepare a launch configuration"); } __config.gridDimX = static_cast(__conf.dims.extents(block, grid).x); @@ -151,33 +139,7 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k } const void* __pArgs[(sizeof...(__args) > 0) ? sizeof...(__args) : 1]{::cuda::std::addressof(__args)...}; - return __do_launch(::cuda::std::forward<_Dst>(__dst), __config, __kernel, const_cast(__pArgs)); -} - -_CCCL_TEMPLATE(typename _GraphInserter) -_CCCL_REQUIRES(graph_inserter<_GraphInserter>) -_CCCL_HOST_API ::cuda::stream_ref __stream_or_invalid([[maybe_unused]] const _GraphInserter& __inserter) -{ - return ::cuda::stream_ref{::cuda::invalid_stream}; -} - -_CCCL_HOST_API ::cuda::stream_ref inline __stream_or_invalid(::cuda::stream_ref __stream) -{ - return __stream; -} - -_CCCL_TEMPLATE(typename _GraphInserter) -_CCCL_REQUIRES(graph_inserter<_GraphInserter>) -_CCCL_HOST_API _GraphInserter&& __forward_or_cast_to_stream_ref(_GraphInserter&& __inserter) -{ - return ::cuda::std::forward<_GraphInserter>(__inserter); -} - -// cast to stream_ref to avoid instantiating launch_impl for every type convertible to stream_ref -template -_CCCL_HOST_API ::cuda::stream_ref __forward_or_cast_to_stream_ref(::cuda::stream_ref __stream) -{ - return __stream; + return ::cuda::__do_launch(::cuda::std::forward<_Dst>(__dst), __config, __kernel, const_cast(__pArgs)); } template @@ -186,8 +148,8 @@ _CCCL_CONCEPT work_submitter = //! @brief Launch a kernel functor with specified configuration and arguments //! -//! Launches a kernel functor object on the specified stream and with specified configuration. -//! Kernel functor object is a type with __device__ operator(). +//! Launches a kernel functor object on the specified stream and with specified +//! configuration. Kernel functor object is a type with __device__ operator(). //! Functor might or might not accept the configuration as its first argument. //! //! @par Snippet @@ -197,7 +159,8 @@ _CCCL_CONCEPT work_submitter = //! //! struct kernel { //! template -//! __device__ void operator()(Configuration conf, unsigned int thread_to_print) { +//! __device__ void operator()(Configuration conf, unsigned int +//! thread_to_print) { //! if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) { //! printf("Hello from the GPU\n"); //! } @@ -205,8 +168,9 @@ _CCCL_CONCEPT work_submitter = //! }; //! //! void launch_kernel(cuda::stream_ref stream) { -//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4)); -//! auto config = cudax::make_config(dims, cudax::launch_cooperative()); +//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), +//! cudax::grid_dims(4)); auto config = cudax::make_config(dims, +//! cudax::launch_cooperative()); //! //! cudax::launch(stream, config, kernel(), 42); //! } @@ -237,34 +201,34 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, kernel_config<_Dimensions, _Config...>, ::cuda::std::decay_t>...>) { - auto __launcher = + auto __launcher = ::cuda:: __kernel_launcher>...>; - return __launch_impl( - __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), + return ::cuda::experimental::__launch_impl( + ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), __combined, - __get_cufunction_of(__launcher), + ::cuda::__get_cufunction_of(__launcher), __combined, __kernel, - device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...); + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...); } else { static_assert(::cuda::std::is_invocable_v<_Kernel, ::cuda::std::decay_t>...>); auto __launcher = - __kernel_launcher_no_config<_Kernel, ::cuda::std::decay_t>...>; - return __launch_impl( - __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), + ::cuda::__kernel_launcher_no_config<_Kernel, ::cuda::std::decay_t>...>; + return ::cuda::experimental::__launch_impl( + ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), __combined, - __get_cufunction_of(__launcher), + ::cuda::__get_cufunction_of(__launcher), __kernel, - device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...); + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...); } } //! @brief Launch a kernel function with specified configuration and arguments //! -//! Launches a kernel function on the specified stream and with specified configuration. -//! Kernel function is a function with __global__ annotation. +//! Launches a kernel function on the specified stream and with specified +//! configuration. Kernel function is a function with __global__ annotation. //! Function might or might not accept the configuration as its first argument. //! //! @par Snippet @@ -280,8 +244,9 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, //! } //! //! void launch_kernel(cuda::stream_ref stream) { -//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4)); -//! auto config = cudax::make_config(dims, cudax::launch_cooperative()); +//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), +//! cudax::grid_dims(4)); auto config = cudax::make_config(dims, +//! cudax::launch_cooperative()); //! //! cudax::launch(stream, config, kernel, 42); //! } @@ -308,12 +273,12 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, _ActArgs&&... __args) { __ensure_current_device __dev_setter{__submitter}; - return __launch_impl, _ExpArgs...>( - __forward_or_cast_to_stream_ref<_Submitter>(__submitter), // + return ::cuda::experimental::__launch_impl, _ExpArgs...>( + ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(__submitter), // __conf, - __get_cufunction_of(__kernel), + ::cuda::__get_cufunction_of(__kernel), __conf, - device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); } //! @brief Launch a kernel with specified configuration and arguments @@ -334,10 +299,12 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, //! } //! //! void launch_kernel(cuda::stream_ref stream) { -//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4)); -//! auto config = cudax::make_config(dims, cudax::launch_cooperative()); +//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), +//! cudax::grid_dims(4)); auto config = cudax::make_config(dims, +//! cudax::launch_cooperative()); //! -//! cudax::launch(stream, config, cudax::kernel_ref{kernel, 42); +//! cudax::launch(stream, config, +//! cudax::kernel_ref{kernel, 42); //! } //! @endcode //! @@ -362,18 +329,18 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, _ActArgs&&... __args) { __ensure_current_device __dev_setter{__submitter}; - return __launch_impl, _ExpArgs...>( - __forward_or_cast_to_stream_ref<_Submitter>(__submitter), // + return ::cuda::experimental::__launch_impl, _ExpArgs...>( + ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(__submitter), // __conf, - __get_cufunction_of(__kernel), + ::cuda::__get_cufunction_of(__kernel), __conf, - device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); } //! @brief Launch a kernel function with specified configuration and arguments //! -//! Launches a kernel function on the specified stream and with specified configuration. -//! Kernel function is a function with __global__ annotation. +//! Launches a kernel function on the specified stream and with specified +//! configuration. Kernel function is a function with __global__ annotation. //! Function might or might not accept the configuration as its first argument. //! //! @par Snippet @@ -389,8 +356,9 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, //! } //! //! void launch_kernel(cuda::stream_ref stream) { -//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4)); -//! auto config = cudax::make_config(dims, cudax::launch_cooperative()); +//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), +//! cudax::grid_dims(4)); auto config = cudax::make_config(dims, +//! cudax::launch_cooperative()); //! //! cudax::launch(stream, config, kernel, 42); //! } @@ -416,11 +384,11 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, _ActArgs&&... __args) { __ensure_current_device __dev_setter{__submitter}; - return __launch_impl<_ExpArgs...>( - __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), // + return ::cuda::experimental::__launch_impl<_ExpArgs...>( + ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), // __conf, - __get_cufunction_of(__kernel), - device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); + ::cuda::__get_cufunction_of(__kernel), + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); } //! @brief Launch a kernel with specified configuration and arguments @@ -441,10 +409,12 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, //! } //! //! void launch_kernel(cuda::stream_ref stream) { -//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4)); -//! auto config = cudax::make_config(dims, cudax::launch_cooperative()); +//! auto dims = cudax::make_hierarchy(cudax::block_dims<128>(), +//! cudax::grid_dims(4)); auto config = cudax::make_config(dims, +//! cudax::launch_cooperative()); //! -//! cudax::launch(stream, config, cudax::kernel_ref{kernel}, 42); +//! cudax::launch(stream, config, +//! cudax::kernel_ref{kernel}, 42); //! } //! @endcode //! @@ -468,11 +438,11 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter, _ActArgs&&... __args) { __ensure_current_device __dev_setter{__submitter}; - return __launch_impl<_ExpArgs...>( - __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), // + return ::cuda::experimental::__launch_impl<_ExpArgs...>( + ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), // __conf, - __get_cufunction_of(__kernel), - device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); + ::cuda::__get_cufunction_of(__kernel), + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); } // diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh index 04b4cd04e9e..cf65b9471b7 100644 --- a/cudax/include/cuda/experimental/launch.cuh +++ b/cudax/include/cuda/experimental/launch.cuh @@ -11,9 +11,9 @@ #ifndef __CUDAX_LAUNCH___ #define __CUDAX_LAUNCH___ +#include #include -#include #include #include diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index c6bd29a333c..00168604f5f 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -40,15 +40,7 @@ cudax_add_catch2_test(test_target launch launch/launch_smoke.cu ) -cudax_add_catch2_test(test_target launch_configuration - launch/configuration.cu -) - -cudax_add_catch2_test(test_target launch_config_dynamic_smem - launch/dynamic_shared_memory.cu -) - -cudax_add_catch2_test(test_target execution +cudax_add_catch2_test(test_target execution ${cudax_target} execution/env.cu execution/policies/policies.cu execution/policies/get_execution_policy.cu diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh index 059a8c8d7f0..e8404b164ac 100644 --- a/cudax/test/common/utility.cuh +++ b/cudax/test/common/utility.cuh @@ -29,7 +29,7 @@ namespace { namespace test { -constexpr auto one_thread_dims = cudax::make_config(cuda::block_dims<1>(), cuda::grid_dims<1>()); +constexpr auto one_thread_dims = cuda::make_config(cuda::block_dims<1>(), cuda::grid_dims<1>()); struct _malloc_pinned { @@ -39,13 +39,13 @@ private: public: explicit _malloc_pinned(std::size_t size) { - cudax::__ensure_current_device guard(cuda::device_ref{0}); + cuda::__ensure_current_context guard(cuda::device_ref{0}); _CCCL_TRY_CUDA_API(::cudaMallocHost, "failed to allocate pinned memory", &pv, size); } ~_malloc_pinned() { - cudax::__ensure_current_device guard(cuda::device_ref{0}); + cuda::__ensure_current_context guard(cuda::device_ref{0}); [[maybe_unused]] auto status = ::cudaFreeHost(pv); } diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu index 8850373d472..e3f5df9bda3 100644 --- a/cudax/test/containers/uninitialized_buffer.cu +++ b/cudax/test/containers/uninitialized_buffer.cu @@ -241,7 +241,7 @@ C2H_TEST("uninitialized_buffer is usable with cudax::launch", "[container]") const int grid_size = 4; cudax::uninitialized_buffer buffer{ cuda::device_default_memory_pool(cuda::device_ref{0}), 1024}; - auto configuration = cudax::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>()); + auto configuration = cuda::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>()); cudax::stream stream{cuda::device_ref{0}}; @@ -253,7 +253,7 @@ C2H_TEST("uninitialized_buffer is usable with cudax::launch", "[container]") const int grid_size = 4; const cudax::uninitialized_buffer buffer{ cuda::device_default_memory_pool(cuda::device_ref{0}), 1024}; - auto configuration = cudax::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>()); + auto configuration = cuda::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>()); cudax::stream stream{cuda::device_ref{0}}; diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu index 3e6eb0615fc..fc884bf74de 100644 --- a/cudax/test/launch/launch_smoke.cu +++ b/cudax/test/launch/launch_smoke.cu @@ -38,7 +38,7 @@ struct kernel_run_proof_check void check_kernel_run(cudax::path_builder& pb) { - cudax::launch(pb, cudax::make_config(cuda::block_dims<1>, cuda::grid_dims<1>), kernel_run_proof_check{}); + cudax::launch(pb, cuda::make_config(cuda::block_dims<1>, cuda::grid_dims<1>), kernel_run_proof_check{}); } struct functor_int_argument @@ -88,7 +88,7 @@ struct dynamic_smem_single template __device__ void operator()(Config config) { - decltype(auto) dynamic_smem = cudax::device::dynamic_shared_memory_view(config); + decltype(auto) dynamic_smem = cuda::dynamic_shared_memory_view(config); static_assert(::cuda::std::is_same_v); CUDAX_REQUIRE(::cuda::device::is_object_from(dynamic_smem, ::cuda::device::address_space::shared)); kernel_run_proof = true; @@ -101,7 +101,7 @@ struct dynamic_smem_span template __device__ void operator()(Config config, int size) { - auto dynamic_smem = cudax::device::dynamic_shared_memory_view(config); + auto dynamic_smem = cuda::dynamic_shared_memory_view(config); static_assert(decltype(dynamic_smem)::extent == Extent); static_assert(::cuda::std::is_same_v); CUDAX_REQUIRE(dynamic_smem.size() == size); @@ -168,7 +168,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst) const int grid_size = 4; constexpr int block_size = 256; auto dimensions = cuda::make_hierarchy(cuda::grid_dims(grid_size), cuda::block_dims<256>()); - auto config = cudax::make_config(dimensions); + auto config = cuda::make_config(dimensions); // Not taking dims { @@ -264,7 +264,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst) auto test = [&](const auto& input_config) { // Single element { - auto config = input_config.add(cudax::dynamic_shared_memory()); + auto config = input_config.add(cuda::dynamic_shared_memory()); cudax::launch(dst, config, dynamic_smem_single()); check_kernel_run(dst); @@ -273,7 +273,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst) // Dynamic span { const int size = 2; - auto config = input_config.add(cudax::dynamic_shared_memory(size)); + auto config = input_config.add(cuda::dynamic_shared_memory(size)); cudax::launch(dst, config, dynamic_smem_span(), size); check_kernel_run(dst); } @@ -281,14 +281,14 @@ void launch_smoke_test(StreamOrPathBuilder& dst) // Static span { constexpr int size = 3; - auto config = input_config.add(cudax::dynamic_shared_memory()); + auto config = input_config.add(cuda::dynamic_shared_memory()); cudax::launch(dst, config, dynamic_smem_span(), size); check_kernel_run(dst); } }; test(config); - test(config.add(cudax::cooperative_launch(), cudax::launch_priority(0))); + test(config.add(cuda::cooperative_launch(), cuda::launch_priority(0))); } } @@ -361,23 +361,23 @@ void test_default_config() SECTION("Combine with empty") { - kernel_with_default_config kernel{cudax::make_config(block, grid, cudax::cooperative_launch())}; - static_assert(cudax::__is_kernel_config); - static_assert(cudax::__kernel_has_default_config); + kernel_with_default_config kernel{cuda::make_config(block, grid, cuda::cooperative_launch())}; + static_assert(cuda::__is_kernel_config); + static_assert(cuda::__kernel_has_default_config); - cudax::launch(stream, cudax::make_config(), kernel, verify_lambda); + cudax::launch(stream, cuda::make_config(), kernel, verify_lambda); stream.sync(); } SECTION("Combine with no overlap") { - kernel_with_default_config kernel{cudax::make_config(block)}; - cudax::launch(stream, cudax::make_config(grid, cudax::cooperative_launch()), kernel, verify_lambda); + kernel_with_default_config kernel{cuda::make_config(block)}; + cudax::launch(stream, cuda::make_config(grid, cuda::cooperative_launch()), kernel, verify_lambda); stream.sync(); } SECTION("Combine with overlap") { - kernel_with_default_config kernel{cudax::make_config(cuda::block_dims<1>, cudax::cooperative_launch())}; - cudax::launch(stream, cudax::make_config(block, grid, cudax::cooperative_launch()), kernel, verify_lambda); + kernel_with_default_config kernel{cuda::make_config(cuda::block_dims<1>, cuda::cooperative_launch())}; + cudax::launch(stream, cuda::make_config(block, grid, cuda::cooperative_launch()), kernel, verify_lambda); stream.sync(); } } diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu index 738d0a37824..734df9fc4f7 100644 --- a/cudax/test/stream/stream_smoke.cu +++ b/cudax/test/stream/stream_smoke.cu @@ -28,7 +28,7 @@ C2H_CCCLRT_TEST("Can create a stream and launch work into it", "[stream]") C2H_CCCLRT_TEST("From native handle", "[stream]") { - cudax::__ensure_current_device guard(cuda::device_ref{0}); + cuda::__ensure_current_context guard(cuda::device_ref{0}); cudaStream_t handle; CUDART(cudaStreamCreate(&handle)); { diff --git a/examples/cudax/vector_add/vector_add.cu b/examples/cudax/vector_add/vector_add.cu index 7fe938c8b8c..3779c7b7692 100644 --- a/examples/cudax/vector_add/vector_add.cu +++ b/examples/cudax/vector_add/vector_add.cu @@ -92,7 +92,7 @@ try // Define the kernel launch parameters constexpr int threadsPerBlock = 256; - auto config = cudax::distribute(numElements); + auto config = cuda::distribute(numElements); // Launch the vectorAdd kernel printf( diff --git a/cudax/include/cuda/experimental/__launch/configuration.cuh b/libcudacxx/include/cuda/__launch/configuration.h similarity index 84% rename from cudax/include/cuda/experimental/__launch/configuration.cuh rename to libcudacxx/include/cuda/__launch/configuration.h index ddb9cf6b1bd..d9b52872138 100644 --- a/cudax/include/cuda/experimental/__launch/configuration.cuh +++ b/libcudacxx/include/cuda/__launch/configuration.h @@ -1,6 +1,6 @@ //===----------------------------------------------------------------------===// // -// Part of CUDA Experimental in CUDA C++ Core Libraries, +// Part of libcu++, the C++ Standard Library for your entire system, // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -8,28 +8,37 @@ // //===----------------------------------------------------------------------===// -#ifndef _CUDAX__LAUNCH_CONFIGURATION_CUH -#define _CUDAX__LAUNCH_CONFIGURATION_CUH +#ifndef _CUDA___LAUNCH_CONFIGURATION_H +#define _CUDA___LAUNCH_CONFIGURATION_H -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header -#include +#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA -#if _CCCL_STD_VER >= 2017 -namespace cuda::experimental -{ template struct kernel_config; @@ -103,7 +112,7 @@ inline constexpr bool no_duplicate_options = * * @par Snippet * @code - * #include + * #include * #include * * template @@ -114,10 +123,10 @@ inline constexpr bool no_duplicate_options = * } * * void kernel_launch(cuda::stream_ref stream) { - * auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4)); - * auto conf = cudax::make_configuration(dims, cooperative_launch()); + * auto dims = cuda::make_hierarchy(cuda::block<128>(), cuda::grid(4)); + * auto conf = cuda::make_configuration(dims, cooperative_launch()); * - * cudax::launch(stream, conf, kernel); + * cuda::launch(stream, conf, kernel); * } * @endcode */ @@ -188,8 +197,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024; * * When launch configuration contains this option, that configuration can be * then passed to dynamic_shared_memory_view to get the view_type over the - * dynamic shared memory. It is also possible to obtain that memory through the - * original extern __shared__ variable[] declaration. + * dynamic shared memory. It is also possible to obtain that memory through + * the original extern __shared__ variable[] declaration. * * CUDA guarantees that each device has at least 48kB of shared memory * per block, but most devices have more than that. @@ -199,21 +208,21 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024; * * @par Snippet * @code - * #include + * #include * * template * __global__ void kernel(Configuration conf) * { - * auto dynamic_shared = cudax::dynamic_shared_memory_view(conf); + * auto dynamic_shared = cuda::dynamic_shared_memory_view(conf); * dynamic_shared[0] = 1; * } * * void kernel_launch(cuda::stream_ref stream) { - * auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4)); - * auto conf = cudax::make_configuration(dims, + * auto dims = cuda::make_hierarchy(cuda::block<128>(), cuda::grid(4)); + * auto conf = cuda::make_configuration(dims, * dynamic_shared_memory()); * - * cudax::launch(stream, conf, kernel); + * cuda::launch(stream, conf, kernel); * } * @endcode * @par @@ -246,10 +255,10 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory bool __non_portable_{}; //!< \c true if the object was created with //!< non_portable flag. - using typename __base_type::value_type; //!< Value type of the dynamic shared - //!< memory elements. + using typename __base_type::value_type; //!< Value type of the dynamic + //!< shared memory elements. using typename __base_type::view_type; //!< The view type returned by the - //!< cuda::device::dynamic_shared_memory_view(config). + //!< cuda::dynamic_shared_memory_view(config). static constexpr bool is_relevant_on_device = true; static constexpr __detail::launch_option_kind kind = __detail::launch_option_kind::dynamic_shared_memory; @@ -327,8 +336,8 @@ template { ::cudaError_t __status = ::cudaSuccess; - // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the - // function to be loaded. + // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires + // the function to be loaded. if (::cuda::__driver::__version_at_least(12, 4)) { __status = ::cuda::__driver::__functionLoadNoThrow(__kernel); @@ -452,11 +461,11 @@ _CCCL_CONCEPT __kernel_has_default_config = /** * @brief Type describing a kernel launch configuration * - * This type should not be constructed directly and make_config helper function - * should be used instead + * This type should not be constructed directly and make_config helper + * function should be used instead * * @tparam Dimensions - * cuda::experimental::hierarchy_dimensions instance that describes dimensions + * cuda::hierarchy_dimensions instance that describes dimensions * of thread hierarchy in this configuration object * * @tparam Options @@ -501,13 +510,13 @@ struct kernel_config * and the configuration from argument. It contains dimensions that are * combination of dimensions in this object and the other configuration. The * resulting hierarchy holds levels present in both hierarchies. In case of - * overlap of levels hierarchy from this configuration is prioritized, so the - * result always holds all levels from this hierarchy and non-overlapping - * levels from the other hierarchy. This behavior is the same as `combine()` - * member function of the hierarchy type. The result also contains - * configuration options from both configurations. In case the same type of a - * configuration option is present in both configuration this configuration is - * copied into the resulting configuration. + * overlap of levels hierarchy from this configuration is prioritized, so + * the result always holds all levels from this hierarchy and + * non-overlapping levels from the other hierarchy. This behavior is the + * same as `combine()` member function of the hierarchy type. The result + * also contains configuration options from both configurations. In case the + * same type of a configuration option is present in both configuration this + * configuration is copied into the resulting configuration. * * @param __other_config * Other configuration to combine with this configuration @@ -527,12 +536,12 @@ struct kernel_config * functor * * Returns a new `kernel_config` that is a combination of this configuration - * and a default configuration from the kernel argument. Default configuration - * is a `kernel_config` object returned from `default_config()` member - * function of the kernel type. The configurations are combined using the - * `combine()` member function of this configuration. If the kernel has no - * default configuration, a copy of this configuration is returned without any - * changes. + * and a default configuration from the kernel argument. Default + * configuration is a `kernel_config` object returned from + * `default_config()` member function of the kernel type. The configurations + * are combined using the `combine()` member function of this configuration. + * If the kernel has no default configuration, a copy of this configuration + * is returned without any changes. * * @param __kernel * Kernel functor to search for the default configuration @@ -550,22 +559,19 @@ struct kernel_config } } }; -} // namespace cuda::experimental - -_CCCL_BEGIN_NAMESPACE_CUDA // We can consider removing the operator&, but its convenient for in-line // construction template _CCCL_HOST_API constexpr auto -operator&(const experimental::kernel_config& config, const NewLevel& new_level) noexcept +operator&(const kernel_config& config, const NewLevel& new_level) noexcept { return kernel_config(hierarchy_add_level(config.dims, new_level), config.options); } template _CCCL_HOST_API constexpr auto -operator&(const NewLevel& new_level, const experimental::kernel_config& config) noexcept +operator&(const NewLevel& new_level, const kernel_config& config) noexcept { return kernel_config(hierarchy_add_level(config.dims, new_level), config.options); } @@ -574,13 +580,9 @@ template _CCCL_HOST_API constexpr auto operator&(const level_dimensions& l1, const level_dimensions& l2) noexcept { - return experimental::kernel_config(cuda::make_hierarchy(l1, l2)); + return kernel_config(::cuda::make_hierarchy(l1, l2)); } -_CCCL_END_NAMESPACE_CUDA - -namespace cuda::experimental -{ template auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts) { @@ -634,8 +636,8 @@ make_config(const hierarchy_dimensions& dims, const Opts& * * @par Snippet * @code - * #include - * using namespace cuda::experimental; + * #include + * using namespace cuda; * * constexpr int threadsPerBlock = 256; * auto dims = distribute(numElements); @@ -706,49 +708,30 @@ inline unsigned int constexpr kernel_config_count_attr_space(const kernel_config return (0 + ... + Options::needs_attribute_space); } -template -[[nodiscard]] cudaError_t apply_kernel_config( - const kernel_config& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept -{ - cudaError_t status = cudaSuccess; - - ::cuda::std::apply( - [&](auto&... config_options) { - // Use short-cutting && to skip the rest on error, is this too - // convoluted? - (void) (... && [&](cudaError_t call_status) { - status = call_status; - return call_status == cudaSuccess; - }(config_options.apply(cuda_config, kernel))); - }, - config.options); - - return status; -} - template [[nodiscard]] cudaError_t apply_kernel_config( const kernel_config& config, CUlaunchConfig& cuda_config, CUfunction kernel) noexcept { - cudaError_t status = cudaSuccess; - - ::cuda::std::apply( + return ::cuda::std::apply( [&](auto&... config_options) { + cudaError_t __status = cudaSuccess; + // Use short-cutting && to skip the rest on error, is this too // convoluted? - (void) (... && [&](cudaError_t call_status) { - status = call_status; + // For some reason gcc 7 complains about __status capture, so we pass it as a reference + (void) (... && [](cudaError_t call_status, cudaError_t& __status_out) { + __status_out = call_status; return call_status == cudaSuccess; - }(::cuda::experimental::__apply_launch_option(config_options, cuda_config, kernel))); + }(::cuda::__apply_launch_option(config_options, cuda_config, kernel), __status)); + + return __status; }, config.options); - - return status; } } // namespace __detail -namespace device -{ +# if _CCCL_CUDA_COMPILATION() + template _CCCL_DEVICE_API decltype(auto) dynamic_shared_memory_view(const kernel_config<_Dims, _Opts...>& __config) noexcept { @@ -759,10 +742,13 @@ _CCCL_DEVICE_API decltype(auto) dynamic_shared_memory_view(const kernel_config<_ extern __shared__ unsigned char __cccl_device_dyn_smem[]; return __opt.__make_view(reinterpret_cast(__cccl_device_dyn_smem)); } -} // namespace device -} // namespace cuda::experimental -#endif // _CCCL_STD_VER >= 2017 -#include +# endif // _CCCL_CUDA_COMPILATION() + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) -#endif // _CUDAX__LAUNCH_CONFIGURATION_CUH +#endif // _CUDA___LAUNCH_CONFIGURATION_H diff --git a/libcudacxx/include/cuda/__launch/host_launch.h b/libcudacxx/include/cuda/__launch/host_launch.h index cf44f020d0e..7c700bfa42e 100644 --- a/libcudacxx/include/cuda/__launch/host_launch.h +++ b/libcudacxx/include/cuda/__launch/host_launch.h @@ -21,15 +21,17 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include -#include -#include -#include +#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) -#include +# include +# include +# include +# include +# include +# include +# include + +# include _CCCL_BEGIN_NAMESPACE_CUDA @@ -106,6 +108,8 @@ _CCCL_HOST_API void host_launch(stream_ref __stream, ::cuda::std::reference_wrap } _CCCL_END_NAMESPACE_CUDA -#include +# include + +#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) #endif // !_CUDA___LAUNCH_HOST_LAUNCH_H diff --git a/libcudacxx/include/cuda/__launch/launch.h b/libcudacxx/include/cuda/__launch/launch.h new file mode 100644 index 00000000000..a9066e00b91 --- /dev/null +++ b/libcudacxx/include/cuda/__launch/launch.h @@ -0,0 +1,334 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___LAUNCH_LAUNCH_H +#define _CUDA___LAUNCH_LAUNCH_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA + +# if _CCCL_CUDA_COMPILATION() + +template +__global__ static void __kernel_launcher(const _CCCL_GRID_CONSTANT _Config __conf, _Kernel __kernel_fn, _Args... __args) +{ + __kernel_fn(__conf, __args...); +} + +template +__global__ static void __kernel_launcher_no_config(_Kernel __kernel_fn, _Args... __args) +{ + __kernel_fn(__args...); +} + +# endif // _CCCL_CUDA_COMPILATION() + +template +[[nodiscard]] _CCCL_HOST_API ::CUfunction __get_cufunction_of(void (*__kernel)(_Args...)) +{ + ::cudaFunction_t __kernel_cufunction{}; + _CCCL_TRY_CUDA_API( + ::cudaGetFuncBySymbol, "Failed to get function from symbol", &__kernel_cufunction, (const void*) __kernel); + return (::CUfunction) __kernel_cufunction; +} + +_CCCL_HOST_API void inline __do_launch( + ::cuda::stream_ref __stream, ::CUlaunchConfig& __config, ::CUfunction __kernel, void** __args_ptrs) +{ + __config.hStream = __stream.get(); +# if defined(_CCCLRT_LAUNCH_CONFIG_TEST) + test_launch_kernel_replacement(__config, __kernel, __args_ptrs); +# else // ^^^ _CUDAX_LAUNCH_CONFIG_TEST ^^^ / vvv !_CUDAX_LAUNCH_CONFIG_TEST vvv + ::cuda::__driver::__launchKernel(__config, __kernel, __args_ptrs); +# endif // ^^^ !_CUDAX_LAUNCH_CONFIG_TEST ^^^ +} + +template +_CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __kernel, _ExpTypes... __args) +{ + static_assert(!::cuda::std::is_same_v, + "Can't launch a configuration without hierarchy dimensions"); + ::CUlaunchConfig __config{}; + constexpr bool __has_cluster_level = has_level; + constexpr unsigned int __num_attrs_needed = __detail::kernel_config_count_attr_space(__conf) + __has_cluster_level; + ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 1 : __num_attrs_needed]; + __config.attrs = &__attrs[0]; + __config.numAttrs = 0; + + ::cudaError_t __status = __detail::apply_kernel_config(__conf, __config, __kernel); + if (__status != ::cudaSuccess) + { + ::cuda::__throw_cuda_error(__status, "Failed to prepare a launch configuration"); + } + + __config.gridDimX = static_cast(__conf.dims.extents(block, grid).x); + __config.gridDimY = static_cast(__conf.dims.extents(block, grid).y); + __config.gridDimZ = static_cast(__conf.dims.extents(block, grid).z); + __config.blockDimX = static_cast(__conf.dims.extents(thread, block).x); + __config.blockDimY = static_cast(__conf.dims.extents(thread, block).y); + __config.blockDimZ = static_cast(__conf.dims.extents(thread, block).z); + + if constexpr (__has_cluster_level) + { + ::CUlaunchAttribute __cluster_dims_attr{}; + __cluster_dims_attr.id = ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; + __cluster_dims_attr.value.clusterDim.x = static_cast(__conf.dims.extents(block, cluster).x); + __cluster_dims_attr.value.clusterDim.y = static_cast(__conf.dims.extents(block, cluster).y); + __cluster_dims_attr.value.clusterDim.z = static_cast(__conf.dims.extents(block, cluster).z); + __config.attrs[__config.numAttrs++] = __cluster_dims_attr; + } + + const void* __pArgs[(sizeof...(__args) > 0) ? sizeof...(__args) : 1]{::cuda::std::addressof(__args)...}; + return ::cuda::__do_launch(::cuda::std::forward<_Dst>(__dst), __config, __kernel, const_cast(__pArgs)); +} + +_CCCL_HOST_API ::cuda::stream_ref inline __stream_or_invalid(::cuda::stream_ref __stream) +{ + return __stream; +} + +// cast to stream_ref to avoid instantiating launch_impl for every type +// convertible to stream_ref +template +_CCCL_HOST_API ::cuda::stream_ref __forward_or_cast_to_stream_ref(::cuda::stream_ref __stream) +{ + return __stream; +} + +template +_CCCL_CONCEPT work_submitter = ::cuda::std::is_convertible_v<_Submitter, ::cuda::stream_ref>; + +# if _CCCL_CUDA_COMPILATION() + +//! @brief Launch a kernel functor with specified configuration and arguments +//! +//! Launches a kernel functor object on the specified stream and with specified +//! configuration. Kernel functor object is a type with __device__ operator(). +//! Functor might or might not accept the configuration as its first argument. +//! +//! @par Snippet +//! @code +//! #include +//! #include +//! +//! struct kernel { +//! template +//! __device__ void operator()(Configuration conf, unsigned int +//! thread_to_print) { +//! if (conf.dims.rank(cuda::thread, cuda::grid) == thread_to_print) { +//! printf("Hello from the GPU\n"); +//! } +//! } +//! }; +//! +//! void launch_kernel(cuda::stream_ref stream) { +//! auto dims = cuda::make_hierarchy(cuda::block_dims<128>(), +//! cuda::grid_dims(4)); auto config = cuda::make_config(dims, +//! cuda::launch_cooperative()); +//! +//! cuda::launch(stream, config, kernel(), 42); +//! } +//! @endcode +//! +//! @param stream +//! cuda::stream_ref to launch the kernel into +//! +//! @param conf +//! configuration for this launch +//! +//! @param kernel +//! kernel functor to be launched +//! +//! @param args +//! arguments to be passed into the kernel functor +_CCCL_TEMPLATE(typename... _Args, typename... _Config, typename _Submitter, typename _Dimensions, typename _Kernel) +_CCCL_REQUIRES(work_submitter<_Submitter> _CCCL_AND(!::cuda::std::is_pointer_v<_Kernel>) + _CCCL_AND(!::cuda::std::is_function_v<_Kernel>)) +_CCCL_HOST_API auto launch(_Submitter&& __submitter, + const kernel_config<_Dimensions, _Config...>& __conf, + const _Kernel& __kernel, + _Args&&... __args) +{ + __ensure_current_context __dev_setter{__submitter}; + auto __combined = __conf.combine_with_default(__kernel); + if constexpr (::cuda::std::is_invocable_v<_Kernel, + kernel_config<_Dimensions, _Config...>, + ::cuda::std::decay_t>...>) + { + auto __launcher = + __kernel_launcher>...>; + return __launch_impl( + cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), + __combined, + ::cuda::__get_cufunction_of(__launcher), + __combined, + __kernel, + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...); + } + else + { + static_assert(::cuda::std::is_invocable_v<_Kernel, ::cuda::std::decay_t>...>); + auto __launcher = + __kernel_launcher_no_config<_Kernel, ::cuda::std::decay_t>...>; + return ::cuda::__launch_impl( + cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), + __combined, + ::cuda::__get_cufunction_of(__launcher), + __kernel, + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...); + } +} + +# endif // _CCCL_CUDA_COMPILATION() + +//! @brief Launch a kernel function with specified configuration and arguments +//! +//! Launches a kernel function on the specified stream and with specified +//! configuration. Kernel function is a function with __global__ annotation. +//! Function might or might not accept the configuration as its first argument. +//! +//! @par Snippet +//! @code +//! #include +//! #include +//! +//! template +//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) { +//! if (conf.dims.rank(cuda::thread, cuda::grid) == thread_to_print) { +//! printf("Hello from the GPU\n"); +//! } +//! } +//! +//! void launch_kernel(cuda::stream_ref stream) { +//! auto dims = cuda::make_hierarchy(cuda::block_dims<128>(), +//! cuda::grid_dims(4)); auto config = cuda::make_config(dims, +//! cuda::launch_cooperative()); +//! +//! cuda::launch(stream, config, kernel, 42); +//! } +//! @endcode +//! +//! @param stream +//! cuda::stream_ref to launch the kernel into +//! +//! @param conf +//! configuration for this launch +//! +//! @param kernel +//! kernel function to be launched +//! +//! @param args +//! arguments to be passed into the kernel function +//! +_CCCL_TEMPLATE( + typename... _ExpArgs, typename... _ActArgs, typename _Submitter, typename... _Config, typename _Dimensions) +_CCCL_REQUIRES(work_submitter<_Submitter> _CCCL_AND(sizeof...(_ExpArgs) == sizeof...(_ActArgs))) +_CCCL_HOST_API auto launch(_Submitter&& __submitter, + const kernel_config<_Dimensions, _Config...>& __conf, + void (*__kernel)(kernel_config<_Dimensions, _Config...>, _ExpArgs...), + _ActArgs&&... __args) +{ + __ensure_current_context __dev_setter{__submitter}; + return ::cuda::__launch_impl, + _ExpArgs...>( + cuda::__forward_or_cast_to_stream_ref<_Submitter>(__submitter), // + __conf, + ::cuda::__get_cufunction_of(__kernel), + __conf, + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); +} + +//! @brief Launch a kernel function with specified configuration and arguments +//! +//! Launches a kernel function on the specified stream and with specified +//! configuration. Kernel function is a function with __global__ annotation. +//! Function might or might not accept the configuration as its first argument. +//! +//! @par Snippet +//! @code +//! #include +//! #include +//! +//! template +//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) { +//! if (conf.dims.rank(cuda::thread, cuda::grid) == thread_to_print) { +//! printf("Hello from the GPU\n"); +//! } +//! } +//! +//! void launch_kernel(cuda::stream_ref stream) { +//! auto dims = cuda::make_hierarchy(cuda::block_dims<128>(), +//! cuda::grid_dims(4)); auto config = cuda::make_config(dims, +//! cuda::launch_cooperative()); +//! +//! cuda::launch(stream, config, kernel, 42); +//! } +//! @endcode +//! +//! @param __stream +//! cuda::stream_ref to launch the kernel into +//! +//! @param __conf +//! configuration for this launch +//! +//! @param __kernel +//! kernel function to be launched +//! +//! @param __args +//! arguments to be passed into the kernel function +_CCCL_TEMPLATE( + typename... _ExpArgs, typename... _ActArgs, typename _Submitter, typename... _Config, typename _Dimensions) +_CCCL_REQUIRES(work_submitter<_Submitter> _CCCL_AND(sizeof...(_ExpArgs) == sizeof...(_ActArgs))) +_CCCL_HOST_API auto launch(_Submitter&& __submitter, + const kernel_config<_Dimensions, _Config...>& __conf, + void (*__kernel)(_ExpArgs...), + _ActArgs&&... __args) +{ + __ensure_current_context __dev_setter{__submitter}; + return ::cuda::__launch_impl<_ExpArgs...>( + cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), // + __conf, + ::cuda::__get_cufunction_of(__kernel), + device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...); +} + +_CCCL_END_NAMESPACE_CUDA + +# include + +#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC) + +#endif // _CUDA___LAUNCH_LAUNCH_H diff --git a/libcudacxx/include/cuda/launch b/libcudacxx/include/cuda/launch new file mode 100644 index 00000000000..5647ee62c47 --- /dev/null +++ b/libcudacxx/include/cuda/launch @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_LAUNCH +#define _CUDA_LAUNCH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#endif //_CUDA_LAUNCH diff --git a/cudax/test/launch/configuration.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu similarity index 66% rename from cudax/test/launch/configuration.cu rename to libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu index 9f171fce2f1..cf7e15183cd 100644 --- a/cudax/test/launch/configuration.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu @@ -17,8 +17,8 @@ void test_launch_kernel_replacement(CUlaunchConfig& config, CUfunction kernel, v // matches the expected configuration and calls the original launch kernel // function if it does. If the configuration does not match, it will fail the // test. -#define _CUDAX_LAUNCH_CONFIG_TEST -#include +#define _CCCLRT_LAUNCH_CONFIG_TEST +#include #include @@ -30,15 +30,15 @@ void test_launch_kernel_replacement(CUlaunchConfig& config, CUfunction kernel, v replacementCalled = true; bool has_cluster = false; - CUDAX_CHECK(expectedConfig.gridDimX == config.gridDimX); - CUDAX_CHECK(expectedConfig.gridDimY == config.gridDimY); - CUDAX_CHECK(expectedConfig.gridDimZ == config.gridDimZ); - CUDAX_CHECK(expectedConfig.blockDimX == config.blockDimX); - CUDAX_CHECK(expectedConfig.blockDimY == config.blockDimY); - CUDAX_CHECK(expectedConfig.blockDimZ == config.blockDimZ); - CUDAX_CHECK(expectedConfig.sharedMemBytes == config.sharedMemBytes); - CUDAX_CHECK(expectedConfig.hStream == config.hStream); - CUDAX_CHECK(expectedConfig.numAttrs == config.numAttrs); + CCCLRT_CHECK(expectedConfig.gridDimX == config.gridDimX); + CCCLRT_CHECK(expectedConfig.gridDimY == config.gridDimY); + CCCLRT_CHECK(expectedConfig.gridDimZ == config.gridDimZ); + CCCLRT_CHECK(expectedConfig.blockDimX == config.blockDimX); + CCCLRT_CHECK(expectedConfig.blockDimY == config.blockDimY); + CCCLRT_CHECK(expectedConfig.blockDimZ == config.blockDimZ); + CCCLRT_CHECK(expectedConfig.sharedMemBytes == config.sharedMemBytes); + CCCLRT_CHECK(expectedConfig.hStream == config.hStream); + CCCLRT_CHECK(expectedConfig.numAttrs == config.numAttrs); for (unsigned int i = 0; i < expectedConfig.numAttrs; ++i) { @@ -52,26 +52,26 @@ void test_launch_kernel_replacement(CUlaunchConfig& config, CUfunction kernel, v switch (expectedAttr.id) { case CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION: - CUDAX_CHECK(expectedAttr.value.clusterDim.x == actualAttr.value.clusterDim.x); - CUDAX_CHECK(expectedAttr.value.clusterDim.y == actualAttr.value.clusterDim.y); - CUDAX_CHECK(expectedAttr.value.clusterDim.z == actualAttr.value.clusterDim.z); + CCCLRT_CHECK(expectedAttr.value.clusterDim.x == actualAttr.value.clusterDim.x); + CCCLRT_CHECK(expectedAttr.value.clusterDim.y == actualAttr.value.clusterDim.y); + CCCLRT_CHECK(expectedAttr.value.clusterDim.z == actualAttr.value.clusterDim.z); has_cluster = true; break; case CU_LAUNCH_ATTRIBUTE_COOPERATIVE: - CUDAX_CHECK(expectedAttr.value.cooperative == actualAttr.value.cooperative); + CCCLRT_CHECK(expectedAttr.value.cooperative == actualAttr.value.cooperative); break; case CU_LAUNCH_ATTRIBUTE_PRIORITY: - CUDAX_CHECK(expectedAttr.value.priority == actualAttr.value.priority); + CCCLRT_CHECK(expectedAttr.value.priority == actualAttr.value.priority); break; default: - CUDAX_CHECK(false); + CCCLRT_CHECK(false); break; } break; } } INFO("Searched attribute is " << expectedAttr.id); - CUDAX_CHECK(j != expectedConfig.numAttrs); + CCCLRT_CHECK(j != expectedConfig.numAttrs); } if (!has_cluster || !skip_device_exec(arch_filter, 90>)) @@ -128,7 +128,7 @@ auto configuration_test( SECTION("Simple cooperative launch") { CUlaunchAttribute attrs[2]; - auto config = cudax::make_config(dims, cudax::cooperative_launch()); + auto config = cuda::make_config(dims, cuda::cooperative_launch()); expectedConfig.numAttrs = 1 + HasCluster; expectedConfig.attrs = &attrs[0]; expectedConfig.attrs[0].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; @@ -137,7 +137,7 @@ auto configuration_test( { add_cluster(cluster_dims, expectedConfig.attrs[1]); } - cudax::launch(stream, config, empty_kernel, 0); + cuda::launch(stream, config, empty_kernel, 0); } SECTION("Priority and dynamic smem") @@ -146,7 +146,7 @@ auto configuration_test( constexpr int priority = 42; constexpr int num_ints = 128; auto config = - cudax::make_config(dims, cudax::launch_priority(priority), cudax::dynamic_shared_memory()); + cuda::make_config(dims, cuda::launch_priority(priority), cuda::dynamic_shared_memory()); expectedConfig.sharedMemBytes = num_ints * sizeof(int); expectedConfig.numAttrs = 1 + HasCluster; expectedConfig.attrs = &attrs[0]; @@ -156,7 +156,7 @@ auto configuration_test( { add_cluster(cluster_dims, expectedConfig.attrs[1]); } - cudax::launch(stream, config, empty_kernel, 0); + cuda::launch(stream, config, empty_kernel, 0); } SECTION("Large dynamic smem") @@ -168,7 +168,7 @@ auto configuration_test( int arr[13 * 1024]; }; CUlaunchAttribute attrs[1]; - auto config = cudax::make_config(dims, cudax::dynamic_shared_memory(cudax::non_portable)); + auto config = cuda::make_config(dims, cuda::dynamic_shared_memory(cuda::non_portable)); expectedConfig.sharedMemBytes = sizeof(S); expectedConfig.numAttrs = HasCluster; expectedConfig.attrs = &attrs[0]; @@ -176,7 +176,7 @@ auto configuration_test( { add_cluster(cluster_dims, expectedConfig.attrs[0]); } - cudax::launch(stream, config, empty_kernel, 0); + cuda::launch(stream, config, empty_kernel, 0); } stream.sync(); } @@ -195,21 +195,21 @@ C2H_TEST("Launch configuration", "[launch]") } CUDART(cudaStreamDestroy(stream)); - CUDAX_CHECK(replacementCalled); + CCCLRT_CHECK(replacementCalled); } C2H_TEST("Hierarchy construction in config", "[launch]") { - auto config = cudax::make_config(cuda::grid_dims<2>(), cudax::cooperative_launch()); + auto config = cuda::make_config(cuda::grid_dims<2>(), cuda::cooperative_launch()); static_assert(config.dims.count(cuda::block) == 2); - auto config_larger = cudax::make_config(cuda::grid_dims<2>(), cuda::block_dims(256), cudax::cooperative_launch()); - CUDAX_REQUIRE(config_larger.dims.count(cuda::thread) == 512); + auto config_larger = cuda::make_config(cuda::grid_dims<2>(), cuda::block_dims(256), cuda::cooperative_launch()); + CCCLRT_REQUIRE(config_larger.dims.count(cuda::thread) == 512); - auto config_no_options = cudax::make_config(cuda::grid_dims(2), cuda::block_dims<128>()); - CUDAX_REQUIRE(config_no_options.dims.count(cuda::thread) == 256); + auto config_no_options = cuda::make_config(cuda::grid_dims(2), cuda::block_dims<128>()); + CCCLRT_REQUIRE(config_no_options.dims.count(cuda::thread) == 256); - [[maybe_unused]] auto config_no_dims = cudax::make_config(cudax::cooperative_launch()); + [[maybe_unused]] auto config_no_dims = cuda::make_config(cuda::cooperative_launch()); static_assert(cuda::std::is_same_v); } @@ -220,33 +220,32 @@ C2H_TEST("Configuration combine", "[launch]") auto block = cuda::block_dims(256); SECTION("Combine with no overlap") { - auto config_part1 = cudax::make_config(grid); - auto config_part2 = cudax::make_config(block, cudax::launch_priority(2)); + auto config_part1 = cuda::make_config(grid); + auto config_part2 = cuda::make_config(block, cuda::launch_priority(2)); auto combined = config_part1.combine(config_part2); [[maybe_unused]] auto combined_other_way = config_part2.combine(config_part1); - [[maybe_unused]] auto combined_with_empty = combined.combine(cudax::make_config()); - [[maybe_unused]] auto empty_with_combined = cudax::make_config().combine(combined); + [[maybe_unused]] auto combined_with_empty = combined.combine(cuda::make_config()); + [[maybe_unused]] auto empty_with_combined = cuda::make_config().combine(combined); static_assert( - cuda::std::is_same_v); + cuda::std::is_same_v); static_assert(cuda::std::is_same_v); static_assert(cuda::std::is_same_v); static_assert(cuda::std::is_same_v); - CUDAX_REQUIRE(combined.dims.count(cuda::thread) == 512); + CCCLRT_REQUIRE(combined.dims.count(cuda::thread) == 512); } SECTION("Combine with overlap") { - auto config_part1 = make_config(grid, cluster, cudax::launch_priority(2)); - auto config_part2 = make_config(cuda::cluster_dims<256>, block, cudax::launch_priority(42)); + auto config_part1 = make_config(grid, cluster, cuda::launch_priority(2)); + auto config_part2 = make_config(cuda::cluster_dims<256>(), block, cuda::launch_priority(42)); auto combined = config_part1.combine(config_part2); - CUDAX_REQUIRE(combined.dims.count(cuda::thread) == 2048); - CUDAX_REQUIRE(cuda::std::get<0>(combined.options).priority == 2); + CCCLRT_REQUIRE(combined.dims.count(cuda::thread) == 2048); + CCCLRT_REQUIRE(cuda::std::get<0>(combined.options).priority == 2); - auto replaced_one_option = cudax::make_config(cudax::launch_priority(3)).combine(combined); - CUDAX_REQUIRE(replaced_one_option.dims.count(cuda::thread) == 2048); - CUDAX_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3); + auto replaced_one_option = cuda::make_config(cuda::launch_priority(3)).combine(combined); + CCCLRT_REQUIRE(replaced_one_option.dims.count(cuda::thread) == 2048); + CCCLRT_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3); - [[maybe_unused]] auto combined_with_extra_option = - combined.combine(cudax::make_config(cudax::cooperative_launch())); + [[maybe_unused]] auto combined_with_extra_option = combined.combine(cuda::make_config(cuda::cooperative_launch())); static_assert(cuda::std::is_same_v); static_assert(cuda::std::tuple_size_v == 2); } diff --git a/cudax/test/launch/dynamic_shared_memory.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu similarity index 68% rename from cudax/test/launch/dynamic_shared_memory.cu rename to libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu index 1cce086a97e..c0eaab93c05 100644 --- a/cudax/test/launch/dynamic_shared_memory.cu +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu @@ -10,14 +10,13 @@ #include #include +#include #include #include #include #include #include -#include - #include template @@ -26,16 +25,16 @@ struct TestKernel template __device__ void operator()(const Config& config) { - static_assert(cuda::std::is_same_v); - static_assert(noexcept(cudax::device::dynamic_shared_memory_view(config))); + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cuda::dynamic_shared_memory_view(config))); - write_smem(cudax::device::dynamic_shared_memory_view(config)); + write_smem(cuda::dynamic_shared_memory_view(config)); } __device__ void write_smem(T& view) { view = T{}; - CUDAX_REQUIRE(view == T{}); + CCCLRT_REQUIRE_DEVICE(view == T{}); } template @@ -44,7 +43,7 @@ struct TestKernel for (cuda::std::size_t i = 0; i < view.size(); ++i) { view[i] = T{}; - CUDAX_REQUIRE(view[i] == T{}); + CCCLRT_REQUIRE_DEVICE(view[i] == T{}); } } }; @@ -55,16 +54,16 @@ void test_opt_and_launch(cuda::stream_ref stream, Opt opt) static_assert(cuda::std::is_same_v); static_assert(cuda::std::is_same_v); - const auto config = cudax::make_config(cuda::block_dims<1, 1>(), cuda::grid_dims<1, 1>(), opt); - cudax::launch(stream, config, TestKernel{}); + const auto config = cuda::make_config(cuda::block_dims<1, 1>(), cuda::grid_dims<1, 1>(), opt); + cuda::launch(stream, config, TestKernel{}); stream.sync(); } template void test_ref(cuda::stream_ref stream) { - static_assert(noexcept(cudax::dynamic_shared_memory())); - test_opt_and_launch(stream, cudax::dynamic_shared_memory()); + static_assert(noexcept(cuda::dynamic_shared_memory())); + test_opt_and_launch(stream, cuda::dynamic_shared_memory()); } void test_ref(cuda::stream_ref stream) @@ -78,11 +77,11 @@ void test_ref(cuda::stream_ref stream) template void test_span(cuda::stream_ref stream) { - static_assert(!noexcept(cudax::dynamic_shared_memory(N))); - test_opt_and_launch>(stream, cudax::dynamic_shared_memory(N)); + static_assert(!noexcept(cuda::dynamic_shared_memory(N * 1024 * 1024))); + test_opt_and_launch>(stream, cuda::dynamic_shared_memory(N)); - static_assert(noexcept(cudax::dynamic_shared_memory())); - test_opt_and_launch>(stream, cudax::dynamic_shared_memory()); + static_assert(noexcept(cuda::dynamic_shared_memory())); + test_opt_and_launch>(stream, cuda::dynamic_shared_memory()); } void test_span(cuda::stream_ref stream) diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu new file mode 100644 index 00000000000..f42e0a0b977 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu @@ -0,0 +1,341 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include + +#include +#include + +#if !_CCCL_CUDA_COMPILER(CLANG) + +__managed__ bool kernel_run_proof = false; + +void check_kernel_run(cudaStream_t stream) +{ + CUDART(cudaStreamSynchronize(stream)); + CCCLRT_CHECK(kernel_run_proof); + kernel_run_proof = false; +} + +struct kernel_run_proof_check +{ + __device__ void operator()() + { + CCCLRT_CHECK_DEVICE(kernel_run_proof); + kernel_run_proof = false; + } +}; + +struct functor_int_argument +{ + __device__ void operator()(int dummy) + { + kernel_run_proof = true; + } +}; + +template +struct functor_taking_config +{ + template + __device__ void operator()(Config config, int grid_size) + { + static_assert(config.dims.static_count(cuda::thread, cuda::block) == BlockSize); + CCCLRT_REQUIRE_DEVICE(config.dims.count(cuda::block, cuda::grid) == grid_size); + kernel_run_proof = true; + } +}; + +__global__ void kernel_no_arguments() +{ + kernel_run_proof = true; +} + +__global__ void kernel_int_argument(int dummy) +{ + kernel_run_proof = true; +} + +template +__global__ void kernel_taking_config(Config config, int grid_size) +{ + functor_taking_config()(config, grid_size); +} + +struct my_dynamic_smem_t +{ + int i; +}; + +template +struct dynamic_smem_single +{ + template + __device__ void operator()(Config config) + { + decltype(auto) dynamic_smem = cuda::dynamic_shared_memory_view(config); + static_assert(::cuda::std::is_same_v); + CCCLRT_REQUIRE_DEVICE(::cuda::device::is_object_from(dynamic_smem, ::cuda::device::address_space::shared)); + kernel_run_proof = true; + } +}; + +template +struct dynamic_smem_span +{ + template + __device__ void operator()(Config config, int size) + { + auto dynamic_smem = cuda::dynamic_shared_memory_view(config); + static_assert(decltype(dynamic_smem)::extent == Extent); + static_assert(::cuda::std::is_same_v); + CCCLRT_REQUIRE_DEVICE(dynamic_smem.size() == size); + CCCLRT_REQUIRE_DEVICE(::cuda::device::is_object_from(dynamic_smem[1], ::cuda::device::address_space::shared)); + kernel_run_proof = true; + } +}; + +struct launch_transform_to_int_convertible +{ + int value_; + + struct int_convertible + { + cudaStream_t stream_; + int value_; + + int_convertible(cudaStream_t stream, int value) noexcept + : stream_(stream) + , value_(value) + { + // Check that the constructor runs before the kernel is launched + // Disabled for now because we don't handle it with graphs + // CUDAX_CHECK_FALSE(kernel_run_proof); + } + + // Immovable to ensure that device_transform doesn't copy the returned + // object + int_convertible(int_convertible&&) noexcept = delete; + + ~int_convertible() noexcept + { + // Check that the destructor runs after the kernel is launched + // Disabled for now because we don't handle it with graphs + // CUDART(cudaStreamSynchronize(stream_)); + // CCCLRT_CHECK(kernel_run_proof); + } + + // This is the value that will be passed to the kernel + int transformed_argument() const + { + return value_; + } + }; + + [[nodiscard]] friend int_convertible + transform_device_argument(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept + { + return int_convertible(stream.get(), self.value_); + } +}; + +// Needs a separate function for Windows extended lambda +void launch_smoke_test(cudaStream_t dst) +{ + cuda::__ensure_current_context guard(cuda::device_ref{0}); + // Use raw stream to make sure it can be implicitly converted on call to + // launch + cudaStream_t stream; + + CUDART(cudaStreamCreate(&stream)); + // Spell out all overloads to make sure they compile, include a check for + // implicit conversions + { + const int grid_size = 4; + constexpr int block_size = 256; + auto dimensions = cuda::make_hierarchy(cuda::grid_dims(grid_size), cuda::block_dims<256>()); + auto config = cuda::make_config(dimensions); + + // Not taking dims + { + cuda::launch(dst, config, kernel_no_arguments); + check_kernel_run(dst); + + const int dummy = 1; + cuda::launch(dst, config, kernel_int_argument, dummy); + check_kernel_run(dst); + cuda::launch(dst, config, kernel_int_argument, 1); + check_kernel_run(dst); + cuda::launch(dst, config, kernel_int_argument, launch_transform_to_int_convertible{1}); + check_kernel_run(dst); + cuda::launch(dst, config, kernel_int_argument, 1U); + check_kernel_run(dst); + + cuda::launch(dst, config, functor_int_argument(), dummy); + check_kernel_run(dst); + cuda::launch(dst, config, functor_int_argument(), 1); + check_kernel_run(dst); + cuda::launch(dst, config, functor_int_argument(), launch_transform_to_int_convertible{1}); + check_kernel_run(dst); + cuda::launch(dst, config, functor_int_argument(), 1U); + check_kernel_run(dst); + } + + // Config argument + { + auto functor_instance = functor_taking_config(); + auto kernel_instance = kernel_taking_config; + + cuda::launch(dst, config, functor_instance, grid_size); + check_kernel_run(dst); + cuda::launch(dst, config, functor_instance, ::cuda::std::move(grid_size)); + check_kernel_run(dst); + cuda::launch(dst, config, functor_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(dst); + cuda::launch(dst, config, functor_instance, static_cast(grid_size)); + check_kernel_run(dst); + + cuda::launch(dst, config, kernel_instance, grid_size); + check_kernel_run(dst); + cuda::launch(dst, config, kernel_instance, ::cuda::std::move(grid_size)); + check_kernel_run(dst); + cuda::launch(dst, config, kernel_instance, launch_transform_to_int_convertible{grid_size}); + check_kernel_run(dst); + cuda::launch(dst, config, kernel_instance, static_cast(grid_size)); + check_kernel_run(dst); + } + } + + /* Comment out for now until I figure how to enable extended lambda for only this file + // Lambda + { + cuda::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1), + [] __device__(auto config) { + if (config.dims.rank(cuda::thread, cuda::block) == 0) { + printf("Hello from the GPU\n"); + kernel_run_proof = true; + } + }); + check_kernel_run(dst); + } + */ + + // Dynamic shared memory option + { + auto config = cuda::block_dims<32>() & cuda::grid_dims<1>(); + + auto test = [&](const auto& input_config) { + // Single element + { + auto config = input_config.add(cuda::dynamic_shared_memory()); + + cuda::launch(dst, config, dynamic_smem_single()); + check_kernel_run(dst); + } + + // Dynamic span + { + const int size = 2; + auto config = input_config.add(cuda::dynamic_shared_memory(size)); + cuda::launch(dst, config, dynamic_smem_span(), size); + check_kernel_run(dst); + } + + // Static span + { + constexpr int size = 3; + auto config = input_config.add(cuda::dynamic_shared_memory()); + cuda::launch(dst, config, dynamic_smem_span(), size); + check_kernel_run(dst); + } + }; + + test(config); + test(config.add(cuda::cooperative_launch(), cuda::launch_priority(0))); + } +} + +C2H_TEST("Launch smoke stream", "[launch]") +{ + // Use raw stream to make sure it can be implicitly converted on call to + // launch + cudaStream_t stream; + + CUDART(cudaStreamCreate(&stream)); + + launch_smoke_test(stream); + + CUDART(cudaStreamSynchronize(stream)); + CUDART(cudaStreamDestroy(stream)); +} +#endif // !_CCCL_CUDA_COMPILER(CLANG) + +template +struct kernel_with_default_config +{ + DefaultConfig config; + + kernel_with_default_config(DefaultConfig c) + : config(c) + {} + + DefaultConfig default_config() const + { + return config; + } + + template + __device__ void operator()(Config config, ConfigCheckFn check_fn) + { + check_fn(config); + } +}; + +/* Comment out for now until I figure how to enable extended lambda for only this file +void test_default_config() { + cuda::stream stream{cuda::device_ref{0}}; + auto grid = cuda::grid_dims(4); + auto block = cuda::block_dims<256>; + + auto verify_lambda = [] __device__(auto config) { + static_assert(config.dims.count(cuda::thread, cuda::block) == 256); + CCCLRT_REQUIRE(config.dims.count(cuda::block) == 4); + cooperative_groups::this_grid().sync(); + }; + + SECTION("Combine with empty") { + kernel_with_default_config kernel{ + cuda::make_config(block, grid, cuda::cooperative_launch())}; + static_assert(cuda::__is_kernel_config); + static_assert(cuda::__kernel_has_default_config); + + cuda::launch(stream, cuda::make_config(), kernel, verify_lambda); + stream.sync(); + } + SECTION("Combine with no overlap") { + kernel_with_default_config kernel{cuda::make_config(block)}; + cuda::launch(stream, cuda::make_config(grid, cuda::cooperative_launch()), + kernel, verify_lambda); + stream.sync(); + } + SECTION("Combine with overlap") { + kernel_with_default_config kernel{ + cuda::make_config(cuda::block_dims<1>, cuda::cooperative_launch())}; + cuda::launch(stream, + cuda::make_config(block, grid, cuda::cooperative_launch()), + kernel, verify_lambda); + stream.sync(); + } +} + +C2H_TEST("Launch with default config", "") { test_default_config(); } +*/ diff --git a/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu b/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu index 7101ceb2cc5..560850d01b5 100644 --- a/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu +++ b/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu @@ -87,8 +87,7 @@ C2H_CCCLRT_TEST("cuda::buffer launch transform", "[container][buffer]") cuda::device_buffer a = cuda::make_buffer(stream, resource, array); const cuda::device_buffer b = cuda::make_buffer(stream, resource, a.size(), 1); - cuda::experimental::launch( - stream, cuda::experimental::make_config(cuda::grid_dims<1>(), cuda::block_dims<32>()), add_kernel{}, a, b); + cuda::launch(stream, cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<32>()), add_kernel{}, a, b); std::vector host_result(a.size()); cuda::copy_bytes(stream, a, host_result);