From 6bcd8bea15c2912a557536c44b218266ad9f54a3 Mon Sep 17 00:00:00 2001
From: pciolkosz <pciolkosz@nvidia.com>
Date: Thu, 4 Dec 2025 14:42:10 -0800
Subject: [PATCH] Move launch API from cudax to libcu++ (#6667)

* Move launch API from cudax to libcu++

* Review feedback and test fixes

* More fixes

* GCC7 fix

* Update libcudacxx/include/cuda/__launch/configuration.h

* Fix old GCC

---------

Co-authored-by: David Bayer <48736217+davebayer@users.noreply.github.com>
(cherry picked from commit 4ab39a768f23001ae2327093a20b1e303705aa0c)
---
 cudax/examples/simple_p2p.cu                  |   2 +-
 cudax/examples/vector_add.cu                  |   2 +-
 .../cuda/experimental/__execution/bulk.cuh    |   4 +-
 .../cuda/experimental/__execution/queries.cuh |   6 +-
 .../__execution/stream/adaptor.cuh            |   2 +-
 .../cuda/experimental/__launch/launch.cuh     | 192 +++++-----
 cudax/include/cuda/experimental/launch.cuh    |   2 +-
 cudax/test/CMakeLists.txt                     |  10 +-
 cudax/test/common/utility.cuh                 |   6 +-
 cudax/test/containers/uninitialized_buffer.cu |   4 +-
 cudax/test/launch/launch_smoke.cu             |  32 +-
 cudax/test/stream/stream_smoke.cu             |   2 +-
 examples/cudax/vector_add/vector_add.cu       |   2 +-
 .../include/cuda/__launch/configuration.h     | 182 +++++-----
 .../include/cuda/__launch/host_launch.h       |  22 +-
 libcudacxx/include/cuda/__launch/launch.h     | 334 +++++++++++++++++
 libcudacxx/include/cuda/launch                |  28 ++
 .../cuda/ccclrt}/launch/configuration.cu      |  91 +++--
 .../ccclrt}/launch/dynamic_shared_memory.cu   |  29 +-
 .../cuda/ccclrt/launch/launch_smoke.cu        | 341 ++++++++++++++++++
 .../cuda/containers/buffer/transform.cu       |   3 +-
 21 files changed, 974 insertions(+), 322 deletions(-)
 rename cudax/include/cuda/experimental/__launch/configuration.cuh => libcudacxx/include/cuda/__launch/configuration.h (84%)
 create mode 100644 libcudacxx/include/cuda/__launch/launch.h
 create mode 100644 libcudacxx/include/cuda/launch
 rename {cudax/test => libcudacxx/test/libcudacxx/cuda/ccclrt}/launch/configuration.cu (66%)
 rename {cudax/test => libcudacxx/test/libcudacxx/cuda/ccclrt}/launch/dynamic_shared_memory.cu (68%)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu
diff --git a/cudax/examples/simple_p2p.cu b/cudax/examples/simple_p2p.cu
index 050be513a69..42767b3972f 100644
--- a/cudax/examples/simple_p2p.cu
+++ b/cudax/examples/simple_p2p.cu
@@ -130,7 +130,7 @@ void test_cross_device_access_from_kernel(
   dev1_stream.wait(dev0_stream);
 
   // Kernel launch configuration
-  auto config = cudax::distribute<512>(dev0_buffer.size());
+  auto config = cuda::distribute<512>(dev0_buffer.size());
 
   // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing output to the GPU 1 buffer
   printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
diff --git a/cudax/examples/vector_add.cu b/cudax/examples/vector_add.cu
index 7fe938c8b8c..3779c7b7692 100644
--- a/cudax/examples/vector_add.cu
+++ b/cudax/examples/vector_add.cu
@@ -92,7 +92,7 @@ try
 
   // Define the kernel launch parameters
   constexpr int threadsPerBlock = 256;
-  auto config                   = cudax::distribute<threadsPerBlock>(numElements);
+  auto config                   = cuda::distribute<threadsPerBlock>(numElements);
 
   // Launch the vectorAdd kernel
   printf(
diff --git a/cudax/include/cuda/experimental/__execution/bulk.cuh b/cudax/include/cuda/experimental/__execution/bulk.cuh
index 5e09938a514..e243d88e5ea 100644
--- a/cudax/include/cuda/experimental/__execution/bulk.cuh
+++ b/cudax/include/cuda/experimental/__execution/bulk.cuh
@@ -22,6 +22,7 @@
 #endif // no system header
 
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/__launch/configuration.h>
 #include <cuda/__utility/immovable.h>
 #include <cuda/std/__concepts/arithmetic.h>
 #include <cuda/std/__concepts/same_as.h>
@@ -45,7 +46,6 @@
 #include <cuda/experimental/__execution/transform_completion_signatures.cuh>
 #include <cuda/experimental/__execution/transform_sender.cuh>
 #include <cuda/experimental/__execution/type_traits.cuh>
-#include <cuda/experimental/__launch/configuration.cuh>
 
 #include <cuda/experimental/__execution/prologue.cuh>
 
@@ -73,7 +73,7 @@ struct _CCCL_TYPE_VISIBILITY_DEFAULT __attrs_t
   {
     constexpr int __block_threads = 256;
     const int __grid_blocks       = ::cuda::ceil_div(static_cast<int>(__shape), __block_threads);
-    return experimental::make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks));
+    return make_config(block_dims<__block_threads>(), grid_dims(__grid_blocks));
   }
 
   using __launch_config_t = decltype(__get_launch_config(_Shape()));
diff --git a/cudax/include/cuda/experimental/__execution/queries.cuh b/cudax/include/cuda/experimental/__execution/queries.cuh
index eca7b476efc..c0017cf2b62 100644
--- a/cudax/include/cuda/experimental/__execution/queries.cuh
+++ b/cudax/include/cuda/experimental/__execution/queries.cuh
@@ -25,6 +25,7 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH
 #include <cuda/std/__memory/allocator.h>
 _CCCL_SUPPRESS_DEPRECATED_POP
 
+#include <cuda/__launch/configuration.h>
 #include <cuda/std/__execution/env.h>
 #include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/is_callable.h>
@@ -39,7 +40,6 @@ _CCCL_SUPPRESS_DEPRECATED_POP
 #include <cuda/experimental/__execution/stop_token.cuh>
 #include <cuda/experimental/__execution/type_traits.cuh>
 #include <cuda/experimental/__execution/utility.cuh>
-#include <cuda/experimental/__launch/configuration.cuh>
 
 #include <cuda/experimental/__execution/prologue.cuh>
 
@@ -314,14 +314,14 @@ _CCCL_GLOBAL_CONSTANT struct get_forward_progress_guarantee_t
 } get_forward_progress_guarantee{};
 
 // By default, CUDA kernels are launched with a single thread and a single block.
-using __single_threaded_config_base_t = decltype(experimental::make_config(grid_dims<1>(), block_dims<1>()));
+using __single_threaded_config_base_t = decltype(make_config(grid_dims<1>(), block_dims<1>()));
 
 // We hide the complicated type of the default launch configuration so diagnositics are
 // easier to read.
 struct __single_threaded_config_t : __single_threaded_config_base_t
 {
   _CCCL_HOST_API constexpr __single_threaded_config_t() noexcept
-      : __single_threaded_config_base_t{experimental::make_config(grid_dims<1>(), block_dims<1>())}
+      : __single_threaded_config_base_t{make_config(grid_dims<1>(), block_dims<1>())}
   {}
 };
 
diff --git a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
index 00b1ee54b76..7498827acb6 100644
--- a/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
+++ b/cudax/include/cuda/experimental/__execution/stream/adaptor.cuh
@@ -21,6 +21,7 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__launch/configuration.h>
 #include <cuda/__utility/immovable.h>
 #include <cuda/std/__concepts/concept_macros.h>
 #include <cuda/std/__memory/unique_ptr.h>
@@ -36,7 +37,6 @@
 #include <cuda/experimental/__execution/utility.cuh>
 #include <cuda/experimental/__execution/variant.cuh>
 #include <cuda/experimental/__execution/visit.cuh>
-#include <cuda/experimental/__launch/configuration.cuh>
 #include <cuda/experimental/__launch/launch.cuh>
 #include <cuda/experimental/__stream/stream_ref.cuh>
 
diff --git a/cudax/include/cuda/experimental/__launch/launch.cuh b/cudax/include/cuda/experimental/__launch/launch.cuh
index e672ac08213..b5b5cfcddeb 100644
--- a/cudax/include/cuda/experimental/__launch/launch.cuh
+++ b/cudax/include/cuda/experimental/__launch/launch.cuh
@@ -22,6 +22,8 @@
 #endif // no system header
 
 #include <cuda/__driver/driver_api.h>
+#include <cuda/__launch/configuration.h>
+#include <cuda/__launch/launch.h>
 #include <cuda/__stream/device_transform.h>
 #include <cuda/__stream/stream_ref.h>
 #include <cuda/std/__exception/cuda_error.h>
@@ -38,43 +40,21 @@
 #include <cuda/experimental/__graph/graph_node_ref.cuh>
 #include <cuda/experimental/__graph/path_builder.cuh>
 #include <cuda/experimental/__kernel/kernel_ref.cuh>
-#include <cuda/experimental/__launch/configuration.cuh>
 #include <cuda/experimental/__utility/ensure_current_device.cuh>
 
 #include <cuda/std/__cccl/prologue.h>
 
-namespace cuda::experimental
-{
-template <typename _Config, typename _Kernel, class... _Args>
-__global__ static void __kernel_launcher(const _CCCL_GRID_CONSTANT _Config __conf, _Kernel __kernel_fn, _Args... __args)
-{
-  __kernel_fn(__conf, __args...);
-}
-
-template <typename _Kernel, class... _Args>
-__global__ static void __kernel_launcher_no_config(_Kernel __kernel_fn, _Args... __args)
-{
-  __kernel_fn(__args...);
-}
+_CCCL_BEGIN_NAMESPACE_CUDA
 
 template <class... _Args>
-[[nodiscard]] _CCCL_HOST_API CUfunction __get_cufunction_of(kernel_ref<void(_Args...)> __kernel)
+[[nodiscard]] _CCCL_HOST_API ::CUfunction __get_cufunction_of(experimental::kernel_ref<void(_Args...)> __kernel)
 {
   return ::cuda::__driver::__kernelGetFunction(__kernel.get());
 }
 
-template <class... _Args>
-[[nodiscard]] _CCCL_HOST_API ::CUfunction __get_cufunction_of(void (*__kernel)(_Args...))
-{
-  ::cudaFunction_t __kernel_cufunction{};
-  _CCCL_TRY_CUDA_API(
-    ::cudaGetFuncBySymbol, "Failed to get function from symbol", &__kernel_cufunction, (const void*) __kernel);
-  return (CUfunction) __kernel_cufunction;
-}
-
 _CCCL_TEMPLATE(typename _GraphInserter)
-_CCCL_REQUIRES(graph_inserter<_GraphInserter>)
-_CCCL_HOST_API graph_node_ref
+_CCCL_REQUIRES(experimental::graph_inserter<_GraphInserter>)
+_CCCL_HOST_API experimental::graph_node_ref
 __do_launch(_GraphInserter&& __inserter, ::CUlaunchConfig& __config, ::CUfunction __kernel, void** __args_ptrs)
 {
   ::CUDA_KERNEL_NODE_PARAMS __node_params{};
@@ -101,36 +81,44 @@ __do_launch(_GraphInserter&& __inserter, ::CUlaunchConfig& __config, ::CUfunctio
   // TODO skip the update if called on rvalue?
   __inserter.__clear_and_set_dependency_node(__node);
 
-  return graph_node_ref{__node, __inserter.get_graph().get()};
+  return experimental::graph_node_ref{__node, __inserter.get_graph().get()};
 }
 
-_CCCL_HOST_API void inline __do_launch(
-  ::cuda::stream_ref __stream, ::CUlaunchConfig& __config, ::CUfunction __kernel, void** __args_ptrs)
+_CCCL_TEMPLATE(typename _GraphInserter)
+_CCCL_REQUIRES(experimental::graph_inserter<_GraphInserter>)
+_CCCL_HOST_API ::cuda::stream_ref __stream_or_invalid([[maybe_unused]] const _GraphInserter& __inserter)
+{
+  return ::cuda::stream_ref{::cuda::invalid_stream};
+}
+
+_CCCL_TEMPLATE(typename _GraphInserter)
+_CCCL_REQUIRES(experimental::graph_inserter<_GraphInserter>)
+_CCCL_HOST_API _GraphInserter&& __forward_or_cast_to_stream_ref(_GraphInserter&& __inserter)
 {
-  __config.hStream = __stream.get();
-#if defined(_CUDAX_LAUNCH_CONFIG_TEST)
-  test_launch_kernel_replacement(__config, __kernel, __args_ptrs);
-#else // ^^^ _CUDAX_LAUNCH_CONFIG_TEST ^^^ / vvv !_CUDAX_LAUNCH_CONFIG_TEST vvv
-  ::cuda::__driver::__launchKernel(__config, __kernel, __args_ptrs);
-#endif // ^^^ !_CUDAX_LAUNCH_CONFIG_TEST ^^^
+  return ::cuda::std::forward<_GraphInserter>(__inserter);
 }
 
+_CCCL_END_NAMESPACE_CUDA
+
+namespace cuda::experimental
+{
 template <typename... _ExpTypes, typename _Dst, typename _Config>
 _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __kernel, _ExpTypes... __args)
 {
   static_assert(!::cuda::std::is_same_v<decltype(__conf.dims), no_init_t>,
                 "Can't launch a configuration without hierarchy dimensions");
   ::CUlaunchConfig __config{};
-  constexpr bool __has_cluster_level        = has_level<cluster_level, decltype(__conf.dims)>;
-  constexpr unsigned int __num_attrs_needed = __detail::kernel_config_count_attr_space(__conf) + __has_cluster_level;
+  constexpr bool __has_cluster_level = has_level<cluster_level, decltype(__conf.dims)>;
+  constexpr unsigned int __num_attrs_needed =
+    ::cuda::__detail::kernel_config_count_attr_space(__conf) + __has_cluster_level;
   ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 1 : __num_attrs_needed];
   __config.attrs    = &__attrs[0];
   __config.numAttrs = 0;
 
-  ::cudaError_t __status = __detail::apply_kernel_config(__conf, __config, __kernel);
+  ::cudaError_t __status = cuda::__detail::apply_kernel_config(__conf, __config, __kernel);
   if (__status != ::cudaSuccess)
   {
-    __throw_cuda_error(__status, "Failed to prepare a launch configuration");
+    ::cuda::__throw_cuda_error(__status, "Failed to prepare a launch configuration");
   }
 
   __config.gridDimX  = static_cast<unsigned>(__conf.dims.extents(block, grid).x);
@@ -151,33 +139,7 @@ _CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __k
   }
 
   const void* __pArgs[(sizeof...(__args) > 0) ? sizeof...(__args) : 1]{::cuda::std::addressof(__args)...};
-  return __do_launch(::cuda::std::forward<_Dst>(__dst), __config, __kernel, const_cast<void**>(__pArgs));
-}
-
-_CCCL_TEMPLATE(typename _GraphInserter)
-_CCCL_REQUIRES(graph_inserter<_GraphInserter>)
-_CCCL_HOST_API ::cuda::stream_ref __stream_or_invalid([[maybe_unused]] const _GraphInserter& __inserter)
-{
-  return ::cuda::stream_ref{::cuda::invalid_stream};
-}
-
-_CCCL_HOST_API ::cuda::stream_ref inline __stream_or_invalid(::cuda::stream_ref __stream)
-{
-  return __stream;
-}
-
-_CCCL_TEMPLATE(typename _GraphInserter)
-_CCCL_REQUIRES(graph_inserter<_GraphInserter>)
-_CCCL_HOST_API _GraphInserter&& __forward_or_cast_to_stream_ref(_GraphInserter&& __inserter)
-{
-  return ::cuda::std::forward<_GraphInserter>(__inserter);
-}
-
-// cast to stream_ref to avoid instantiating launch_impl for every type convertible to stream_ref
-template <typename _Dummy>
-_CCCL_HOST_API ::cuda::stream_ref __forward_or_cast_to_stream_ref(::cuda::stream_ref __stream)
-{
-  return __stream;
+  return ::cuda::__do_launch(::cuda::std::forward<_Dst>(__dst), __config, __kernel, const_cast<void**>(__pArgs));
 }
 
 template <typename _Submitter>
@@ -186,8 +148,8 @@ _CCCL_CONCEPT work_submitter =
 
 //! @brief Launch a kernel functor with specified configuration and arguments
 //!
-//! Launches a kernel functor object on the specified stream and with specified configuration.
-//! Kernel functor object is a type with __device__ operator().
+//! Launches a kernel functor object on the specified stream and with specified
+//! configuration. Kernel functor object is a type with __device__ operator().
 //! Functor might or might not accept the configuration as its first argument.
 //!
 //! @par Snippet
@@ -197,7 +159,8 @@ _CCCL_CONCEPT work_submitter =
 //!
 //! struct kernel {
 //!     template <typename Configuration>
-//!     __device__ void operator()(Configuration conf, unsigned int thread_to_print) {
+//!     __device__ void operator()(Configuration conf, unsigned int
+//!     thread_to_print) {
 //!         if (conf.dims.rank(cudax::thread, cudax::grid) == thread_to_print) {
 //!             printf("Hello from the GPU\n");
 //!         }
@@ -205,8 +168,9 @@ _CCCL_CONCEPT work_submitter =
 //! };
 //!
 //! void launch_kernel(cuda::stream_ref stream) {
-//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4));
-//!     auto config = cudax::make_config(dims, cudax::launch_cooperative());
+//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(),
+//!     cudax::grid_dims(4)); auto config = cudax::make_config(dims,
+//!     cudax::launch_cooperative());
 //!
 //!     cudax::launch(stream, config, kernel(), 42);
 //! }
@@ -237,34 +201,34 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
                                             kernel_config<_Dimensions, _Config...>,
                                             ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>)
   {
-    auto __launcher =
+    auto __launcher = ::cuda::
       __kernel_launcher<decltype(__combined), _Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>;
-    return __launch_impl(
-      __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)),
+    return ::cuda::experimental::__launch_impl(
+      ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)),
       __combined,
-      __get_cufunction_of(__launcher),
+      ::cuda::__get_cufunction_of(__launcher),
       __combined,
       __kernel,
-      device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...);
+      device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...);
   }
   else
   {
     static_assert(::cuda::std::is_invocable_v<_Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>);
     auto __launcher =
-      __kernel_launcher_no_config<_Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>;
-    return __launch_impl(
-      __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)),
+      ::cuda::__kernel_launcher_no_config<_Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>;
+    return ::cuda::experimental::__launch_impl(
+      ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)),
       __combined,
-      __get_cufunction_of(__launcher),
+      ::cuda::__get_cufunction_of(__launcher),
       __kernel,
-      device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...);
+      device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...);
   }
 }
 
 //! @brief Launch a kernel function with specified configuration and arguments
 //!
-//! Launches a kernel function on the specified stream and with specified configuration.
-//! Kernel function is a function with __global__ annotation.
+//! Launches a kernel function on the specified stream and with specified
+//! configuration. Kernel function is a function with __global__ annotation.
 //! Function might or might not accept the configuration as its first argument.
 //!
 //! @par Snippet
@@ -280,8 +244,9 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //! }
 //!
 //! void launch_kernel(cuda::stream_ref stream) {
-//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4));
-//!     auto config = cudax::make_config(dims, cudax::launch_cooperative());
+//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(),
+//!     cudax::grid_dims(4)); auto config = cudax::make_config(dims,
+//!     cudax::launch_cooperative());
 //!
 //!     cudax::launch(stream, config, kernel<decltype(config)>, 42);
 //! }
@@ -308,12 +273,12 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
                            _ActArgs&&... __args)
 {
   __ensure_current_device __dev_setter{__submitter};
-  return __launch_impl<kernel_config<_Dimensions, _Config...>, _ExpArgs...>(
-    __forward_or_cast_to_stream_ref<_Submitter>(__submitter), //
+  return ::cuda::experimental::__launch_impl<kernel_config<_Dimensions, _Config...>, _ExpArgs...>(
+    ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(__submitter), //
     __conf,
-    __get_cufunction_of(__kernel),
+    ::cuda::__get_cufunction_of(__kernel),
     __conf,
-    device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
+    device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
 }
 
 //! @brief Launch a kernel with specified configuration and arguments
@@ -334,10 +299,12 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //! }
 //!
 //! void launch_kernel(cuda::stream_ref stream) {
-//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4));
-//!     auto config = cudax::make_config(dims, cudax::launch_cooperative());
+//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(),
+//!     cudax::grid_dims(4)); auto config = cudax::make_config(dims,
+//!     cudax::launch_cooperative());
 //!
-//!     cudax::launch(stream, config, cudax::kernel_ref{kernel<decltype(config)}>, 42);
+//!     cudax::launch(stream, config,
+//!     cudax::kernel_ref{kernel<decltype(config)}>, 42);
 //! }
 //! @endcode
 //!
@@ -362,18 +329,18 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
                            _ActArgs&&... __args)
 {
   __ensure_current_device __dev_setter{__submitter};
-  return __launch_impl<kernel_config<_Dimensions, _Config...>, _ExpArgs...>(
-    __forward_or_cast_to_stream_ref<_Submitter>(__submitter), //
+  return ::cuda::experimental::__launch_impl<kernel_config<_Dimensions, _Config...>, _ExpArgs...>(
+    ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(__submitter), //
     __conf,
-    __get_cufunction_of(__kernel),
+    ::cuda::__get_cufunction_of(__kernel),
     __conf,
-    device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
+    device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
 }
 
 //! @brief Launch a kernel function with specified configuration and arguments
 //!
-//! Launches a kernel function on the specified stream and with specified configuration.
-//! Kernel function is a function with __global__ annotation.
+//! Launches a kernel function on the specified stream and with specified
+//! configuration. Kernel function is a function with __global__ annotation.
 //! Function might or might not accept the configuration as its first argument.
 //!
 //! @par Snippet
@@ -389,8 +356,9 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //! }
 //!
 //! void launch_kernel(cuda::stream_ref stream) {
-//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4));
-//!     auto config = cudax::make_config(dims, cudax::launch_cooperative());
+//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(),
+//!     cudax::grid_dims(4)); auto config = cudax::make_config(dims,
+//!     cudax::launch_cooperative());
 //!
 //!     cudax::launch(stream, config, kernel<decltype(config)>, 42);
 //! }
@@ -416,11 +384,11 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
                            _ActArgs&&... __args)
 {
   __ensure_current_device __dev_setter{__submitter};
-  return __launch_impl<_ExpArgs...>(
-    __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), //
+  return ::cuda::experimental::__launch_impl<_ExpArgs...>(
+    ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), //
     __conf,
-    __get_cufunction_of(__kernel),
-    device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
+    ::cuda::__get_cufunction_of(__kernel),
+    device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
 }
 
 //! @brief Launch a kernel with specified configuration and arguments
@@ -441,10 +409,12 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
 //! }
 //!
 //! void launch_kernel(cuda::stream_ref stream) {
-//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(), cudax::grid_dims(4));
-//!     auto config = cudax::make_config(dims, cudax::launch_cooperative());
+//!     auto dims    = cudax::make_hierarchy(cudax::block_dims<128>(),
+//!     cudax::grid_dims(4)); auto config = cudax::make_config(dims,
+//!     cudax::launch_cooperative());
 //!
-//!     cudax::launch(stream, config, cudax::kernel_ref{kernel<decltype(config)>}, 42);
+//!     cudax::launch(stream, config,
+//!     cudax::kernel_ref{kernel<decltype(config)>}, 42);
 //! }
 //! @endcode
 //!
@@ -468,11 +438,11 @@ _CCCL_HOST_API auto launch(_Submitter&& __submitter,
                            _ActArgs&&... __args)
 {
   __ensure_current_device __dev_setter{__submitter};
-  return __launch_impl<_ExpArgs...>(
-    __forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), //
+  return ::cuda::experimental::__launch_impl<_ExpArgs...>(
+    ::cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), //
     __conf,
-    __get_cufunction_of(__kernel),
-    device_transform(__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
+    ::cuda::__get_cufunction_of(__kernel),
+    device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
 }
 
 //
diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh
index 04b4cd04e9e..cf65b9471b7 100644
--- a/cudax/include/cuda/experimental/launch.cuh
+++ b/cudax/include/cuda/experimental/launch.cuh
@@ -11,9 +11,9 @@
 #ifndef __CUDAX_LAUNCH___
 #define __CUDAX_LAUNCH___
 
+#include <cuda/__launch/configuration.h>
 #include <cuda/__stream/device_transform.h>
 
-#include <cuda/experimental/__launch/configuration.cuh>
 #include <cuda/experimental/__launch/launch.cuh>
 #include <cuda/experimental/__launch/param_kind.cuh>
 
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
index c6bd29a333c..00168604f5f 100644
--- a/cudax/test/CMakeLists.txt
+++ b/cudax/test/CMakeLists.txt
@@ -40,15 +40,7 @@ cudax_add_catch2_test(test_target launch
     launch/launch_smoke.cu
 )
 
-cudax_add_catch2_test(test_target launch_configuration
-    launch/configuration.cu
-)
-
-cudax_add_catch2_test(test_target launch_config_dynamic_smem
-    launch/dynamic_shared_memory.cu
-)
-
-cudax_add_catch2_test(test_target execution
+cudax_add_catch2_test(test_target execution ${cudax_target}
     execution/env.cu
     execution/policies/policies.cu
     execution/policies/get_execution_policy.cu
diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh
index 059a8c8d7f0..e8404b164ac 100644
--- a/cudax/test/common/utility.cuh
+++ b/cudax/test/common/utility.cuh
@@ -29,7 +29,7 @@ namespace
 {
 namespace test
 {
-constexpr auto one_thread_dims = cudax::make_config(cuda::block_dims<1>(), cuda::grid_dims<1>());
+constexpr auto one_thread_dims = cuda::make_config(cuda::block_dims<1>(), cuda::grid_dims<1>());
 
 struct _malloc_pinned
 {
@@ -39,13 +39,13 @@ private:
 public:
   explicit _malloc_pinned(std::size_t size)
   {
-    cudax::__ensure_current_device guard(cuda::device_ref{0});
+    cuda::__ensure_current_context guard(cuda::device_ref{0});
     _CCCL_TRY_CUDA_API(::cudaMallocHost, "failed to allocate pinned memory", &pv, size);
   }
 
   ~_malloc_pinned()
   {
-    cudax::__ensure_current_device guard(cuda::device_ref{0});
+    cuda::__ensure_current_context guard(cuda::device_ref{0});
     [[maybe_unused]] auto status = ::cudaFreeHost(pv);
   }
 
diff --git a/cudax/test/containers/uninitialized_buffer.cu b/cudax/test/containers/uninitialized_buffer.cu
index 8850373d472..e3f5df9bda3 100644
--- a/cudax/test/containers/uninitialized_buffer.cu
+++ b/cudax/test/containers/uninitialized_buffer.cu
@@ -241,7 +241,7 @@ C2H_TEST("uninitialized_buffer is usable with cudax::launch", "[container]")
     const int grid_size = 4;
     cudax::uninitialized_buffer<int, ::cuda::mr::device_accessible> buffer{
       cuda::device_default_memory_pool(cuda::device_ref{0}), 1024};
-    auto configuration = cudax::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>());
+    auto configuration = cuda::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>());
 
     cudax::stream stream{cuda::device_ref{0}};
 
@@ -253,7 +253,7 @@ C2H_TEST("uninitialized_buffer is usable with cudax::launch", "[container]")
     const int grid_size = 4;
     const cudax::uninitialized_buffer<int, ::cuda::mr::device_accessible> buffer{
       cuda::device_default_memory_pool(cuda::device_ref{0}), 1024};
-    auto configuration = cudax::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>());
+    auto configuration = cuda::make_config(cuda::grid_dims(grid_size), cuda::block_dims<256>());
 
     cudax::stream stream{cuda::device_ref{0}};
 
diff --git a/cudax/test/launch/launch_smoke.cu b/cudax/test/launch/launch_smoke.cu
index 3e6eb0615fc..fc884bf74de 100644
--- a/cudax/test/launch/launch_smoke.cu
+++ b/cudax/test/launch/launch_smoke.cu
@@ -38,7 +38,7 @@ struct kernel_run_proof_check
 
 void check_kernel_run(cudax::path_builder& pb)
 {
-  cudax::launch(pb, cudax::make_config(cuda::block_dims<1>, cuda::grid_dims<1>), kernel_run_proof_check{});
+  cudax::launch(pb, cuda::make_config(cuda::block_dims<1>, cuda::grid_dims<1>), kernel_run_proof_check{});
 }
 
 struct functor_int_argument
@@ -88,7 +88,7 @@ struct dynamic_smem_single
   template <typename Config>
   __device__ void operator()(Config config)
   {
-    decltype(auto) dynamic_smem = cudax::device::dynamic_shared_memory_view(config);
+    decltype(auto) dynamic_smem = cuda::dynamic_shared_memory_view(config);
     static_assert(::cuda::std::is_same_v<SmemType&, decltype(dynamic_smem)>);
     CUDAX_REQUIRE(::cuda::device::is_object_from(dynamic_smem, ::cuda::device::address_space::shared));
     kernel_run_proof = true;
@@ -101,7 +101,7 @@ struct dynamic_smem_span
   template <typename Config>
   __device__ void operator()(Config config, int size)
   {
-    auto dynamic_smem = cudax::device::dynamic_shared_memory_view(config);
+    auto dynamic_smem = cuda::dynamic_shared_memory_view(config);
     static_assert(decltype(dynamic_smem)::extent == Extent);
     static_assert(::cuda::std::is_same_v<SmemType&, decltype(dynamic_smem[1])>);
     CUDAX_REQUIRE(dynamic_smem.size() == size);
@@ -168,7 +168,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst)
     const int grid_size      = 4;
     constexpr int block_size = 256;
     auto dimensions          = cuda::make_hierarchy(cuda::grid_dims(grid_size), cuda::block_dims<256>());
-    auto config              = cudax::make_config(dimensions);
+    auto config              = cuda::make_config(dimensions);
 
     // Not taking dims
     {
@@ -264,7 +264,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst)
     auto test = [&](const auto& input_config) {
       // Single element
       {
-        auto config = input_config.add(cudax::dynamic_shared_memory<my_dynamic_smem_t>());
+        auto config = input_config.add(cuda::dynamic_shared_memory<my_dynamic_smem_t>());
 
         cudax::launch(dst, config, dynamic_smem_single<my_dynamic_smem_t>());
         check_kernel_run(dst);
@@ -273,7 +273,7 @@ void launch_smoke_test(StreamOrPathBuilder& dst)
       // Dynamic span
       {
         const int size = 2;
-        auto config    = input_config.add(cudax::dynamic_shared_memory<my_dynamic_smem_t[]>(size));
+        auto config    = input_config.add(cuda::dynamic_shared_memory<my_dynamic_smem_t[]>(size));
         cudax::launch(dst, config, dynamic_smem_span<my_dynamic_smem_t, ::cuda::std::dynamic_extent>(), size);
         check_kernel_run(dst);
       }
@@ -281,14 +281,14 @@ void launch_smoke_test(StreamOrPathBuilder& dst)
       // Static span
       {
         constexpr int size = 3;
-        auto config        = input_config.add(cudax::dynamic_shared_memory<my_dynamic_smem_t[size]>());
+        auto config        = input_config.add(cuda::dynamic_shared_memory<my_dynamic_smem_t[size]>());
         cudax::launch(dst, config, dynamic_smem_span<my_dynamic_smem_t, size>(), size);
         check_kernel_run(dst);
       }
     };
 
     test(config);
-    test(config.add(cudax::cooperative_launch(), cudax::launch_priority(0)));
+    test(config.add(cuda::cooperative_launch(), cuda::launch_priority(0)));
   }
 }
 
@@ -361,23 +361,23 @@ void test_default_config()
 
   SECTION("Combine with empty")
   {
-    kernel_with_default_config kernel{cudax::make_config(block, grid, cudax::cooperative_launch())};
-    static_assert(cudax::__is_kernel_config<decltype(kernel.default_config())>);
-    static_assert(cudax::__kernel_has_default_config<decltype(kernel)>);
+    kernel_with_default_config kernel{cuda::make_config(block, grid, cuda::cooperative_launch())};
+    static_assert(cuda::__is_kernel_config<decltype(kernel.default_config())>);
+    static_assert(cuda::__kernel_has_default_config<decltype(kernel)>);
 
-    cudax::launch(stream, cudax::make_config(), kernel, verify_lambda);
+    cudax::launch(stream, cuda::make_config(), kernel, verify_lambda);
     stream.sync();
   }
   SECTION("Combine with no overlap")
   {
-    kernel_with_default_config kernel{cudax::make_config(block)};
-    cudax::launch(stream, cudax::make_config(grid, cudax::cooperative_launch()), kernel, verify_lambda);
+    kernel_with_default_config kernel{cuda::make_config(block)};
+    cudax::launch(stream, cuda::make_config(grid, cuda::cooperative_launch()), kernel, verify_lambda);
     stream.sync();
   }
   SECTION("Combine with overlap")
   {
-    kernel_with_default_config kernel{cudax::make_config(cuda::block_dims<1>, cudax::cooperative_launch())};
-    cudax::launch(stream, cudax::make_config(block, grid, cudax::cooperative_launch()), kernel, verify_lambda);
+    kernel_with_default_config kernel{cuda::make_config(cuda::block_dims<1>, cuda::cooperative_launch())};
+    cudax::launch(stream, cuda::make_config(block, grid, cuda::cooperative_launch()), kernel, verify_lambda);
     stream.sync();
   }
 }
diff --git a/cudax/test/stream/stream_smoke.cu b/cudax/test/stream/stream_smoke.cu
index 738d0a37824..734df9fc4f7 100644
--- a/cudax/test/stream/stream_smoke.cu
+++ b/cudax/test/stream/stream_smoke.cu
@@ -28,7 +28,7 @@ C2H_CCCLRT_TEST("Can create a stream and launch work into it", "[stream]")
 
 C2H_CCCLRT_TEST("From native handle", "[stream]")
 {
-  cudax::__ensure_current_device guard(cuda::device_ref{0});
+  cuda::__ensure_current_context guard(cuda::device_ref{0});
   cudaStream_t handle;
   CUDART(cudaStreamCreate(&handle));
   {
diff --git a/examples/cudax/vector_add/vector_add.cu b/examples/cudax/vector_add/vector_add.cu
index 7fe938c8b8c..3779c7b7692 100644
--- a/examples/cudax/vector_add/vector_add.cu
+++ b/examples/cudax/vector_add/vector_add.cu
@@ -92,7 +92,7 @@ try
 
   // Define the kernel launch parameters
   constexpr int threadsPerBlock = 256;
-  auto config                   = cudax::distribute<threadsPerBlock>(numElements);
+  auto config                   = cuda::distribute<threadsPerBlock>(numElements);
 
   // Launch the vectorAdd kernel
   printf(
diff --git a/cudax/include/cuda/experimental/__launch/configuration.cuh b/libcudacxx/include/cuda/__launch/configuration.h
similarity index 84%
rename from cudax/include/cuda/experimental/__launch/configuration.cuh
rename to libcudacxx/include/cuda/__launch/configuration.h
index ddb9cf6b1bd..d9b52872138 100644
--- a/cudax/include/cuda/experimental/__launch/configuration.cuh
+++ b/libcudacxx/include/cuda/__launch/configuration.h
@@ -1,6 +1,6 @@
 //===----------------------------------------------------------------------===//
 //
-// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// Part of libcu++, the C++ Standard Library for your entire system,
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -8,28 +8,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _CUDAX__LAUNCH_CONFIGURATION_CUH
-#define _CUDAX__LAUNCH_CONFIGURATION_CUH
+#ifndef _CUDA___LAUNCH_CONFIGURATION_H
+#define _CUDA___LAUNCH_CONFIGURATION_H
 
-#include <cuda/__driver/driver_api.h>
-#include <cuda/__hierarchy/hierarchy_dimensions.h>
-#include <cuda/__numeric/overflow_cast.h>
-#include <cuda/__ptx/instructions/get_sreg.h>
-#include <cuda/std/__cstddef/types.h>
-#include <cuda/std/__type_traits/is_const.h>
-#include <cuda/std/__type_traits/is_reference.h>
-#include <cuda/std/__type_traits/is_unbounded_array.h>
-#include <cuda/std/__type_traits/rank.h>
-#include <cuda/std/span>
-#include <cuda/std/tuple>
+#include <cuda/std/detail/__config>
 
-#include <cuda/experimental/__detail/utility.cuh>
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
 
-#include <cuda/std/__cccl/prologue.h>
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+#  include <cuda/__driver/driver_api.h>
+#  include <cuda/__hierarchy/hierarchy_dimensions.h>
+#  include <cuda/__numeric/overflow_cast.h>
+#  include <cuda/__ptx/instructions/get_sreg.h>
+#  include <cuda/std/__cstddef/types.h>
+#  include <cuda/std/__type_traits/is_const.h>
+#  include <cuda/std/__type_traits/is_reference.h>
+#  include <cuda/std/__type_traits/is_unbounded_array.h>
+#  include <cuda/std/__type_traits/rank.h>
+#  include <cuda/std/span>
+#  include <cuda/std/tuple>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
 
-#if _CCCL_STD_VER >= 2017
-namespace cuda::experimental
-{
 template <typename Dimensions, typename... Options>
 struct kernel_config;
 
@@ -103,7 +112,7 @@ inline constexpr bool no_duplicate_options<Option, Rest...> =
  *
  * @par Snippet
  * @code
- * #include <cudax/launch.cuh>
+ * #include <cuda/launch>
  * #include <cooperative_groups.h>
  *
  * template <typename Configuration>
@@ -114,10 +123,10 @@ inline constexpr bool no_duplicate_options<Option, Rest...> =
  * }
  *
  * void kernel_launch(cuda::stream_ref stream) {
- *     auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4));
- *     auto conf = cudax::make_configuration(dims, cooperative_launch());
+ *     auto dims = cuda::make_hierarchy(cuda::block<128>(), cuda::grid(4));
+ *     auto conf = cuda::make_configuration(dims, cooperative_launch());
  *
- *     cudax::launch(stream, conf, kernel);
+ *     cuda::launch(stream, conf, kernel);
  * }
  * @endcode
  */
@@ -188,8 +197,8 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
  *
  * When launch configuration contains this option, that configuration can be
  * then passed to dynamic_shared_memory_view to get the view_type over the
- * dynamic shared memory. It is also possible to obtain that memory through the
- * original extern __shared__ variable[] declaration.
+ * dynamic shared memory. It is also possible to obtain that memory through
+ * the original extern __shared__ variable[] declaration.
  *
  * CUDA guarantees that each device has at least 48kB of shared memory
  * per block, but most devices have more than that.
@@ -199,21 +208,21 @@ inline constexpr ::cuda::std::size_t __max_portable_dyn_smem_size = 48 * 1024;
  *
  * @par Snippet
  * @code
- * #include <cudax/launch.cuh>
+ * #include <cuda/launch>
  *
  * template <typename Configuration>
  * __global__ void kernel(Configuration conf)
  * {
- *     auto dynamic_shared = cudax::dynamic_shared_memory_view(conf);
+ *     auto dynamic_shared = cuda::dynamic_shared_memory_view(conf);
  *     dynamic_shared[0] = 1;
  * }
  *
  * void kernel_launch(cuda::stream_ref stream) {
- *     auto dims = cudax::make_hierarchy(cudax::block<128>(), cudax::grid(4));
- *     auto conf = cudax::make_configuration(dims,
+ *     auto dims = cuda::make_hierarchy(cuda::block<128>(), cuda::grid(4));
+ *     auto conf = cuda::make_configuration(dims,
  * dynamic_shared_memory<int[128]>());
  *
- *     cudax::launch(stream, conf, kernel);
+ *     cuda::launch(stream, conf, kernel);
  * }
  * @endcode
  * @par
@@ -246,10 +255,10 @@ class _CCCL_DECLSPEC_EMPTY_BASES dynamic_shared_memory
   bool __non_portable_{}; //!< \c true if the object was created with
                           //!< non_portable flag.
 
-  using typename __base_type::value_type; //!< Value type of the dynamic shared
-                                          //!< memory elements.
+  using typename __base_type::value_type; //!< Value type of the dynamic
+                                          //!< shared memory elements.
   using typename __base_type::view_type; //!< The view type returned by the
-                                         //!< cuda::device::dynamic_shared_memory_view(config).
+                                         //!< cuda::dynamic_shared_memory_view(config).
 
   static constexpr bool is_relevant_on_device        = true;
   static constexpr __detail::launch_option_kind kind = __detail::launch_option_kind::dynamic_shared_memory;
@@ -327,8 +336,8 @@ template <class _Tp>
 {
   ::cudaError_t __status = ::cudaSuccess;
 
-  // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires the
-  // function to be loaded.
+  // Since CUDA 12.4, querying CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES requires
+  // the function to be loaded.
   if (::cuda::__driver::__version_at_least(12, 4))
   {
     __status = ::cuda::__driver::__functionLoadNoThrow(__kernel);
@@ -452,11 +461,11 @@ _CCCL_CONCEPT __kernel_has_default_config =
 /**
  * @brief Type describing a kernel launch configuration
  *
- * This type should not be constructed directly and make_config helper function
- * should be used instead
+ * This type should not be constructed directly and make_config helper
+ * function should be used instead
  *
  * @tparam Dimensions
- * cuda::experimental::hierarchy_dimensions instance that describes dimensions
+ * cuda::hierarchy_dimensions instance that describes dimensions
  * of thread hierarchy in this configuration object
  *
  * @tparam Options
@@ -501,13 +510,13 @@ struct kernel_config
    * and the configuration from argument. It contains dimensions that are
    * combination of dimensions in this object and the other configuration. The
    * resulting hierarchy holds levels present in both hierarchies. In case of
-   * overlap of levels hierarchy from this configuration is prioritized, so the
-   * result always holds all levels from this hierarchy and non-overlapping
-   * levels from the other hierarchy. This behavior is the same as `combine()`
-   * member function of the hierarchy type. The result also contains
-   * configuration options from both configurations. In case the same type of a
-   * configuration option is present in both configuration this configuration is
-   * copied into the resulting configuration.
+   * overlap of levels hierarchy from this configuration is prioritized, so
+   * the result always holds all levels from this hierarchy and
+   * non-overlapping levels from the other hierarchy. This behavior is the
+   * same as `combine()` member function of the hierarchy type. The result
+   * also contains configuration options from both configurations. In case the
+   * same type of a configuration option is present in both configuration this
+   * configuration is copied into the resulting configuration.
    *
    * @param __other_config
    * Other configuration to combine with this configuration
@@ -527,12 +536,12 @@ struct kernel_config
    * functor
    *
    * Returns a new `kernel_config` that is a combination of this configuration
-   * and a default configuration from the kernel argument. Default configuration
-   * is a `kernel_config` object returned from `default_config()` member
-   * function of the kernel type. The configurations are combined using the
-   * `combine()` member function of this configuration. If the kernel has no
-   * default configuration, a copy of this configuration is returned without any
-   * changes.
+   * and a default configuration from the kernel argument. Default
+   * configuration is a `kernel_config` object returned from
+   * `default_config()` member function of the kernel type. The configurations
+   * are combined using the `combine()` member function of this configuration.
+   * If the kernel has no default configuration, a copy of this configuration
+   * is returned without any changes.
    *
    * @param __kernel
    * Kernel functor to search for the default configuration
@@ -550,22 +559,19 @@ struct kernel_config
     }
   }
 };
-} // namespace cuda::experimental
-
-_CCCL_BEGIN_NAMESPACE_CUDA
 
 // We can consider removing the operator&, but its convenient for in-line
 // construction
 template <typename Dimensions, typename... Options, typename NewLevel>
 _CCCL_HOST_API constexpr auto
-operator&(const experimental::kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
+operator&(const kernel_config<Dimensions, Options...>& config, const NewLevel& new_level) noexcept
 {
   return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
 }
 
 template <typename NewLevel, typename Dimensions, typename... Options>
 _CCCL_HOST_API constexpr auto
-operator&(const NewLevel& new_level, const experimental::kernel_config<Dimensions, Options...>& config) noexcept
+operator&(const NewLevel& new_level, const kernel_config<Dimensions, Options...>& config) noexcept
 {
   return kernel_config(hierarchy_add_level(config.dims, new_level), config.options);
 }
@@ -574,13 +580,9 @@ template <typename L1, typename Dims1, typename L2, typename Dims2>
 _CCCL_HOST_API constexpr auto
 operator&(const level_dimensions<L1, Dims1>& l1, const level_dimensions<L2, Dims2>& l2) noexcept
 {
-  return experimental::kernel_config(cuda::make_hierarchy(l1, l2));
+  return kernel_config(::cuda::make_hierarchy(l1, l2));
 }
 
-_CCCL_END_NAMESPACE_CUDA
-
-namespace cuda::experimental
-{
 template <typename _Dimensions, typename... _Options>
 auto __make_config_from_tuple(const _Dimensions& __dims, const ::cuda::std::tuple<_Options...>& __opts)
 {
@@ -634,8 +636,8 @@ make_config(const hierarchy_dimensions<BottomUnit, Levels...>& dims, const Opts&
  *
  * @par Snippet
  * @code
- * #include <cudax/hierarchy_dimensions.cuh>
- * using namespace cuda::experimental;
+ * #include <cuda/hierarchy_dimensions.cuh>
+ * using namespace cuda;
  *
  * constexpr int threadsPerBlock = 256;
  * auto dims = distribute<threadsPerBlock>(numElements);
@@ -706,49 +708,30 @@ inline unsigned int constexpr kernel_config_count_attr_space(const kernel_config
   return (0 + ... + Options::needs_attribute_space);
 }
 
-template <typename Dimensions, typename... Options>
-[[nodiscard]] cudaError_t apply_kernel_config(
-  const kernel_config<Dimensions, Options...>& config, cudaLaunchConfig_t& cuda_config, void* kernel) noexcept
-{
-  cudaError_t status = cudaSuccess;
-
-  ::cuda::std::apply(
-    [&](auto&... config_options) {
-      // Use short-cutting && to skip the rest on error, is this too
-      // convoluted?
-      (void) (... && [&](cudaError_t call_status) {
-        status = call_status;
-        return call_status == cudaSuccess;
-      }(config_options.apply(cuda_config, kernel)));
-    },
-    config.options);
-
-  return status;
-}
-
 template <typename Dimensions, typename... Options>
 [[nodiscard]] cudaError_t apply_kernel_config(
   const kernel_config<Dimensions, Options...>& config, CUlaunchConfig& cuda_config, CUfunction kernel) noexcept
 {
-  cudaError_t status = cudaSuccess;
-
-  ::cuda::std::apply(
+  return ::cuda::std::apply(
     [&](auto&... config_options) {
+      cudaError_t __status = cudaSuccess;
+
       // Use short-cutting && to skip the rest on error, is this too
       // convoluted?
-      (void) (... && [&](cudaError_t call_status) {
-        status = call_status;
+      // For some reason gcc 7 complains about __status capture, so we pass it as a reference
+      (void) (... && [](cudaError_t call_status, cudaError_t& __status_out) {
+        __status_out = call_status;
         return call_status == cudaSuccess;
-      }(::cuda::experimental::__apply_launch_option(config_options, cuda_config, kernel)));
+      }(::cuda::__apply_launch_option(config_options, cuda_config, kernel), __status));
+
+      return __status;
     },
     config.options);
-
-  return status;
 }
 } // namespace __detail
 
-namespace device
-{
+#  if _CCCL_CUDA_COMPILATION()
+
 template <class _Dims, class... _Opts>
 _CCCL_DEVICE_API decltype(auto) dynamic_shared_memory_view(const kernel_config<_Dims, _Opts...>& __config) noexcept
 {
@@ -759,10 +742,13 @@ _CCCL_DEVICE_API decltype(auto) dynamic_shared_memory_view(const kernel_config<_
   extern __shared__ unsigned char __cccl_device_dyn_smem[];
   return __opt.__make_view(reinterpret_cast<typename _Opt::value_type*>(__cccl_device_dyn_smem));
 }
-} // namespace device
-} // namespace cuda::experimental
-#endif // _CCCL_STD_VER >= 2017
 
-#include <cuda/std/__cccl/epilogue.h>
+#  endif // _CCCL_CUDA_COMPILATION()
+
+_CCCL_END_NAMESPACE_CUDA
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
-#endif // _CUDAX__LAUNCH_CONFIGURATION_CUH
+#endif // _CUDA___LAUNCH_CONFIGURATION_H
diff --git a/libcudacxx/include/cuda/__launch/host_launch.h b/libcudacxx/include/cuda/__launch/host_launch.h
index cf44f020d0e..7c700bfa42e 100644
--- a/libcudacxx/include/cuda/__launch/host_launch.h
+++ b/libcudacxx/include/cuda/__launch/host_launch.h
@@ -21,15 +21,17 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__driver/driver_api.h>
-#include <cuda/__stream/stream_ref.h>
-#include <cuda/std/__functional/reference_wrapper.h>
-#include <cuda/std/__type_traits/decay.h>
-#include <cuda/std/__type_traits/is_move_constructible.h>
-#include <cuda/std/__utility/forward.h>
-#include <cuda/std/tuple>
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
-#include <cuda/std/__cccl/prologue.h>
+#  include <cuda/__driver/driver_api.h>
+#  include <cuda/__stream/stream_ref.h>
+#  include <cuda/std/__functional/reference_wrapper.h>
+#  include <cuda/std/__type_traits/decay.h>
+#  include <cuda/std/__type_traits/is_move_constructible.h>
+#  include <cuda/std/__utility/forward.h>
+#  include <cuda/std/tuple>
+
+#  include <cuda/std/__cccl/prologue.h>
 
 _CCCL_BEGIN_NAMESPACE_CUDA
 
@@ -106,6 +108,8 @@ _CCCL_HOST_API void host_launch(stream_ref __stream, ::cuda::std::reference_wrap
 }
 _CCCL_END_NAMESPACE_CUDA
 
-#include <cuda/std/__cccl/epilogue.h>
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
 
 #endif // !_CUDA___LAUNCH_HOST_LAUNCH_H
diff --git a/libcudacxx/include/cuda/__launch/launch.h b/libcudacxx/include/cuda/__launch/launch.h
new file mode 100644
index 00000000000..a9066e00b91
--- /dev/null
+++ b/libcudacxx/include/cuda/__launch/launch.h
@@ -0,0 +1,334 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA___LAUNCH_LAUNCH_H
+#define _CUDA___LAUNCH_LAUNCH_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#if _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+#  include <cuda/__driver/driver_api.h>
+#  include <cuda/__launch/configuration.h>
+#  include <cuda/__runtime/api_wrapper.h>
+#  include <cuda/__runtime/ensure_current_context.h>
+#  include <cuda/__stream/device_transform.h>
+#  include <cuda/__stream/stream_ref.h>
+#  include <cuda/std/__exception/cuda_error.h>
+#  include <cuda/std/__type_traits/is_function.h>
+#  include <cuda/std/__type_traits/is_pointer.h>
+#  include <cuda/std/__type_traits/type_identity.h>
+#  include <cuda/std/__utility/forward.h>
+#  include <cuda/std/__utility/pod_tuple.h>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+#  if _CCCL_CUDA_COMPILATION()
+
+template <typename _Config, typename _Kernel, class... _Args>
+__global__ static void __kernel_launcher(const _CCCL_GRID_CONSTANT _Config __conf, _Kernel __kernel_fn, _Args... __args)
+{
+  __kernel_fn(__conf, __args...);
+}
+
+template <typename _Kernel, class... _Args>
+__global__ static void __kernel_launcher_no_config(_Kernel __kernel_fn, _Args... __args)
+{
+  __kernel_fn(__args...);
+}
+
+#  endif // _CCCL_CUDA_COMPILATION()
+
+template <class... _Args>
+[[nodiscard]] _CCCL_HOST_API ::CUfunction __get_cufunction_of(void (*__kernel)(_Args...))
+{
+  ::cudaFunction_t __kernel_cufunction{};
+  _CCCL_TRY_CUDA_API(
+    ::cudaGetFuncBySymbol, "Failed to get function from symbol", &__kernel_cufunction, (const void*) __kernel);
+  return (::CUfunction) __kernel_cufunction;
+}
+
+_CCCL_HOST_API void inline __do_launch(
+  ::cuda::stream_ref __stream, ::CUlaunchConfig& __config, ::CUfunction __kernel, void** __args_ptrs)
+{
+  __config.hStream = __stream.get();
+#  if defined(_CCCLRT_LAUNCH_CONFIG_TEST)
+  test_launch_kernel_replacement(__config, __kernel, __args_ptrs);
+#  else // ^^^ _CUDAX_LAUNCH_CONFIG_TEST ^^^ / vvv !_CUDAX_LAUNCH_CONFIG_TEST vvv
+  ::cuda::__driver::__launchKernel(__config, __kernel, __args_ptrs);
+#  endif // ^^^ !_CUDAX_LAUNCH_CONFIG_TEST ^^^
+}
+
+template <typename... _ExpTypes, typename _Dst, typename _Config>
+_CCCL_HOST_API auto __launch_impl(_Dst&& __dst, _Config __conf, ::CUfunction __kernel, _ExpTypes... __args)
+{
+  static_assert(!::cuda::std::is_same_v<decltype(__conf.dims), no_init_t>,
+                "Can't launch a configuration without hierarchy dimensions");
+  ::CUlaunchConfig __config{};
+  constexpr bool __has_cluster_level        = has_level<cluster_level, decltype(__conf.dims)>;
+  constexpr unsigned int __num_attrs_needed = __detail::kernel_config_count_attr_space(__conf) + __has_cluster_level;
+  ::CUlaunchAttribute __attrs[__num_attrs_needed == 0 ? 1 : __num_attrs_needed];
+  __config.attrs    = &__attrs[0];
+  __config.numAttrs = 0;
+
+  ::cudaError_t __status = __detail::apply_kernel_config(__conf, __config, __kernel);
+  if (__status != ::cudaSuccess)
+  {
+    ::cuda::__throw_cuda_error(__status, "Failed to prepare a launch configuration");
+  }
+
+  __config.gridDimX  = static_cast<unsigned>(__conf.dims.extents(block, grid).x);
+  __config.gridDimY  = static_cast<unsigned>(__conf.dims.extents(block, grid).y);
+  __config.gridDimZ  = static_cast<unsigned>(__conf.dims.extents(block, grid).z);
+  __config.blockDimX = static_cast<unsigned>(__conf.dims.extents(thread, block).x);
+  __config.blockDimY = static_cast<unsigned>(__conf.dims.extents(thread, block).y);
+  __config.blockDimZ = static_cast<unsigned>(__conf.dims.extents(thread, block).z);
+
+  if constexpr (__has_cluster_level)
+  {
+    ::CUlaunchAttribute __cluster_dims_attr{};
+    __cluster_dims_attr.id                 = ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+    __cluster_dims_attr.value.clusterDim.x = static_cast<unsigned>(__conf.dims.extents(block, cluster).x);
+    __cluster_dims_attr.value.clusterDim.y = static_cast<unsigned>(__conf.dims.extents(block, cluster).y);
+    __cluster_dims_attr.value.clusterDim.z = static_cast<unsigned>(__conf.dims.extents(block, cluster).z);
+    __config.attrs[__config.numAttrs++]    = __cluster_dims_attr;
+  }
+
+  const void* __pArgs[(sizeof...(__args) > 0) ? sizeof...(__args) : 1]{::cuda::std::addressof(__args)...};
+  return ::cuda::__do_launch(::cuda::std::forward<_Dst>(__dst), __config, __kernel, const_cast<void**>(__pArgs));
+}
+
+_CCCL_HOST_API ::cuda::stream_ref inline __stream_or_invalid(::cuda::stream_ref __stream)
+{
+  return __stream;
+}
+
+// cast to stream_ref to avoid instantiating launch_impl for every type
+// convertible to stream_ref
+template <typename _Dummy>
+_CCCL_HOST_API ::cuda::stream_ref __forward_or_cast_to_stream_ref(::cuda::stream_ref __stream)
+{
+  return __stream;
+}
+
+template <typename _Submitter>
+_CCCL_CONCEPT work_submitter = ::cuda::std::is_convertible_v<_Submitter, ::cuda::stream_ref>;
+
+#  if _CCCL_CUDA_COMPILATION()
+
+//! @brief Launch a kernel functor with specified configuration and arguments
+//!
+//! Launches a kernel functor object on the specified stream and with specified
+//! configuration. Kernel functor object is a type with __device__ operator().
+//! Functor might or might not accept the configuration as its first argument.
+//!
+//! @par Snippet
+//! @code
+//! #include <cstdio>
+//! #include <cuda/launch>
+//!
+//! struct kernel {
+//!     template <typename Configuration>
+//!     __device__ void operator()(Configuration conf, unsigned int
+//!     thread_to_print) {
+//!         if (conf.dims.rank(cuda::thread, cuda::grid) == thread_to_print) {
+//!             printf("Hello from the GPU\n");
+//!         }
+//!     }
+//! };
+//!
+//! void launch_kernel(cuda::stream_ref stream) {
+//!     auto dims    = cuda::make_hierarchy(cuda::block_dims<128>(),
+//!     cuda::grid_dims(4)); auto config = cuda::make_config(dims,
+//!     cuda::launch_cooperative());
+//!
+//!     cuda::launch(stream, config, kernel(), 42);
+//! }
+//! @endcode
+//!
+//! @param stream
+//! cuda::stream_ref to launch the kernel into
+//!
+//! @param conf
+//! configuration for this launch
+//!
+//! @param kernel
+//! kernel functor to be launched
+//!
+//! @param args
+//! arguments to be passed into the kernel functor
+_CCCL_TEMPLATE(typename... _Args, typename... _Config, typename _Submitter, typename _Dimensions, typename _Kernel)
+_CCCL_REQUIRES(work_submitter<_Submitter> _CCCL_AND(!::cuda::std::is_pointer_v<_Kernel>)
+                 _CCCL_AND(!::cuda::std::is_function_v<_Kernel>))
+_CCCL_HOST_API auto launch(_Submitter&& __submitter,
+                           const kernel_config<_Dimensions, _Config...>& __conf,
+                           const _Kernel& __kernel,
+                           _Args&&... __args)
+{
+  __ensure_current_context __dev_setter{__submitter};
+  auto __combined = __conf.combine_with_default(__kernel);
+  if constexpr (::cuda::std::is_invocable_v<_Kernel,
+                                            kernel_config<_Dimensions, _Config...>,
+                                            ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>)
+  {
+    auto __launcher =
+      __kernel_launcher<decltype(__combined), _Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>;
+    return __launch_impl(
+      cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)),
+      __combined,
+      ::cuda::__get_cufunction_of(__launcher),
+      __combined,
+      __kernel,
+      device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...);
+  }
+  else
+  {
+    static_assert(::cuda::std::is_invocable_v<_Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>);
+    auto __launcher =
+      __kernel_launcher_no_config<_Kernel, ::cuda::std::decay_t<transformed_device_argument_t<_Args>>...>;
+    return ::cuda::__launch_impl(
+      cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)),
+      __combined,
+      ::cuda::__get_cufunction_of(__launcher),
+      __kernel,
+      device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_Args>(__args))...);
+  }
+}
+
+#  endif // _CCCL_CUDA_COMPILATION()
+
+//! @brief Launch a kernel function with specified configuration and arguments
+//!
+//! Launches a kernel function on the specified stream and with specified
+//! configuration. Kernel function is a function with __global__ annotation.
+//! Function might or might not accept the configuration as its first argument.
+//!
+//! @par Snippet
+//! @code
+//! #include <cstdio>
+//! #include <cuda/launch>
+//!
+//! template <typename Configuration>
+//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
+//!     if (conf.dims.rank(cuda::thread, cuda::grid) == thread_to_print) {
+//!         printf("Hello from the GPU\n");
+//!     }
+//! }
+//!
+//! void launch_kernel(cuda::stream_ref stream) {
+//!     auto dims    = cuda::make_hierarchy(cuda::block_dims<128>(),
+//!     cuda::grid_dims(4)); auto config = cuda::make_config(dims,
+//!     cuda::launch_cooperative());
+//!
+//!     cuda::launch(stream, config, kernel<decltype(config)>, 42);
+//! }
+//! @endcode
+//!
+//! @param stream
+//! cuda::stream_ref to launch the kernel into
+//!
+//! @param conf
+//! configuration for this launch
+//!
+//! @param kernel
+//! kernel function to be launched
+//!
+//! @param args
+//! arguments to be passed into the kernel function
+//!
+_CCCL_TEMPLATE(
+  typename... _ExpArgs, typename... _ActArgs, typename _Submitter, typename... _Config, typename _Dimensions)
+_CCCL_REQUIRES(work_submitter<_Submitter> _CCCL_AND(sizeof...(_ExpArgs) == sizeof...(_ActArgs)))
+_CCCL_HOST_API auto launch(_Submitter&& __submitter,
+                           const kernel_config<_Dimensions, _Config...>& __conf,
+                           void (*__kernel)(kernel_config<_Dimensions, _Config...>, _ExpArgs...),
+                           _ActArgs&&... __args)
+{
+  __ensure_current_context __dev_setter{__submitter};
+  return ::cuda::__launch_impl<kernel_config<_Dimensions, _Config...>,
+                               _ExpArgs...>(
+    cuda::__forward_or_cast_to_stream_ref<_Submitter>(__submitter), //
+    __conf,
+    ::cuda::__get_cufunction_of(__kernel),
+    __conf,
+    device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
+}
+
+//! @brief Launch a kernel function with specified configuration and arguments
+//!
+//! Launches a kernel function on the specified stream and with specified
+//! configuration. Kernel function is a function with __global__ annotation.
+//! Function might or might not accept the configuration as its first argument.
+//!
+//! @par Snippet
+//! @code
+//! #include <cstdio>
+//! #include <cuda/launch>
+//!
+//! template <typename Configuration>
+//! __global__ void kernel(Configuration conf, unsigned int thread_to_print) {
+//!     if (conf.dims.rank(cuda::thread, cuda::grid) == thread_to_print) {
+//!         printf("Hello from the GPU\n");
+//!     }
+//! }
+//!
+//! void launch_kernel(cuda::stream_ref stream) {
+//!     auto dims    = cuda::make_hierarchy(cuda::block_dims<128>(),
+//!     cuda::grid_dims(4)); auto config = cuda::make_config(dims,
+//!     cuda::launch_cooperative());
+//!
+//!     cuda::launch(stream, config, kernel<decltype(config)>, 42);
+//! }
+//! @endcode
+//!
+//! @param __stream
+//! cuda::stream_ref to launch the kernel into
+//!
+//! @param __conf
+//! configuration for this launch
+//!
+//! @param __kernel
+//! kernel function to be launched
+//!
+//! @param __args
+//! arguments to be passed into the kernel function
+_CCCL_TEMPLATE(
+  typename... _ExpArgs, typename... _ActArgs, typename _Submitter, typename... _Config, typename _Dimensions)
+_CCCL_REQUIRES(work_submitter<_Submitter> _CCCL_AND(sizeof...(_ExpArgs) == sizeof...(_ActArgs)))
+_CCCL_HOST_API auto launch(_Submitter&& __submitter,
+                           const kernel_config<_Dimensions, _Config...>& __conf,
+                           void (*__kernel)(_ExpArgs...),
+                           _ActArgs&&... __args)
+{
+  __ensure_current_context __dev_setter{__submitter};
+  return ::cuda::__launch_impl<_ExpArgs...>(
+    cuda::__forward_or_cast_to_stream_ref<_Submitter>(::cuda::std::forward<_Submitter>(__submitter)), //
+    __conf,
+    ::cuda::__get_cufunction_of(__kernel),
+    device_transform(::cuda::__stream_or_invalid(__submitter), ::cuda::std::forward<_ActArgs>(__args))...);
+}
+
+_CCCL_END_NAMESPACE_CUDA
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_CTK() && !_CCCL_COMPILER(NVRTC)
+
+#endif // _CUDA___LAUNCH_LAUNCH_H
diff --git a/libcudacxx/include/cuda/launch b/libcudacxx/include/cuda/launch
new file mode 100644
index 00000000000..5647ee62c47
--- /dev/null
+++ b/libcudacxx/include/cuda/launch
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_LAUNCH
+#define _CUDA_LAUNCH
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__launch/configuration.h>
+#include <cuda/__launch/host_launch.h>
+#include <cuda/__launch/launch.h>
+
+#endif //_CUDA_LAUNCH
diff --git a/cudax/test/launch/configuration.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu
similarity index 66%
rename from cudax/test/launch/configuration.cu
rename to libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu
index 9f171fce2f1..cf7e15183cd 100644
--- a/cudax/test/launch/configuration.cu
+++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/configuration.cu
@@ -17,8 +17,8 @@ void test_launch_kernel_replacement(CUlaunchConfig& config, CUfunction kernel, v
 // matches the expected configuration and calls the original launch kernel
 // function if it does. If the configuration does not match, it will fail the
 // test.
-#define _CUDAX_LAUNCH_CONFIG_TEST
-#include <cuda/experimental/launch.cuh>
+#define _CCCLRT_LAUNCH_CONFIG_TEST
+#include <cuda/launch>
 
 #include <host_device.cuh>
 
@@ -30,15 +30,15 @@ void test_launch_kernel_replacement(CUlaunchConfig& config, CUfunction kernel, v
   replacementCalled = true;
   bool has_cluster  = false;
 
-  CUDAX_CHECK(expectedConfig.gridDimX == config.gridDimX);
-  CUDAX_CHECK(expectedConfig.gridDimY == config.gridDimY);
-  CUDAX_CHECK(expectedConfig.gridDimZ == config.gridDimZ);
-  CUDAX_CHECK(expectedConfig.blockDimX == config.blockDimX);
-  CUDAX_CHECK(expectedConfig.blockDimY == config.blockDimY);
-  CUDAX_CHECK(expectedConfig.blockDimZ == config.blockDimZ);
-  CUDAX_CHECK(expectedConfig.sharedMemBytes == config.sharedMemBytes);
-  CUDAX_CHECK(expectedConfig.hStream == config.hStream);
-  CUDAX_CHECK(expectedConfig.numAttrs == config.numAttrs);
+  CCCLRT_CHECK(expectedConfig.gridDimX == config.gridDimX);
+  CCCLRT_CHECK(expectedConfig.gridDimY == config.gridDimY);
+  CCCLRT_CHECK(expectedConfig.gridDimZ == config.gridDimZ);
+  CCCLRT_CHECK(expectedConfig.blockDimX == config.blockDimX);
+  CCCLRT_CHECK(expectedConfig.blockDimY == config.blockDimY);
+  CCCLRT_CHECK(expectedConfig.blockDimZ == config.blockDimZ);
+  CCCLRT_CHECK(expectedConfig.sharedMemBytes == config.sharedMemBytes);
+  CCCLRT_CHECK(expectedConfig.hStream == config.hStream);
+  CCCLRT_CHECK(expectedConfig.numAttrs == config.numAttrs);
 
   for (unsigned int i = 0; i < expectedConfig.numAttrs; ++i)
   {
@@ -52,26 +52,26 @@ void test_launch_kernel_replacement(CUlaunchConfig& config, CUfunction kernel, v
         switch (expectedAttr.id)
         {
           case CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION:
-            CUDAX_CHECK(expectedAttr.value.clusterDim.x == actualAttr.value.clusterDim.x);
-            CUDAX_CHECK(expectedAttr.value.clusterDim.y == actualAttr.value.clusterDim.y);
-            CUDAX_CHECK(expectedAttr.value.clusterDim.z == actualAttr.value.clusterDim.z);
+            CCCLRT_CHECK(expectedAttr.value.clusterDim.x == actualAttr.value.clusterDim.x);
+            CCCLRT_CHECK(expectedAttr.value.clusterDim.y == actualAttr.value.clusterDim.y);
+            CCCLRT_CHECK(expectedAttr.value.clusterDim.z == actualAttr.value.clusterDim.z);
             has_cluster = true;
             break;
           case CU_LAUNCH_ATTRIBUTE_COOPERATIVE:
-            CUDAX_CHECK(expectedAttr.value.cooperative == actualAttr.value.cooperative);
+            CCCLRT_CHECK(expectedAttr.value.cooperative == actualAttr.value.cooperative);
             break;
           case CU_LAUNCH_ATTRIBUTE_PRIORITY:
-            CUDAX_CHECK(expectedAttr.value.priority == actualAttr.value.priority);
+            CCCLRT_CHECK(expectedAttr.value.priority == actualAttr.value.priority);
             break;
           default:
-            CUDAX_CHECK(false);
+            CCCLRT_CHECK(false);
             break;
         }
         break;
       }
     }
     INFO("Searched attribute is " << expectedAttr.id);
-    CUDAX_CHECK(j != expectedConfig.numAttrs);
+    CCCLRT_CHECK(j != expectedConfig.numAttrs);
   }
 
   if (!has_cluster || !skip_device_exec(arch_filter<std::less<int>, 90>))
@@ -128,7 +128,7 @@ auto configuration_test(
   SECTION("Simple cooperative launch")
   {
     CUlaunchAttribute attrs[2];
-    auto config                               = cudax::make_config(dims, cudax::cooperative_launch());
+    auto config                               = cuda::make_config(dims, cuda::cooperative_launch());
     expectedConfig.numAttrs                   = 1 + HasCluster;
     expectedConfig.attrs                      = &attrs[0];
     expectedConfig.attrs[0].id                = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
@@ -137,7 +137,7 @@ auto configuration_test(
     {
       add_cluster(cluster_dims, expectedConfig.attrs[1]);
     }
-    cudax::launch(stream, config, empty_kernel, 0);
+    cuda::launch(stream, config, empty_kernel, 0);
   }
 
   SECTION("Priority and dynamic smem")
@@ -146,7 +146,7 @@ auto configuration_test(
     constexpr int priority = 42;
     constexpr int num_ints = 128;
     auto config =
-      cudax::make_config(dims, cudax::launch_priority(priority), cudax::dynamic_shared_memory<int[num_ints]>());
+      cuda::make_config(dims, cuda::launch_priority(priority), cuda::dynamic_shared_memory<int[num_ints]>());
     expectedConfig.sharedMemBytes          = num_ints * sizeof(int);
     expectedConfig.numAttrs                = 1 + HasCluster;
     expectedConfig.attrs                   = &attrs[0];
@@ -156,7 +156,7 @@ auto configuration_test(
     {
       add_cluster(cluster_dims, expectedConfig.attrs[1]);
     }
-    cudax::launch(stream, config, empty_kernel, 0);
+    cuda::launch(stream, config, empty_kernel, 0);
   }
 
   SECTION("Large dynamic smem")
@@ -168,7 +168,7 @@ auto configuration_test(
       int arr[13 * 1024];
     };
     CUlaunchAttribute attrs[1];
-    auto config                   = cudax::make_config(dims, cudax::dynamic_shared_memory<S>(cudax::non_portable));
+    auto config                   = cuda::make_config(dims, cuda::dynamic_shared_memory<S>(cuda::non_portable));
     expectedConfig.sharedMemBytes = sizeof(S);
     expectedConfig.numAttrs       = HasCluster;
     expectedConfig.attrs          = &attrs[0];
@@ -176,7 +176,7 @@ auto configuration_test(
     {
       add_cluster(cluster_dims, expectedConfig.attrs[0]);
     }
-    cudax::launch(stream, config, empty_kernel, 0);
+    cuda::launch(stream, config, empty_kernel, 0);
   }
   stream.sync();
 }
@@ -195,21 +195,21 @@ C2H_TEST("Launch configuration", "[launch]")
   }
 
   CUDART(cudaStreamDestroy(stream));
-  CUDAX_CHECK(replacementCalled);
+  CCCLRT_CHECK(replacementCalled);
 }
 
 C2H_TEST("Hierarchy construction in config", "[launch]")
 {
-  auto config = cudax::make_config(cuda::grid_dims<2>(), cudax::cooperative_launch());
+  auto config = cuda::make_config(cuda::grid_dims<2>(), cuda::cooperative_launch());
   static_assert(config.dims.count(cuda::block) == 2);
 
-  auto config_larger = cudax::make_config(cuda::grid_dims<2>(), cuda::block_dims(256), cudax::cooperative_launch());
-  CUDAX_REQUIRE(config_larger.dims.count(cuda::thread) == 512);
+  auto config_larger = cuda::make_config(cuda::grid_dims<2>(), cuda::block_dims(256), cuda::cooperative_launch());
+  CCCLRT_REQUIRE(config_larger.dims.count(cuda::thread) == 512);
 
-  auto config_no_options = cudax::make_config(cuda::grid_dims(2), cuda::block_dims<128>());
-  CUDAX_REQUIRE(config_no_options.dims.count(cuda::thread) == 256);
+  auto config_no_options = cuda::make_config(cuda::grid_dims(2), cuda::block_dims<128>());
+  CCCLRT_REQUIRE(config_no_options.dims.count(cuda::thread) == 256);
 
-  [[maybe_unused]] auto config_no_dims = cudax::make_config(cudax::cooperative_launch());
+  [[maybe_unused]] auto config_no_dims = cuda::make_config(cuda::cooperative_launch());
   static_assert(cuda::std::is_same_v<decltype(config_no_dims.dims), cuda::__empty_hierarchy>);
 }
 
@@ -220,33 +220,32 @@ C2H_TEST("Configuration combine", "[launch]")
   auto block   = cuda::block_dims(256);
   SECTION("Combine with no overlap")
   {
-    auto config_part1                         = cudax::make_config(grid);
-    auto config_part2                         = cudax::make_config(block, cudax::launch_priority(2));
+    auto config_part1                         = cuda::make_config(grid);
+    auto config_part2                         = cuda::make_config(block, cuda::launch_priority(2));
     auto combined                             = config_part1.combine(config_part2);
     [[maybe_unused]] auto combined_other_way  = config_part2.combine(config_part1);
-    [[maybe_unused]] auto combined_with_empty = combined.combine(cudax::make_config());
-    [[maybe_unused]] auto empty_with_combined = cudax::make_config().combine(combined);
+    [[maybe_unused]] auto combined_with_empty = combined.combine(cuda::make_config());
+    [[maybe_unused]] auto empty_with_combined = cuda::make_config().combine(combined);
     static_assert(
-      cuda::std::is_same_v<decltype(combined), decltype(cudax::make_config(grid, block, cudax::launch_priority(2)))>);
+      cuda::std::is_same_v<decltype(combined), decltype(cuda::make_config(grid, block, cuda::launch_priority(2)))>);
     static_assert(cuda::std::is_same_v<decltype(combined), decltype(combined_other_way)>);
     static_assert(cuda::std::is_same_v<decltype(combined), decltype(combined_with_empty)>);
     static_assert(cuda::std::is_same_v<decltype(combined), decltype(empty_with_combined)>);
-    CUDAX_REQUIRE(combined.dims.count(cuda::thread) == 512);
+    CCCLRT_REQUIRE(combined.dims.count(cuda::thread) == 512);
   }
   SECTION("Combine with overlap")
   {
-    auto config_part1 = make_config(grid, cluster, cudax::launch_priority(2));
-    auto config_part2 = make_config(cuda::cluster_dims<256>, block, cudax::launch_priority(42));
+    auto config_part1 = make_config(grid, cluster, cuda::launch_priority(2));
+    auto config_part2 = make_config(cuda::cluster_dims<256>(), block, cuda::launch_priority(42));
     auto combined     = config_part1.combine(config_part2);
-    CUDAX_REQUIRE(combined.dims.count(cuda::thread) == 2048);
-    CUDAX_REQUIRE(cuda::std::get<0>(combined.options).priority == 2);
+    CCCLRT_REQUIRE(combined.dims.count(cuda::thread) == 2048);
+    CCCLRT_REQUIRE(cuda::std::get<0>(combined.options).priority == 2);
 
-    auto replaced_one_option = cudax::make_config(cudax::launch_priority(3)).combine(combined);
-    CUDAX_REQUIRE(replaced_one_option.dims.count(cuda::thread) == 2048);
-    CUDAX_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3);
+    auto replaced_one_option = cuda::make_config(cuda::launch_priority(3)).combine(combined);
+    CCCLRT_REQUIRE(replaced_one_option.dims.count(cuda::thread) == 2048);
+    CCCLRT_REQUIRE(cuda::std::get<0>(replaced_one_option.options).priority == 3);
 
-    [[maybe_unused]] auto combined_with_extra_option =
-      combined.combine(cudax::make_config(cudax::cooperative_launch()));
+    [[maybe_unused]] auto combined_with_extra_option = combined.combine(cuda::make_config(cuda::cooperative_launch()));
     static_assert(cuda::std::is_same_v<decltype(combined.dims), decltype(combined_with_extra_option.dims)>);
     static_assert(cuda::std::tuple_size_v<decltype(combined_with_extra_option.options)> == 2);
   }
diff --git a/cudax/test/launch/dynamic_shared_memory.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu
similarity index 68%
rename from cudax/test/launch/dynamic_shared_memory.cu
rename to libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu
index 1cce086a97e..c0eaab93c05 100644
--- a/cudax/test/launch/dynamic_shared_memory.cu
+++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/dynamic_shared_memory.cu
@@ -10,14 +10,13 @@
 
 #include <cuda/devices>
 #include <cuda/hierarchy>
+#include <cuda/launch>
 #include <cuda/std/cstddef>
 #include <cuda/std/functional>
 #include <cuda/std/span>
 #include <cuda/std/type_traits>
 #include <cuda/stream>
 
-#include <cuda/experimental/launch.cuh>
-
 #include <testing.cuh>
 
 template <class T, class View>
@@ -26,16 +25,16 @@ struct TestKernel
   template <class Config>
   __device__ void operator()(const Config& config)
   {
-    static_assert(cuda::std::is_same_v<View, decltype(cudax::device::dynamic_shared_memory_view(config))>);
-    static_assert(noexcept(cudax::device::dynamic_shared_memory_view(config)));
+    static_assert(cuda::std::is_same_v<View, decltype(cuda::dynamic_shared_memory_view(config))>);
+    static_assert(noexcept(cuda::dynamic_shared_memory_view(config)));
 
-    write_smem(cudax::device::dynamic_shared_memory_view(config));
+    write_smem(cuda::dynamic_shared_memory_view(config));
   }
 
   __device__ void write_smem(T& view)
   {
     view = T{};
-    CUDAX_REQUIRE(view == T{});
+    CCCLRT_REQUIRE_DEVICE(view == T{});
   }
 
   template <cuda::std::size_t N>
@@ -44,7 +43,7 @@ struct TestKernel
     for (cuda::std::size_t i = 0; i < view.size(); ++i)
     {
       view[i] = T{};
-      CUDAX_REQUIRE(view[i] == T{});
+      CCCLRT_REQUIRE_DEVICE(view[i] == T{});
     }
   }
 };
@@ -55,16 +54,16 @@ void test_opt_and_launch(cuda::stream_ref stream, Opt opt)
   static_assert(cuda::std::is_same_v<T, typename Opt::value_type>);
   static_assert(cuda::std::is_same_v<View, typename Opt::view_type>);
 
-  const auto config = cudax::make_config(cuda::block_dims<1, 1>(), cuda::grid_dims<1, 1>(), opt);
-  cudax::launch(stream, config, TestKernel<T, View>{});
+  const auto config = cuda::make_config(cuda::block_dims<1, 1>(), cuda::grid_dims<1, 1>(), opt);
+  cuda::launch(stream, config, TestKernel<T, View>{});
   stream.sync();
 }
 
 template <class T>
 void test_ref(cuda::stream_ref stream)
 {
-  static_assert(noexcept(cudax::dynamic_shared_memory<T>()));
-  test_opt_and_launch<T, T&>(stream, cudax::dynamic_shared_memory<T>());
+  static_assert(noexcept(cuda::dynamic_shared_memory<T>()));
+  test_opt_and_launch<T, T&>(stream, cuda::dynamic_shared_memory<T>());
 }
 
 void test_ref(cuda::stream_ref stream)
@@ -78,11 +77,11 @@ void test_ref(cuda::stream_ref stream)
 template <class T, cuda::std::size_t N>
 void test_span(cuda::stream_ref stream)
 {
-  static_assert(!noexcept(cudax::dynamic_shared_memory<T[]>(N)));
-  test_opt_and_launch<T, cuda::std::span<T>>(stream, cudax::dynamic_shared_memory<T[]>(N));
+  static_assert(!noexcept(cuda::dynamic_shared_memory<T[]>(N * 1024 * 1024)));
+  test_opt_and_launch<T, cuda::std::span<T>>(stream, cuda::dynamic_shared_memory<T[]>(N));
 
-  static_assert(noexcept(cudax::dynamic_shared_memory<T[N]>()));
-  test_opt_and_launch<T, cuda::std::span<T, N>>(stream, cudax::dynamic_shared_memory<T[N]>());
+  static_assert(noexcept(cuda::dynamic_shared_memory<T[N]>()));
+  test_opt_and_launch<T, cuda::std::span<T, N>>(stream, cuda::dynamic_shared_memory<T[N]>());
 }
 
 void test_span(cuda::stream_ref stream)
diff --git a/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu
new file mode 100644
index 00000000000..f42e0a0b977
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ccclrt/launch/launch_smoke.cu
@@ -0,0 +1,341 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+#include <cuda/atomic>
+#include <cuda/launch>
+#include <cuda/memory>
+#include <cuda/stream>
+
+#include <cooperative_groups.h>
+#include <testing.cuh>
+
+#if !_CCCL_CUDA_COMPILER(CLANG)
+
+__managed__ bool kernel_run_proof = false;
+
+void check_kernel_run(cudaStream_t stream)
+{
+  CUDART(cudaStreamSynchronize(stream));
+  CCCLRT_CHECK(kernel_run_proof);
+  kernel_run_proof = false;
+}
+
+struct kernel_run_proof_check
+{
+  __device__ void operator()()
+  {
+    CCCLRT_CHECK_DEVICE(kernel_run_proof);
+    kernel_run_proof = false;
+  }
+};
+
+struct functor_int_argument
+{
+  __device__ void operator()(int dummy)
+  {
+    kernel_run_proof = true;
+  }
+};
+
+template <unsigned int BlockSize>
+struct functor_taking_config
+{
+  template <typename Config>
+  __device__ void operator()(Config config, int grid_size)
+  {
+    static_assert(config.dims.static_count(cuda::thread, cuda::block) == BlockSize);
+    CCCLRT_REQUIRE_DEVICE(config.dims.count(cuda::block, cuda::grid) == grid_size);
+    kernel_run_proof = true;
+  }
+};
+
+__global__ void kernel_no_arguments()
+{
+  kernel_run_proof = true;
+}
+
+__global__ void kernel_int_argument(int dummy)
+{
+  kernel_run_proof = true;
+}
+
+template <typename Config, unsigned int BlockSize>
+__global__ void kernel_taking_config(Config config, int grid_size)
+{
+  functor_taking_config<BlockSize>()(config, grid_size);
+}
+
+struct my_dynamic_smem_t
+{
+  int i;
+};
+
+template <typename SmemType>
+struct dynamic_smem_single
+{
+  template <typename Config>
+  __device__ void operator()(Config config)
+  {
+    decltype(auto) dynamic_smem = cuda::dynamic_shared_memory_view(config);
+    static_assert(::cuda::std::is_same_v<SmemType&, decltype(dynamic_smem)>);
+    CCCLRT_REQUIRE_DEVICE(::cuda::device::is_object_from(dynamic_smem, ::cuda::device::address_space::shared));
+    kernel_run_proof = true;
+  }
+};
+
+template <typename SmemType, size_t Extent>
+struct dynamic_smem_span
+{
+  template <typename Config>
+  __device__ void operator()(Config config, int size)
+  {
+    auto dynamic_smem = cuda::dynamic_shared_memory_view(config);
+    static_assert(decltype(dynamic_smem)::extent == Extent);
+    static_assert(::cuda::std::is_same_v<SmemType&, decltype(dynamic_smem[1])>);
+    CCCLRT_REQUIRE_DEVICE(dynamic_smem.size() == size);
+    CCCLRT_REQUIRE_DEVICE(::cuda::device::is_object_from(dynamic_smem[1], ::cuda::device::address_space::shared));
+    kernel_run_proof = true;
+  }
+};
+
+struct launch_transform_to_int_convertible
+{
+  int value_;
+
+  struct int_convertible
+  {
+    cudaStream_t stream_;
+    int value_;
+
+    int_convertible(cudaStream_t stream, int value) noexcept
+        : stream_(stream)
+        , value_(value)
+    {
+      // Check that the constructor runs before the kernel is launched
+      // Disabled for now because we don't handle it with graphs
+      // CUDAX_CHECK_FALSE(kernel_run_proof);
+    }
+
+    // Immovable to ensure that device_transform doesn't copy the returned
+    // object
+    int_convertible(int_convertible&&) noexcept = delete;
+
+    ~int_convertible() noexcept
+    {
+      // Check that the destructor runs after the kernel is launched
+      // Disabled for now because we don't handle it with graphs
+      // CUDART(cudaStreamSynchronize(stream_));
+      // CCCLRT_CHECK(kernel_run_proof);
+    }
+
+    // This is the value that will be passed to the kernel
+    int transformed_argument() const
+    {
+      return value_;
+    }
+  };
+
+  [[nodiscard]] friend int_convertible
+  transform_device_argument(::cuda::stream_ref stream, launch_transform_to_int_convertible self) noexcept
+  {
+    return int_convertible(stream.get(), self.value_);
+  }
+};
+
+// Needs a separate function for Windows extended lambda
+void launch_smoke_test(cudaStream_t dst)
+{
+  cuda::__ensure_current_context guard(cuda::device_ref{0});
+  // Use raw stream to make sure it can be implicitly converted on call to
+  // launch
+  cudaStream_t stream;
+
+  CUDART(cudaStreamCreate(&stream));
+  // Spell out all overloads to make sure they compile, include a check for
+  // implicit conversions
+  {
+    const int grid_size      = 4;
+    constexpr int block_size = 256;
+    auto dimensions          = cuda::make_hierarchy(cuda::grid_dims(grid_size), cuda::block_dims<256>());
+    auto config              = cuda::make_config(dimensions);
+
+    // Not taking dims
+    {
+      cuda::launch(dst, config, kernel_no_arguments);
+      check_kernel_run(dst);
+
+      const int dummy = 1;
+      cuda::launch(dst, config, kernel_int_argument, dummy);
+      check_kernel_run(dst);
+      cuda::launch(dst, config, kernel_int_argument, 1);
+      check_kernel_run(dst);
+      cuda::launch(dst, config, kernel_int_argument, launch_transform_to_int_convertible{1});
+      check_kernel_run(dst);
+      cuda::launch(dst, config, kernel_int_argument, 1U);
+      check_kernel_run(dst);
+
+      cuda::launch(dst, config, functor_int_argument(), dummy);
+      check_kernel_run(dst);
+      cuda::launch(dst, config, functor_int_argument(), 1);
+      check_kernel_run(dst);
+      cuda::launch(dst, config, functor_int_argument(), launch_transform_to_int_convertible{1});
+      check_kernel_run(dst);
+      cuda::launch(dst, config, functor_int_argument(), 1U);
+      check_kernel_run(dst);
+    }
+
+    // Config argument
+    {
+      auto functor_instance = functor_taking_config<block_size>();
+      auto kernel_instance  = kernel_taking_config<decltype(config), block_size>;
+
+      cuda::launch(dst, config, functor_instance, grid_size);
+      check_kernel_run(dst);
+      cuda::launch(dst, config, functor_instance, ::cuda::std::move(grid_size));
+      check_kernel_run(dst);
+      cuda::launch(dst, config, functor_instance, launch_transform_to_int_convertible{grid_size});
+      check_kernel_run(dst);
+      cuda::launch(dst, config, functor_instance, static_cast<unsigned int>(grid_size));
+      check_kernel_run(dst);
+
+      cuda::launch(dst, config, kernel_instance, grid_size);
+      check_kernel_run(dst);
+      cuda::launch(dst, config, kernel_instance, ::cuda::std::move(grid_size));
+      check_kernel_run(dst);
+      cuda::launch(dst, config, kernel_instance, launch_transform_to_int_convertible{grid_size});
+      check_kernel_run(dst);
+      cuda::launch(dst, config, kernel_instance, static_cast<unsigned int>(grid_size));
+      check_kernel_run(dst);
+    }
+  }
+
+  /* Comment out for now until I figure how to enable extended lambda for only this file
+  // Lambda
+  {
+    cuda::launch(dst, cuda::block_dims<256>() & cuda::grid_dims(1),
+                  [] __device__(auto config) {
+                    if (config.dims.rank(cuda::thread, cuda::block) == 0) {
+                      printf("Hello from the GPU\n");
+                      kernel_run_proof = true;
+                    }
+                  });
+    check_kernel_run(dst);
+  }
+  */
+
+  // Dynamic shared memory option
+  {
+    auto config = cuda::block_dims<32>() & cuda::grid_dims<1>();
+
+    auto test = [&](const auto& input_config) {
+      // Single element
+      {
+        auto config = input_config.add(cuda::dynamic_shared_memory<my_dynamic_smem_t>());
+
+        cuda::launch(dst, config, dynamic_smem_single<my_dynamic_smem_t>());
+        check_kernel_run(dst);
+      }
+
+      // Dynamic span
+      {
+        const int size = 2;
+        auto config    = input_config.add(cuda::dynamic_shared_memory<my_dynamic_smem_t[]>(size));
+        cuda::launch(dst, config, dynamic_smem_span<my_dynamic_smem_t, ::cuda::std::dynamic_extent>(), size);
+        check_kernel_run(dst);
+      }
+
+      // Static span
+      {
+        constexpr int size = 3;
+        auto config        = input_config.add(cuda::dynamic_shared_memory<my_dynamic_smem_t[size]>());
+        cuda::launch(dst, config, dynamic_smem_span<my_dynamic_smem_t, size>(), size);
+        check_kernel_run(dst);
+      }
+    };
+
+    test(config);
+    test(config.add(cuda::cooperative_launch(), cuda::launch_priority(0)));
+  }
+}
+
+C2H_TEST("Launch smoke stream", "[launch]")
+{
+  // Use raw stream to make sure it can be implicitly converted on call to
+  // launch
+  cudaStream_t stream;
+
+  CUDART(cudaStreamCreate(&stream));
+
+  launch_smoke_test(stream);
+
+  CUDART(cudaStreamSynchronize(stream));
+  CUDART(cudaStreamDestroy(stream));
+}
+#endif // !_CCCL_CUDA_COMPILER(CLANG)
+
+template <typename DefaultConfig>
+struct kernel_with_default_config
+{
+  DefaultConfig config;
+
+  kernel_with_default_config(DefaultConfig c)
+      : config(c)
+  {}
+
+  DefaultConfig default_config() const
+  {
+    return config;
+  }
+
+  template <typename Config, typename ConfigCheckFn>
+  __device__ void operator()(Config config, ConfigCheckFn check_fn)
+  {
+    check_fn(config);
+  }
+};
+
+/* Comment out for now until I figure how to enable extended lambda for only this file
+void test_default_config() {
+  cuda::stream stream{cuda::device_ref{0}};
+  auto grid = cuda::grid_dims(4);
+  auto block = cuda::block_dims<256>;
+
+  auto verify_lambda = [] __device__(auto config) {
+    static_assert(config.dims.count(cuda::thread, cuda::block) == 256);
+    CCCLRT_REQUIRE(config.dims.count(cuda::block) == 4);
+    cooperative_groups::this_grid().sync();
+  };
+
+  SECTION("Combine with empty") {
+    kernel_with_default_config kernel{
+        cuda::make_config(block, grid, cuda::cooperative_launch())};
+    static_assert(cuda::__is_kernel_config<decltype(kernel.default_config())>);
+    static_assert(cuda::__kernel_has_default_config<decltype(kernel)>);
+
+    cuda::launch(stream, cuda::make_config(), kernel, verify_lambda);
+    stream.sync();
+  }
+  SECTION("Combine with no overlap") {
+    kernel_with_default_config kernel{cuda::make_config(block)};
+    cuda::launch(stream, cuda::make_config(grid, cuda::cooperative_launch()),
+                  kernel, verify_lambda);
+    stream.sync();
+  }
+  SECTION("Combine with overlap") {
+    kernel_with_default_config kernel{
+        cuda::make_config(cuda::block_dims<1>, cuda::cooperative_launch())};
+    cuda::launch(stream,
+                  cuda::make_config(block, grid, cuda::cooperative_launch()),
+                  kernel, verify_lambda);
+    stream.sync();
+  }
+}
+
+C2H_TEST("Launch with default config", "") { test_default_config(); }
+*/
diff --git a/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu b/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu
index 7101ceb2cc5..560850d01b5 100644
--- a/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu
+++ b/libcudacxx/test/libcudacxx/cuda/containers/buffer/transform.cu
@@ -87,8 +87,7 @@ C2H_CCCLRT_TEST("cuda::buffer launch transform", "[container][buffer]")
   cuda::device_buffer<int> a       = cuda::make_buffer<int>(stream, resource, array);
   const cuda::device_buffer<int> b = cuda::make_buffer(stream, resource, a.size(), 1);
 
-  cuda::experimental::launch(
-    stream, cuda::experimental::make_config(cuda::grid_dims<1>(), cuda::block_dims<32>()), add_kernel{}, a, b);
+  cuda::launch(stream, cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<32>()), add_kernel{}, a, b);
 
   std::vector<int> host_result(a.size());
   cuda::copy_bytes(stream, a, host_result);