diff --git a/compiler/rustc_codegen_llvm/src/base.rs b/compiler/rustc_codegen_llvm/src/base.rs
index d00e70638b45a..a75734d0271a5 100644
--- a/compiler/rustc_codegen_llvm/src/base.rs
+++ b/compiler/rustc_codegen_llvm/src/base.rs
@@ -99,7 +99,7 @@ pub(crate) fn compile_codegen_unit(
         .unstable_opts
         .offload
         .iter()
-        .any(|o| matches!(o, Offload::Host(_) | Offload::Test));
+        .any(|o| matches!(o, Offload::Host(_) | Offload::Test | Offload::Args));
     if has_host_offload && !cx.sess().target.is_like_gpu {
         cx.offload_globals.replace(Some(OffloadGlobals::declare(&cx)));
     }
diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
index 402861eda8707..9b2560727c00d 100644
--- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -9,7 +9,7 @@ use rustc_middle::bug;
 use rustc_middle::ty::offload_meta::OffloadMetadata;
 
 use crate::builder::Builder;
-use crate::common::CodegenCx;
+use crate::common::{AsCCharPtr, CodegenCx};
 use crate::llvm::AttributePlace::Function;
 use crate::llvm::{self, Linkage, Type, Value};
 use crate::{SimpleCx, attributes};
@@ -346,8 +346,8 @@ impl KernelArgsTy {
 pub(crate) struct OffloadKernelGlobals<'ll> {
     pub offload_sizes: &'ll llvm::Value,
     pub memtransfer_types: &'ll llvm::Value,
-    pub region_id: &'ll llvm::Value,
-    pub offload_entry: &'ll llvm::Value,
+    pub region_id: Option<&'ll llvm::Value>,
+    pub offload_entry: Option<&'ll llvm::Value>,
 }
 
 fn gen_tgt_data_mappers<'ll>(
@@ -417,6 +417,7 @@ pub(crate) fn gen_define_handling<'ll>(
     metadata: &[OffloadMetadata],
     symbol: String,
     offload_globals: &OffloadGlobals<'ll>,
+    host: bool,
 ) -> OffloadKernelGlobals<'ll> {
     if let Some(entry) = cx.offload_kernel_cache.borrow().get(&symbol) {
         return *entry;
@@ -440,33 +441,38 @@ pub(crate) fn gen_define_handling<'ll>(
 
     // Next: For each function, generate these three entries. A weak constant,
     // the llvm.rodata entry name, and the llvm_offload_entries value
-    let name = format!(".{symbol}.region_id");
-    let initializer = cx.get_const_i8(0);
-    let region_id = add_global(&cx, &name, initializer, WeakAnyLinkage);
-
-    let c_entry_name = CString::new(symbol.clone()).unwrap();
-    let c_val = c_entry_name.as_bytes_with_nul();
-    let offload_entry_name = format!(".offloading.entry_name.{symbol}");
-
-    let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
-    let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
-    llvm::set_alignment(llglobal, Align::ONE);
-    llvm::set_section(llglobal, c".llvm.rodata.offloading");
-
-    let name = format!(".offloading.entry.{symbol}");
-
-    // See the __tgt_offload_entry documentation above.
-    let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);
-
-    let initializer = crate::common::named_struct(offload_entry_ty, &elems);
-    let c_name = CString::new(name).unwrap();
-    let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
-    llvm::set_global_constant(offload_entry, true);
-    llvm::set_linkage(offload_entry, WeakAnyLinkage);
-    llvm::set_initializer(offload_entry, initializer);
-    llvm::set_alignment(offload_entry, Align::EIGHT);
-    let c_section_name = CString::new("llvm_offload_entries").unwrap();
-    llvm::set_section(offload_entry, &c_section_name);
+    let (offload_entry, region_id) = if !host {
+        let name = format!(".{symbol}.region_id");
+        let initializer = cx.get_const_i8(0);
+        let region_id = add_global(&cx, &name, initializer, WeakAnyLinkage);
+
+        let c_entry_name = CString::new(symbol.clone()).unwrap();
+        let c_val = c_entry_name.as_bytes_with_nul();
+        let offload_entry_name = format!(".offloading.entry_name.{symbol}");
+
+        let initializer = crate::common::bytes_in_context(cx.llcx, c_val);
+        let llglobal = add_unnamed_global(&cx, &offload_entry_name, initializer, InternalLinkage);
+        llvm::set_alignment(llglobal, Align::ONE);
+        llvm::set_section(llglobal, c".llvm.rodata.offloading");
+
+        let name = format!(".offloading.entry.{symbol}");
+
+        // See the __tgt_offload_entry documentation above.
+        let elems = TgtOffloadEntry::new(&cx, region_id, llglobal);
+
+        let initializer = crate::common::named_struct(offload_entry_ty, &elems);
+        let c_name = CString::new(name).unwrap();
+        let offload_entry = llvm::add_global(cx.llmod, offload_entry_ty, &c_name);
+        llvm::set_global_constant(offload_entry, true);
+        llvm::set_linkage(offload_entry, WeakAnyLinkage);
+        llvm::set_initializer(offload_entry, initializer);
+        llvm::set_alignment(offload_entry, Align::EIGHT);
+        let c_section_name = CString::new("llvm_offload_entries").unwrap();
+        llvm::set_section(offload_entry, &c_section_name);
+        (Some(offload_entry), Some(region_id))
+    } else {
+        (None, None)
+    };
 
     let result =
         OffloadKernelGlobals { offload_sizes, memtransfer_types, region_id, offload_entry };
@@ -529,13 +535,16 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     types: &[&Type],
     metadata: &[OffloadMetadata],
     offload_globals: &OffloadGlobals<'ll>,
-    offload_dims: &OffloadKernelDims<'ll>,
+    offload_dims: Option<&OffloadKernelDims<'ll>>,
+    host: bool,
+    host_llfn: &'ll Value,
+    host_llty: &'ll Type,
 ) {
     let cx = builder.cx;
     let OffloadKernelGlobals { offload_sizes, offload_entry, memtransfer_types, region_id } =
         offload_data;
-    let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
-        offload_dims;
+    //let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
+    //    offload_dims;
 
     let tgt_decl = offload_globals.launcher_fn;
     let tgt_target_kernel_ty = offload_globals.launcher_ty;
@@ -550,7 +559,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
 
     // FIXME(Sa4dUs): dummy loads are a temp workaround, we should find a proper way to prevent these
     // variables from being optimized away
-    for val in [offload_sizes, offload_entry] {
+    let to_keep: &[&llvm::Value] = if let Some(offload_entry) = offload_entry {
+        &[offload_sizes, offload_entry]
+    } else {
+        &[offload_sizes]
+    };
+    for val in to_keep {
         unsafe {
             let dummy = llvm::LLVMBuildLoad2(
                 &builder.llbuilder,
@@ -686,27 +700,53 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
         num_args,
         s_ident_t,
     );
-    let values =
-        KernelArgsTy::new(&cx, num_args, memtransfer_types, geps, workgroup_dims, thread_dims);
-
-    // Step 3)
-    // Here we fill the KernelArgsTy, see the documentation above
-    for (i, value) in values.iter().enumerate() {
-        let ptr = builder.inbounds_gep(tgt_kernel_decl, a5, &[i32_0, cx.get_const_i32(i as u64)]);
-        builder.store(value.1, ptr, value.0);
-    }
-    let args = vec![
-        s_ident_t,
-        // FIXME(offload) give users a way to select which GPU to use.
-        cx.get_const_i64(u64::MAX), // MAX == -1.
-        num_workgroups,
-        threads_per_block,
-        region_id,
-        a5,
-    ];
-    builder.call(tgt_target_kernel_ty, None, None, tgt_decl, &args, None, None);
-    // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
+    if host {
+        let fn_name = "omp_get_mapped_ptr";
+        let ty2: &'ll Type = cx.type_func(&[cx.type_ptr(), cx.type_i32()], cx.type_ptr());
+        let mapper_fn = unsafe {
+            llvm::LLVMRustGetOrInsertFunction(
+                builder.llmod,
+                fn_name.as_c_char_ptr(),
+                fn_name.len(),
+                ty2,
+            )
+        };
+
+        let mut device_vals = Vec::with_capacity(vals.len());
+        let device_num = cx.get_const_i32(0);
+        for arg in vals {
+            let device_arg =
+                builder.call(ty2, None, None, mapper_fn, &[arg, device_num], None, None);
+            device_vals.push(device_arg);
+        }
+        builder.call(host_llty, None, None, host_llfn, &device_vals, None, None);
+    } else {
+        let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
+            offload_dims.unwrap();
+        let values =
+            KernelArgsTy::new(&cx, num_args, memtransfer_types, geps, workgroup_dims, thread_dims);
+
+        // Step 3)
+        // Here we fill the KernelArgsTy, see the documentation above
+        for (i, value) in values.iter().enumerate() {
+            let ptr =
+                builder.inbounds_gep(tgt_kernel_decl, a5, &[i32_0, cx.get_const_i32(i as u64)]);
+            builder.store(value.1, ptr, value.0);
+        }
+        // `gen_define_handling` always creates `region_id` when `host` is false, so this cannot fail.
+        let args = vec![
+            s_ident_t,
+            // FIXME(offload) give users a way to select which GPU to use.
+            cx.get_const_i64(u64::MAX), // MAX == -1.
+            num_workgroups,
+            threads_per_block,
+            region_id.unwrap(),
+            a5,
+        ];
+        // %41 = call i32 @__tgt_target_kernel(ptr @1, i64 -1, i32 2097152, i32 256, ptr @.kernel_1.region_id, ptr %kernel_args)
+        builder.call(tgt_target_kernel_ty, None, None, tgt_decl, &args, None, None);
+    }
 
     // Step 4)
     let geps = get_geps(builder, ty, ty2, a1, a2, a4);
diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs
index e035f0809d685..a19d42d62a173 100644
--- a/compiler/rustc_codegen_llvm/src/intrinsic.rs
+++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -217,7 +217,19 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                     let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO);
                 }
 
-                codegen_offload(self, tcx, instance, args);
+                codegen_offload(self, tcx, instance, args, false);
+                return Ok(());
+            }
+            sym::offload_args => {
+                if tcx.sess.opts.unstable_opts.offload.is_empty() {
+                    let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutEnable);
+                }
+
+                if tcx.sess.lto() != rustc_session::config::Lto::Fat {
+                    let _ = tcx.dcx().emit_almost_fatal(OffloadWithoutFatLTO);
+                }
+
+                codegen_offload(self, tcx, instance, args, true);
                 return Ok(());
             }
             sym::is_val_statically_known => {
@@ -1362,6 +1374,7 @@ fn codegen_offload<'ll, 'tcx>(
    tcx: TyCtxt<'tcx>,
     instance: ty::Instance<'tcx>,
     args: &[OperandRef<'tcx, &'ll Value>],
+    host: bool,
 ) {
     let cx = bx.cx;
     let fn_args = instance.args;
@@ -1384,8 +1397,18 @@ fn codegen_offload<'ll, 'tcx>(
         }
     };
 
-    let offload_dims = OffloadKernelDims::from_operands(bx, &args[1], &args[2]);
-    let args = get_args_from_tuple(bx, args[3], fn_target);
+    let llfn = cx.get_fn(fn_target);
+    let (offload_dims, args) = if host {
+        // If we only map arguments to the GPU and otherwise run host code, there is no need to
+        // handle block or thread dimensions.
+        let args = get_args_from_tuple(bx, args[1], fn_target);
+        (None, args)
+    } else {
+        let offload_dims = OffloadKernelDims::from_operands(bx, &args[1], &args[2]);
+        let args = get_args_from_tuple(bx, args[3], fn_target);
+        (Some(offload_dims), args)
+    };
+
     let target_symbol = symbol_name_for_instance_in_crate(tcx, fn_target, LOCAL_CRATE);
     let sig = tcx.fn_sig(fn_target.def_id()).skip_binder();
@@ -1405,8 +1428,24 @@ fn codegen_offload<'ll, 'tcx>(
         }
     };
     register_offload(cx);
-    let offload_data = gen_define_handling(&cx, &metadata, target_symbol, offload_globals);
-    gen_call_handling(bx, &offload_data, &args, &types, &metadata, offload_globals, &offload_dims);
+    let instance = rustc_middle::ty::Instance::mono(tcx, fn_target.def_id());
+    let fn_abi = cx.fn_abi_of_instance(instance, tcx.mk_type_list(&[]));
+    let host_fn_ty = fn_abi.llvm_type(cx);
+
+    let offload_data =
+        gen_define_handling(&cx, &metadata, target_symbol.clone(), offload_globals, host);
+    gen_call_handling(
+        bx,
+        &offload_data,
+        &args,
+        &types,
+        &metadata,
+        offload_globals,
+        offload_dims.as_ref(),
+        host,
+        llfn,
+        host_fn_ty,
+    );
 }
 
 fn get_args_from_tuple<'ll, 'tcx>(
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 22ee490b81a7b..7f6f498c8c1f5 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -165,6 +165,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
         | sym::mul_with_overflow
         | sym::needs_drop
         | sym::offload
+        | sym::offload_args
         | sym::offset_of
         | sym::overflow_checks
         | sym::powf16
@@ -339,6 +340,7 @@ pub(crate) fn check_intrinsic_type(
             ],
             param(2),
         ),
+        sym::offload_args => (3, 0, vec![param(0), param(1)], param(2)),
         sym::offset => (2, 0, vec![param(0), param(1)], param(0)),
         sym::arith_offset => (
             1,
diff --git a/compiler/rustc_monomorphize/src/collector/autodiff.rs b/compiler/rustc_monomorphize/src/collector/autodiff.rs
index e3646596e75e6..cb79d0700c72f 100644
--- a/compiler/rustc_monomorphize/src/collector/autodiff.rs
+++ b/compiler/rustc_monomorphize/src/collector/autodiff.rs
@@ -15,7 +15,10 @@ pub(crate) fn collect_autodiff_fn<'tcx>(
     intrinsic: IntrinsicDef,
     output: &mut MonoItems<'tcx>,
 ) {
-    if intrinsic.name != rustc_span::sym::autodiff {
+    if intrinsic.name != rustc_span::sym::autodiff
+        && intrinsic.name != rustc_span::sym::offload
+        && intrinsic.name != rustc_span::sym::offload_args
+    {
         return;
     };
 
diff --git a/compiler/rustc_session/src/config.rs b/compiler/rustc_session/src/config.rs
index 5e2671ef4ef6b..bc9cd6c70c23b 100644
--- a/compiler/rustc_session/src/config.rs
+++ b/compiler/rustc_session/src/config.rs
@@ -196,6 +196,8 @@ pub enum Offload {
     Device,
     /// Second step in the offload pipeline, generates the host code to call kernels.
     Host(String),
+    /// We only map arguments, but still call host (CPU) code.
+    Args,
     /// Test is similar to Host, but allows testing without a device artifact.
     Test,
 }
diff --git a/compiler/rustc_session/src/options.rs b/compiler/rustc_session/src/options.rs
index 9219b5a7e8aca..2e96116492fe7 100644
--- a/compiler/rustc_session/src/options.rs
+++ b/compiler/rustc_session/src/options.rs
@@ -797,7 +797,7 @@ mod desc {
         "a comma-separated list of strings, with elements beginning with + or -";
     pub(crate) const parse_autodiff: &str = "a comma separated list of settings: `Enable`, `PrintSteps`, `PrintTA`, `PrintTAFn`, `PrintAA`, `PrintPerf`, `PrintModBefore`, `PrintModAfter`, `PrintModFinal`, `PrintPasses`, `NoPostopt`, `LooseTypes`, `Inline`, `NoTT`";
     pub(crate) const parse_offload: &str =
-        "a comma separated list of settings: `Host=`, `Device`, `Test`";
+        "a comma separated list of settings: `Host=`, `Device`, `Test`, `Args`";
     pub(crate) const parse_comma_list: &str = "a comma-separated list of strings";
     pub(crate) const parse_opt_comma_list: &str = parse_comma_list;
     pub(crate) const parse_number: &str = "a number";
@@ -1480,6 +1480,13 @@ pub mod parse {
                     }
                     Offload::Test
                 }
+                "Args" => {
+                    if arg.is_some() {
+                        // Args does not accept a value
+                        return false;
+                    }
+                    Offload::Args
+                }
                 _ => {
                     // FIXME(ZuseZ4): print an error saying which value is not recognized
                     return false;
@@ -2526,10 +2533,12 @@ options! {
     normalize_docs: bool = (false, parse_bool, [TRACKED],
         "normalize associated items in rustdoc when generating documentation"),
     offload: Vec<crate::config::Offload> = (Vec::new(), parse_offload, [TRACKED],
-        "a list of offload flags to enable
-        Mandatory setting:
-        `=Enable` - Currently the only option available"),
+        "a list of offload flags to enable:
+        `=Device`
+        `=Host=<path>`
+        `=Test`
+        `=Args`
+        Multiple options can be combined with commas."),
     on_broken_pipe: OnBrokenPipe = (OnBrokenPipe::Default, parse_on_broken_pipe, [TRACKED],
         "behavior of std::io::ErrorKind::BrokenPipe (SIGPIPE)"),
     osx_rpath_install_name: bool = (false, parse_bool, [TRACKED],
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index b80d624bc4966..fcf7333be36ca 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -1614,6 +1614,7 @@ symbols! {
         of,
         off,
         offload,
+        offload_args,
         offset,
         offset_of,
         offset_of_enum,
diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs
index 051dda731881f..9298012be478b 100644
--- a/library/core/src/intrinsics/mod.rs
+++ b/library/core/src/intrinsics/mod.rs
@@ -3390,6 +3390,72 @@ pub const fn copysignf128(x: f128, y: f128) -> f128;
 #[rustc_intrinsic]
 pub const fn autodiff<F, G, T, R>(f: F, df: G, args: T) -> R;
 
+/// This intrinsic maps the given args from the host (CPU) to a GPU device. It then calls the
+/// given function. Unlike the full `offload` intrinsic, this intrinsic expects a host function, in
+/// which we will replace all usages of the given host args with their device versions. This enables
+/// support for various GPU libraries like `cuBLAS`, `cuDNN`, or `rocBLAS`, which *must* be called
+/// from the host, but expect a mixture of host and device arguments.
+///
+/// Type Parameters:
+/// - `F`: The host function to call. Must be a function item.
+/// - `T`: A tuple of arguments passed to `f`.
+/// - `R`: The return type of `f`.
+///
+/// Arguments:
+/// - `f`: The host function to be called.
+/// - `args`: A tuple of arguments; they will be mapped to the GPU and forwarded to `f`.
+/// +/// Example usage (pseudocode): +/// +/// ```rust,ignore (pseudocode) +/// fn kernel(A: &[f32; 6], x: &[f32; 3], y: &mut [f64; 2]) { +/// core::intrinsics::offload_args(sgemv_wrapper, (A.as_ptr(),x.as_ptr(),y.as_mut_ptr())) +/// } +/// +/// #[cfg(target_os = "linux")] +/// extern "C" { +/// pub fn rocblas_sgemv( +/// alpha: *const f32, +/// A: *const f32, +/// x: *const f32, +/// beta: *const f32, +/// y: *mut f32, +/// ); +/// } +/// +/// #[cfg(not(target_os = "linux"))] +/// fn sgemv_wrapper(A: *const [f32; 6], x: *const [f32; 3], y: *mut [f64; 2]) { +/// // A, x, y were mapped to the GPU by our offload_args intrinsic call above. +/// // As such, trying to access them in this function (and therefore from the CPU) would be +/// // UB. We therefore are using raw pointers instead of references, since they do not point to +/// // a valid (host) memory location. +/// +/// // rocblas expects scalars to be passed as host pointers. +/// let alpha = 1.0; +/// let beta = 1.0; +/// unsafe { +/// rocblas_sgemv( +/// // Host ptr +/// &alpha as *const f32, +/// // Replaced by device ptr +/// A, +/// // Replaced by device ptr +/// x, +/// // Host ptr +/// &beta as *const f32, +/// // Replaced by device ptr +/// y +/// ); +/// } +/// } +/// ``` +/// +/// For reference, see the Clang documentation on offloading: +/// . +#[rustc_nounwind] +#[rustc_intrinsic] +pub const fn offload_args(f: F, args: T) -> R; + /// Generates the LLVM body of a wrapper function to offload a kernel `f`. /// /// Type Parameters: diff --git a/tests/codegen-llvm/gpu_offload/offload_args.rs b/tests/codegen-llvm/gpu_offload/offload_args.rs new file mode 100644 index 0000000000000..8b0e1f15c3eec --- /dev/null +++ b/tests/codegen-llvm/gpu_offload/offload_args.rs @@ -0,0 +1,82 @@ +//@ compile-flags: -Zoffload=Args -Zno-link -Zunstable-options -C opt-level=3 -Clto=fat +//@ no-prefer-dynamic +//@ needs-offload + +// This test is meant to verify that we are able to map cpu argument to a device, and pass those to +// a gpu library like cuBLAS or rocblas. We don't really want to link those libraries in CI, and we +// neither want to deal with the creation or destruction of handles that those require since it's +// just noise. We do however test that we can combine host pointer (like alpha, beta) with device +// pointers (A, x, y). We also test std support while already at it. 
+#![allow(internal_features, non_camel_case_types, non_snake_case)]
+#![feature(rustc_attrs)]
+#![feature(core_intrinsics)]
+
+fn main() {
+    let mut A: [f32; 3 * 2] = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
+    let mut x: [f32; 3] = [1.0, 1.0, 1.0];
+    let mut y: [f32; 2] = [0.0, 0.0];
+    for _ in 0..10 {
+        core::intrinsics::offload_args::<_, _, ()>(rocblas_sgemv_wrapper, (&mut A, &mut x, &mut y));
+        // CHECK-LABEL: ; offload_args::main
+        // CHECK: call void @__tgt_target_data_begin_mapper(
+        // CHECK-NEXT: [[A:%.*]] = call ptr @omp_get_mapped_ptr(ptr nonnull %A, i32 0)
+        // CHECK-NEXT: [[X:%.*]] = call ptr @omp_get_mapped_ptr(ptr nonnull %x, i32 0)
+        // CHECK-NEXT: [[Y:%.*]] = call ptr @omp_get_mapped_ptr(ptr nonnull %y, i32 0)
+        // CHECK-NEXT: ; call offload_args::rocblas_sgemv_wrapper
+        // CHECK-NEXT: call {{.*}}void {{@_RNv.*rocblas_sgemv_wrapper.*}}(ptr [[A]], ptr [[X]], ptr [[Y]])
+        // CHECK-NEXT: call void @__tgt_target_data_end_mapper(
+    }
+    println!("{:?}", y);
+}
+
+unsafe extern "C" {
+    pub fn fake_gpublas_sgemv(
+        m: i32,
+        n: i32,
+        alpha: *const f32,
+        A: *const f32,
+        lda: i32,
+        x: *const f32,
+        incx: i32,
+        beta: *const f32,
+        y: *mut f32,
+        incy: i32,
+    ) -> i32;
+}
+
+#[inline(never)]
+pub fn rocblas_sgemv_wrapper(A: &mut [f32; 6], x: &mut [f32; 3], y: &mut [f32; 2]) {
+    let m: i32 = 2;
+    let n: i32 = 3;
+    let incx: i32 = 1;
+    let incy: i32 = 1;
+    let lda = m;
+    // These two should stay host pointers by default:
+    let alpha: f32 = 1.0;
+    let beta: f32 = 1.0;
+
+    // CHECK-LABEL: ; offload_args::rocblas_sgemv_wrapper
+    // CHECK: define {{.*}}void {{.*}}rocblas_sgemv_wrapper{{.*}}(ptr{{.*}} %A, ptr{{.*}} %x, ptr{{.*}} %y)
+    // CHECK-DAG: %alpha = alloca [4 x i8]
+    // CHECK-DAG: %beta = alloca [4 x i8]
+    // CHECK-DAG: store float 1.000000e+00, ptr %alpha
+    // CHECK-DAG: store float 1.000000e+00, ptr %beta
+    // CHECK: call noundef i32 @fake_gpublas_sgemv(i32 noundef 2, i32 noundef 3, ptr{{.*}} %alpha, ptr{{.*}} %A, i32 noundef 2, ptr{{.*}} %x, i32 noundef 1, ptr{{.*}} %beta, ptr{{.*}} %y, i32 noundef 1)
+
+    unsafe {
+        let st_res = fake_gpublas_sgemv(
+            m,
+            n,
+            &alpha as *const f32,
+            A.as_ptr(),
+            lda,
+            x.as_ptr(),
+            incx,
+            &beta as *const f32,
+            y.as_mut_ptr(),
+            incy,
+        );
+        assert_eq!(st_res, 1);
+    };
+}
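
Reviewer note: the test above checks the emitted IR; the standalone sketch below models the same begin-mapper / `omp_get_mapped_ptr` / host-call / end-mapper sequence that `gen_call_handling` emits in `Args` mode, using plain Rust in place of the offload runtime. `FakeRuntime`, `begin_mapper`, `get_mapped_ptr`, `end_mapper`, and `scale` are invented stand-ins for the libomptarget entry points, not part of this PR.

```rust
use std::collections::HashMap;

/// Stand-in for the offload runtime. `begin_mapper`, `get_mapped_ptr`, and `end_mapper`
/// model `__tgt_target_data_begin_mapper`, `omp_get_mapped_ptr`, and
/// `__tgt_target_data_end_mapper`, respectively.
#[derive(Default)]
struct FakeRuntime {
    /// host base address -> simulated device buffer
    mapped: HashMap<usize, Vec<f32>>,
}

impl FakeRuntime {
    /// Map a host buffer to the "device" (host-to-device copy).
    fn begin_mapper(&mut self, host: &[f32]) {
        self.mapped.insert(host.as_ptr() as usize, host.to_vec());
    }

    /// Translate a host pointer into its device counterpart.
    fn get_mapped_ptr(&mut self, host: *const f32) -> *mut f32 {
        self.mapped.get_mut(&(host as usize)).unwrap().as_mut_ptr()
    }

    /// Copy the device buffer back to the host and unmap it (device-to-host copy).
    fn end_mapper(&mut self, host: &mut [f32]) {
        let dev = self.mapped.remove(&(host.as_ptr() as usize)).unwrap();
        host.copy_from_slice(&dev);
    }
}

/// The "host library function": it is called on the host but only ever sees device pointers.
unsafe fn scale(y: *mut f32, len: usize, factor: f32) {
    for i in 0..len {
        unsafe { *y.add(i) *= factor };
    }
}

fn main() {
    let mut rt = FakeRuntime::default();
    let mut y = [1.0_f32, 2.0, 3.0];

    // `offload_args(scale, (&mut y, ...))` conceptually lowers to this sequence:
    rt.begin_mapper(&y); // 1) map the argument to the device
    let y_dev = rt.get_mapped_ptr(y.as_ptr()); // 2) swap in the device pointer
    unsafe { scale(y_dev, y.len(), 2.0) }; // 3) call the unmodified host function
    rt.end_mapper(&mut y); // 4) copy back and unmap

    assert_eq!(y, [2.0, 4.0, 6.0]);
}
```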