diff --git a/.gitignore b/.gitignore
index 8f0ec505d..5f512c319 100644
--- a/.gitignore
+++ b/.gitignore
@@ -452,6 +452,10 @@ SERIAL/Sparse/sparse
 SERIAL/Stencil/stencil
 SERIAL/Synch_p2p/p2p
 SERIAL/Transpose/transpose
+COBOL/nstream
+COBOL/transpose
+COBOL/p2p
+COBOL/dgemm
 dgemm-vector.dSYM
 dgemm.dSYM
 nstream-opencl.dSYM
diff --git a/C1z/transpose-2d-openacc.c b/C1z/transpose-2d-openacc.c
old mode 100755
new mode 100644
diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu
index 79937d779..f1ce70a55 100644
--- a/Cxx11/dgemm-cublas.cu
+++ b/Cxx11/dgemm-cublas.cu
@@ -182,13 +182,11 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
-  int batches = 0;
-  int input_copy = 0;
+  int iterations, order, batches = 0;
+  bool input_copy = false, random_initialization = false;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>]";
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<random initialization [0/1]>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -213,6 +211,13 @@ int main(int argc, char * argv[])
           throw "ERROR: input_copy was not 0 or 1";
         }
       }
+
+      if (argc > 5) {
+        random_initialization = std::atoi(argv[5]);
+        if (random_initialization != 0 && random_initialization != 1) {
+          throw "ERROR: random_initialization was not 0 or 1";
+        }
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -229,6 +234,7 @@ int main(int argc, char * argv[])
     std::cout << "Batch size      = " << batches << " (batched BLAS)" << std::endl;
   }
   std::cout << "Input copy      = " << (input_copy ? "yes" : "no") << std::endl;
+  std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
@@ -270,10 +276,24 @@ int main(int argc, char * argv[])
         prk::CUDA::copyH2Dasync(&(d_a[b*nelems]), h_a, nelems);
         prk::CUDA::copyH2Dasync(&(d_b[b*nelems]), h_b, nelems);
       }
-      prk::CUDA::sync();
       init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+  } else if (random_initialization) {
+      // Initialize matrices with CURAND uniform distribution [0,1]
+      curandGenerator_t gen;
+      prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+      prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+
+      // Generate uniform random numbers in [0,1] for matrices A and B
+      prk::check( curandGenerateUniformDouble(gen, d_a, matrices * nelems) );
+      prk::check( curandGenerateUniformDouble(gen, d_b, matrices * nelems) );
+
+      prk::check( curandDestroyGenerator(gen) );
+
+      // Initialize matrix C to zero
+      init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+
   } else {
       init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
@@ -346,12 +366,14 @@ int main(int argc, char * argv[])
   }
   residuum /= matrices;
 
-  if (residuum < epsilon) {
+  if (residuum < epsilon || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (!random_initialization) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations/matrices;
     auto nflops = 2.0 * prk::pow(forder,3);
     prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime);
diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h
index 584499379..28e0fa5dc 100644
--- a/Cxx11/prk_cuda.h
+++ b/Cxx11/prk_cuda.h
@@ -15,6 +15,7 @@
 
 #ifdef PRK_USE_CUBLAS
 #include <cublas_v2.h>
+#include <curand.h>
 #endif
 
 //#include
@@ -58,6 +59,14 @@ namespace prk
             std::abort();
         }
     }
+
+    void check(curandStatus_t rc)
+    {
+        if (rc!=CURAND_STATUS_SUCCESS) {
+            std::cerr << "PRK CURAND error: " << rc << std::endl;
+            std::abort();
+        }
+    }
 #endif
 
 namespace CUDA
diff --git a/Cxx11/sgemm-cublas.cu b/Cxx11/sgemm-cublas.cu
index 7883daf90..902ff727e 100644
--- a/Cxx11/sgemm-cublas.cu
+++ b/Cxx11/sgemm-cublas.cu
@@ -62,6 +62,7 @@
 
 #include "prk_util.h"
 #include "prk_cuda.h"
+#include <curand.h>
 
 #if 0
 __global__ void init(unsigned order, float * A, float * B, float * C)
@@ -182,14 +183,12 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
-  int batches = 0;
-  bool input_copy{false};
+  int iterations, order, batches = 0;
+  bool input_copy = false, random_initialization = false;
   bool tf32{false};
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>]";
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>] [<random initialization>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -215,6 +214,10 @@ int main(int argc, char * argv[])
       if (argc > 5) {
         tf32 = prk::parse_boolean(std::string(argv[5]));
       }
+
+      if (argc > 6) {
+        random_initialization = prk::parse_boolean(std::string(argv[6]));
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -232,6 +235,7 @@ int main(int argc, char * argv[])
   }
   std::cout << "Input copy      = " << (input_copy ? "yes" : "no") << std::endl;
   std::cout << "TF32            = " << (tf32 ? "yes" : "no") << std::endl;
+  std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
@@ -281,6 +285,21 @@ int main(int argc, char * argv[])
       init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+  } else if (random_initialization) {
+      // Initialize matrices with CURAND uniform distribution [0,1]
+      curandGenerator_t gen;
+      prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+      prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+
+      // Generate uniform random numbers in [0,1] for matrices A and B
+      prk::check( curandGenerateUniform(gen, d_a, matrices * nelems) );
+      prk::check( curandGenerateUniform(gen, d_b, matrices * nelems) );
+
+      prk::check( curandDestroyGenerator(gen) );
+
+      // Initialize matrix C to zero
+      init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+
   } else {
       init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
@@ -358,12 +377,14 @@ int main(int argc, char * argv[])
   }
   residuum /= matrices;
 
-  if (residuum < epsilon) {
+  if (residuum < epsilon || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (!random_initialization) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations/matrices;
     auto nflops = 2.0 * prk::pow(forder,3);
     std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
diff --git a/Cxx11/transpose-put-mpi.cc b/Cxx11/transpose-put-mpi.cc
new file mode 100644
index 000000000..eb86b0703
--- /dev/null
+++ b/Cxx11/transpose-put-mpi.cc
@@ -0,0 +1,234 @@
+///
+/// Copyright (c) 2020, Intel Corporation
+/// Copyright (c) 2025, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <# iterations> <matrix order> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_mpi.h"
+#include "transpose-kernel.h"
+
+int main(int argc, char * argv[])
+{
+  {
+    prk::MPI::state mpi(&argc,&argv);
+
+    int np = prk::MPI::size();
+    int me = prk::MPI::rank();
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations;
+    size_t order, block_order, tile_size;
+
+    if (me == 0) {
+      std::cout << "Parallel Research Kernels" << std::endl;
+      std::cout << "C++11/MPI Matrix transpose (PUT-based): B = A^T" << std::endl;
+
+      try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <matrix order> [tile size]";
+        }
+
+        iterations = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        order = std::atol(argv[2]);
+        if (order <= 0) {
+          throw "ERROR: Matrix Order must be greater than 0";
+        // } else if (order > prk::get_max_matrix_size()) {
+        //   throw "ERROR: matrix dimension too large - overflow risk";
+        }
+
+        if (order % np != 0) {
+          throw "ERROR: Matrix order must be an integer multiple of the number of MPI processes";
+        }
+
+        // default tile size for tiling of local transpose
+        tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+        // a negative tile size means no tiling of the local transpose
+        if (tile_size <= 0) tile_size = order;
+      }
+      catch (const char * e) {
+        std::cout << e << std::endl;
+        prk::MPI::abort(1);
+        return 1;
+      }
+
+      std::cout << "Number of iterations = " << iterations << std::endl;
+      std::cout << "Matrix order         = " << order << std::endl;
+      std::cout << "Tile size            = " << tile_size << std::endl;
+    }
+
+    prk::MPI::bcast(&iterations);
+    prk::MPI::bcast(&order);
+    prk::MPI::bcast(&tile_size);
+
+    block_order = order / np;
+
+    //std::cout << "@" << me << " order=" << order << " block_order=" << block_order << std::endl;
+
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space for the input and transpose matrix
+    //////////////////////////////////////////////////////////////////////
+
+    double trans_time{0};
+
+    // WA_A = window for A, LA_A = local address for A
+    auto [WA_A,LA_A] = prk::MPI::win_allocate<double>(order * block_order);
+    // WA_B = window for B, LA_B = local address for B (B must be in an MPI window for PUT operations)
+    auto [WA_B,LA_B] = prk::MPI::win_allocate<double>(order * block_order);
+
+    //A[order][block_order]
+    prk::vector<double> A(LA_A, order * block_order, 0.0);
+    prk::vector<double> B(LA_B, order * block_order, 0.0);
+    prk::vector<double> T(block_order * block_order, 0.0);
+
+    // fill A with the sequence 0 to order^2-1 as doubles
+    for (size_t i=0; i <# iterations> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_oshmem.h"
+#include "transpose-kernel.h"
+
+int main(int argc, char * argv[])
+{
+  {
+    prk::SHMEM::state shmem(&argc,&argv);
+
+    int np = prk::SHMEM::size();
+    int me = prk::SHMEM::rank();
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations;
+    size_t order, block_order, tile_size;
+
+    if (me == 0) {
+      std::cout << "Parallel Research Kernels" << std::endl;
+      std::cout << "C++11/SHMEM Matrix transpose (PUT-based): B = A^T" << std::endl;
+
+      try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <matrix order> [tile size]";
+        }
+
+        iterations = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        order = std::atol(argv[2]);
+        if (order <= 0) {
+          throw "ERROR: Matrix Order must be greater than 0";
+        }
+
+        if (order % np != 0) {
+          throw "ERROR: Matrix order must be an integer multiple of the number of SHMEM PEs";
+        }
+
+        // default tile size for tiling of local transpose
+        tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+        // a negative tile size means no tiling of the local transpose
+        if (tile_size <= 0) tile_size = order;
+      }
+      catch (const char * e) {
+        std::cout << e << std::endl;
+        prk::SHMEM::abort(1);
+        return 1;
+      }
+
+      std::cout << "Number of iterations = " << iterations << std::endl;
+      std::cout << "Matrix order         = " << order << std::endl;
+      std::cout << "Tile size            = " << tile_size << std::endl;
+    }
+
+    prk::SHMEM::broadcast(&iterations);
+    prk::SHMEM::broadcast(&order);
+    prk::SHMEM::broadcast(&tile_size);
+
+    block_order = order / np;
+
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space for the input and transpose matrix
+    //////////////////////////////////////////////////////////////////////
+
+    double trans_time{0};
+
+    // A[order][block_order] - source matrix in symmetric memory
+    auto LA = prk::SHMEM::allocate<double>(order * block_order);
+    prk::vector<double> A(LA, order * block_order, 0.0);
+
+    // B[order][block_order] - destination matrix in symmetric memory for PUT operations
+    auto LB = prk::SHMEM::allocate<double>(order * block_order);
+    prk::vector<double> B(LB, order * block_order, 0.0);
+
+    // T[block_order][block_order] - temporary workspace for local transpose
+    prk::vector<double> T(block_order * block_order, 0.0);
+
+    // fill A with the sequence 0 to order^2-1 as doubles
+    for (size_t i=0; i
+template <typename T>
+void curand_init_matrices(T* d_a, T* d_b, size_t nelems) {
+  std::cerr << "No valid CURAND template match for type T" << std::endl;
+  std::abort();
+}
+
+template <>
+void curand_init_matrices<float>(float* d_a, float* d_b, size_t nelems) {
+  curandGenerator_t gen;
+  prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+  prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+  prk::check( curandGenerateUniform(gen, d_a, nelems) );
+  prk::check( curandGenerateUniform(gen, d_b, nelems) );
+  prk::check( curandDestroyGenerator(gen) );
+}
+
+template <>
+void curand_init_matrices<double>(double* d_a, double* d_b, size_t nelems) {
+  curandGenerator_t gen;
+  prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+  prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+  prk::check( curandGenerateUniformDouble(gen, d_a, nelems) );
+  prk::check( curandGenerateUniformDouble(gen, d_b, nelems) );
+  prk::check( curandDestroyGenerator(gen) );
+}
+
+template <>
+void curand_init_matrices<__half>(__half* d_a, __half* d_b, size_t nelems) {
+  // CURAND doesn't directly support __half, so generate float and convert
+  auto temp_a = prk::CUDA::malloc_device<float>(nelems);
+  auto temp_b = prk::CUDA::malloc_device<float>(nelems);
+
+  curandGenerator_t gen;
+  prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+  prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+  prk::check( curandGenerateUniform(gen, temp_a, nelems) );
+  prk::check( curandGenerateUniform(gen, temp_b, nelems) );
+  prk::check( curandDestroyGenerator(gen) );
+
+  // Convert float to __half
+  const int block_size = 256;
+  const int grid_size = (nelems + block_size - 1) / block_size;
+
+  auto convert_kernel = [] __device__ (int idx, float* src, __half* dst, size_t n) {
+      if (idx < n) dst[idx] = __float2half(src[idx]);
+  };
+
+  // Launch conversion kernels
+  thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
+                     [=] __device__ (int idx) { if (idx < nelems) d_a[idx] = __float2half(temp_a[idx]); });
+  thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
+                     [=] __device__ (int idx) { if (idx < nelems) d_b[idx] = __float2half(temp_b[idx]); });
+
+  prk::CUDA::free(temp_a);
+  prk::CUDA::free(temp_b);
+}
+
 template
 void prk_gemm(const cublasHandle_t & h,
               const int order, const TC alpha, const TC beta,
@@ -141,7 +200,7 @@ void prk_gemm(const cublasHandle_t & h,
 }
 
 template <typename T>
-void run(const cublasHandle_t & h, int iterations, int order)
+void run(const cublasHandle_t & h, int iterations, int order, bool random_initialization)
 {
   double gemm_time{0};
@@ -156,7 +215,15 @@ void run(const cublasHandle_t & h, int iterations, int order)
   auto d_a = prk::CUDA::malloc_device<T>(nelems);
   auto d_b = prk::CUDA::malloc_device<T>(nelems);
   auto d_c = prk::CUDA::malloc_device<T>(nelems);
-  init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+
+  if (random_initialization) {
+    // Initialize matrices with CURAND uniform distribution [0,1]
+    curand_init_matrices(d_a, d_b, nelems);
+    // Initialize matrix C to zero
+    init<<<dimGrid, dimBlock>>>(order, d_c);
+  } else {
+    init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+  }
   prk::CUDA::sync();
   {
     for (int iter = 0; iter<=iterations; iter++) {
@@ -190,12 +257,14 @@
   }
   const double residuum = std::abs(checksum - reference) / reference;
   const double epsilon{1.0e-8};
-  if ((residuum < epsilon) || (sizeof(T) < 4)) {
+  if ((residuum < epsilon) || (sizeof(T) < 4) || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (residuum < epsilon) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations;
     auto nflops = 2.0 * prk::pow(forder,3);
     auto is_fp64 = (typeid(T) == typeid(double));
@@ -222,11 +291,11 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
+  int iterations, order;
+  bool random_initialization = false;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order>";
+        throw "Usage: <# iterations> <matrix order> [<random initialization>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -240,6 +309,10 @@ int main(int argc, char * argv[])
       } else if (order > prk::get_max_matrix_size()) {
        throw "ERROR: matrix dimension too large - overflow risk";
      }
+
+      if (argc > 3) {
+        random_initialization = prk::parse_boolean(std::string(argv[3]));
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -250,6 +323,7 @@ int main(int argc, char * argv[])
 
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Randomized data      = " << (random_initialization ? "yes" : "no") << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   /// Setup CUBLAS environment
@@ -257,9 +331,9 @@ int main(int argc, char * argv[])
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
 
-  run<__half>(h, iterations, order);
-  run<float>(h, iterations, order);
-  run<double>(h, iterations, order);
+  run<__half>(h, iterations, order, random_initialization);
+  run<float>(h, iterations, order, random_initialization);
+  run<double>(h, iterations, order, random_initialization);
 
   prk::check( cublasDestroy(h) );
   return 0;
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 0fd828d7c..4419c03ed 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -148,6 +148,7 @@ RAJADIR=/opt/raja/gcc
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar
 THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
+THRUSTFLAG+=--extended-lambda
 #
 # CBLAS for C++ DGEMM
 #
diff --git a/transpose_access_pattern.gif b/transpose_access_pattern.gif
new file mode 100644
index 000000000..a4e39b697
Binary files /dev/null and b/transpose_access_pattern.gif differ
diff --git a/transpose_animation.gif b/transpose_animation.gif
new file mode 100644
index 000000000..f21bc77f5
Binary files /dev/null and b/transpose_animation.gif differ
diff --git a/transpose_code_explanation.pdf b/transpose_code_explanation.pdf
new file mode 100644
index 000000000..17643b69a
Binary files /dev/null and b/transpose_code_explanation.pdf differ
diff --git a/transpose_code_explanation.png b/transpose_code_explanation.png
new file mode 100644
index 000000000..e0be7bba9
Binary files /dev/null and b/transpose_code_explanation.png differ
diff --git a/transpose_matrix_layout.pdf b/transpose_matrix_layout.pdf
new file mode 100644
index 000000000..ddc3b8b02
Binary files /dev/null and b/transpose_matrix_layout.pdf differ
diff --git a/transpose_matrix_layout.png b/transpose_matrix_layout.png
new file mode 100644
index 000000000..896e40f4f
Binary files /dev/null and b/transpose_matrix_layout.png differ