diff --git a/.gitignore b/.gitignore
index 8f0ec505d..5f512c319 100644
--- a/.gitignore
+++ b/.gitignore
@@ -452,6 +452,10 @@ SERIAL/Sparse/sparse
 SERIAL/Stencil/stencil
 SERIAL/Synch_p2p/p2p
 SERIAL/Transpose/transpose
+COBOL/nstream
+COBOL/transpose
+COBOL/p2p
+COBOL/dgemm
 dgemm-vector.dSYM
 dgemm.dSYM
 nstream-opencl.dSYM
diff --git a/C1z/transpose-2d-openacc.c b/C1z/transpose-2d-openacc.c
old mode 100755
new mode 100644
diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu
index 79937d779..f1ce70a55 100644
--- a/Cxx11/dgemm-cublas.cu
+++ b/Cxx11/dgemm-cublas.cu
@@ -182,13 +182,11 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
-  int batches = 0;
-  int input_copy = 0;
+  int iterations, order, batches = 0;
+  bool input_copy = false, random_initialization = false;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>]";
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<random initialization [0/1]>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -213,6 +211,13 @@ int main(int argc, char * argv[])
           throw "ERROR: input_copy was not 0 or 1";
         }
       }
+
+      if (argc > 5) {
+        random_initialization = std::atoi(argv[5]);
+        if (random_initialization != 0 && random_initialization != 1) {
+          throw "ERROR: random_initialization was not 0 or 1";
+        }
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -229,6 +234,7 @@ int main(int argc, char * argv[])
     std::cout << "Batch size      = " << batches << " (batched BLAS)" << std::endl;
   }
   std::cout << "Input copy      = " << (input_copy ? "yes" : "no") << std::endl;
+  std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
@@ -270,10 +276,24 @@ int main(int argc, char * argv[])
         prk::CUDA::copyH2Dasync(&(d_a[b*nelems]), h_a, nelems);
         prk::CUDA::copyH2Dasync(&(d_b[b*nelems]), h_b, nelems);
       }
-      prk::CUDA::sync();
       init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+  } else if (random_initialization) {
+      // Initialize matrices with CURAND uniform distribution [0,1]
+      curandGenerator_t gen;
+      prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+      prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+
+      // Generate uniform random numbers in [0,1] for matrices A and B
+      prk::check( curandGenerateUniformDouble(gen, d_a, matrices * nelems) );
+      prk::check( curandGenerateUniformDouble(gen, d_b, matrices * nelems) );
+
+      prk::check( curandDestroyGenerator(gen) );
+
+      // Initialize matrix C to zero
+      init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+
   } else {
       init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
@@ -346,12 +366,14 @@ int main(int argc, char * argv[])
   }
   residuum /= matrices;
 
-  if (residuum < epsilon) {
+  if (residuum < epsilon || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (!random_initialization) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations/matrices;
     auto nflops = 2.0 * prk::pow(forder,3);
     prk::print_flop_rate_time("FP64", nflops/avgtime, avgtime);
diff --git a/Cxx11/prk_cuda.h b/Cxx11/prk_cuda.h
index 584499379..28e0fa5dc 100644
--- a/Cxx11/prk_cuda.h
+++ b/Cxx11/prk_cuda.h
@@ -15,6 +15,7 @@
 
 #ifdef PRK_USE_CUBLAS
 #include <cublas_v2.h>
+#include <curand.h>
 #endif
 
 //#include
@@ -58,6 +59,14 @@ namespace prk
             std::abort();
         }
     }
+
+    void check(curandStatus_t rc)
+    {
+        if (rc!=CURAND_STATUS_SUCCESS) {
+            std::cerr << "PRK CURAND error: " << rc << std::endl;
+            std::abort();
+        }
+    }
 #endif
 
 namespace CUDA
diff --git a/Cxx11/sgemm-cublas.cu b/Cxx11/sgemm-cublas.cu
index 7883daf90..902ff727e 100644
--- a/Cxx11/sgemm-cublas.cu
+++ b/Cxx11/sgemm-cublas.cu
@@ -62,6 +62,7 @@
 
 #include "prk_util.h"
 #include "prk_cuda.h"
+#include <curand.h>
 
 #if 0
 __global__ void init(unsigned order, float * A, float * B, float * C)
@@ -182,14 +183,12 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
-  int batches = 0;
-  bool input_copy{false};
+  int iterations, order, batches = 0;
+  bool input_copy = false, random_initialization = false;
   bool tf32{false};
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>]";
+        throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>] [<use TF32 [0/1]>] [<random initialization>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -215,6 +214,10 @@ int main(int argc, char * argv[])
       if (argc > 5) {
         tf32 = prk::parse_boolean(std::string(argv[5]));
       }
+
+      if (argc > 6) {
+        random_initialization = prk::parse_boolean(std::string(argv[6]));
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -232,6 +235,7 @@ int main(int argc, char * argv[])
   }
   std::cout << "Input copy      = " << (input_copy ? "yes" : "no") << std::endl;
   std::cout << "TF32            = " << (tf32 ? "yes" : "no") << std::endl;
+  std::cout << "Randomized data = " << (random_initialization ? "yes" : "no") << std::endl;
 
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
@@ -281,6 +285,21 @@ int main(int argc, char * argv[])
       init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+  } else if (random_initialization) {
+      // Initialize matrices with CURAND uniform distribution [0,1]
+      curandGenerator_t gen;
+      prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+      prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+
+      // Generate uniform random numbers in [0,1] for matrices A and B
+      prk::check( curandGenerateUniform(gen, d_a, matrices * nelems) );
+      prk::check( curandGenerateUniform(gen, d_b, matrices * nelems) );
+
+      prk::check( curandDestroyGenerator(gen) );
+
+      // Initialize matrix C to zero
+      init<<<dimGrid, dimBlock>>>(order, matrices, d_c);
+
   } else {
       init<<<dimGrid, dimBlock>>>(order, matrices, d_a, d_b, d_c);
@@ -358,12 +377,14 @@ int main(int argc, char * argv[])
   }
   residuum /= matrices;
 
-  if (residuum < epsilon) {
+  if (residuum < epsilon || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (!random_initialization) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations/matrices;
     auto nflops = 2.0 * prk::pow(forder,3);
     std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
diff --git a/Cxx11/transpose-put-mpi.cc b/Cxx11/transpose-put-mpi.cc
new file mode 100644
index 000000000..eb86b0703
--- /dev/null
+++ b/Cxx11/transpose-put-mpi.cc
@@ -0,0 +1,234 @@
+///
+/// Copyright (c) 2020, Intel Corporation
+/// Copyright (c) 2025, NVIDIA
+///
+/// Redistribution and use in source and binary forms, with or without
+/// modification, are permitted provided that the following conditions
+/// are met:
+///
+/// * Redistributions of source code must retain the above copyright
+///       notice, this list of conditions and the following disclaimer.
+/// * Redistributions in binary form must reproduce the above
+///       copyright notice, this list of conditions and the following
+///       disclaimer in the documentation and/or other materials provided
+///       with the distribution.
+/// * Neither the name of Intel Corporation nor the names of its
+///       contributors may be used to endorse or promote products
+///       derived from this software without specific prior written
+///       permission.
+///
+/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+/// POSSIBILITY OF SUCH DAMAGE.
+
+//////////////////////////////////////////////////////////////////////
+///
+/// NAME:    transpose
+///
+/// PURPOSE: This program measures the time for the transpose of a
+///          column-major stored matrix into a row-major stored matrix.
+///
+/// USAGE:   Program input is the matrix order and the number of times to
+///          repeat the operation:
+///
+///          transpose <# iterations> <matrix order> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_mpi.h"
+#include "transpose-kernel.h"
+
+int main(int argc, char * argv[])
+{
+  {
+    prk::MPI::state mpi(&argc,&argv);
+
+    int np = prk::MPI::size();
+    int me = prk::MPI::rank();
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations;
+    size_t order, block_order, tile_size;
+
+    if (me == 0) {
+      std::cout << "Parallel Research Kernels" << std::endl;
+      std::cout << "C++11/MPI Matrix transpose (PUT-based): B = A^T" << std::endl;
+
+      try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <matrix order> [tile size]";
+        }
+
+        iterations = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        order = std::atol(argv[2]);
+        if (order <= 0) {
+          throw "ERROR: Matrix Order must be greater than 0";
+        // } else if (order > prk::get_max_matrix_size()) {
+        //   throw "ERROR: matrix dimension too large - overflow risk";
+        }
+
+        if (order % np != 0) {
+          throw "ERROR: Matrix order must be an integer multiple of the number of MPI processes";
+        }
+
+        // default tile size for tiling of local transpose
+        tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+        // a negative tile size means no tiling of the local transpose
+        if (tile_size <= 0) tile_size = order;
+      }
+      catch (const char * e) {
+        std::cout << e << std::endl;
+        prk::MPI::abort(1);
+        return 1;
+      }
+
+      std::cout << "Number of iterations = " << iterations << std::endl;
+      std::cout << "Matrix order         = " << order << std::endl;
+      std::cout << "Tile size            = " << tile_size << std::endl;
+    }
+
+    prk::MPI::bcast(&iterations);
+    prk::MPI::bcast(&order);
+    prk::MPI::bcast(&tile_size);
+
+    block_order = order / np;
+
+    //std::cout << "@" << me << " order=" << order << " block_order=" << block_order << std::endl;
+
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space for the input and transpose matrix
+    //////////////////////////////////////////////////////////////////////
+
+    double trans_time{0};
+
+    // WA_A = window for A, LA_A = local address for A
+    auto [WA_A,LA_A] = prk::MPI::win_allocate<double>(order * block_order);
+    // WA_B = window for B, LA_B = local address for B (B must be in an MPI window for PUT operations)
+    auto [WA_B,LA_B] = prk::MPI::win_allocate<double>(order * block_order);
+
+    //A[order][block_order]
+    prk::vector<double> A(LA_A, order * block_order, 0.0);
+    prk::vector<double> B(LA_B, order * block_order, 0.0);
+    prk::vector<double> T(block_order * block_order, 0.0);
+
+    // fill A with the sequence 0 to order^2-1 as doubles
+    for (size_t i=0; i <# iterations> [tile size]
+///
+///          An optional parameter specifies the tile size used to divide the
+///          individual matrix blocks for improved cache and TLB performance.
+///
+///          The output consists of diagnostics to make sure the
+///          transpose worked and timing statistics.
+///
+/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
+///          Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
+///
+//////////////////////////////////////////////////////////////////////
+
+#include "prk_util.h"
+#include "prk_oshmem.h"
+#include "transpose-kernel.h"
+
+int main(int argc, char * argv[])
+{
+  {
+    prk::SHMEM::state shmem(&argc,&argv);
+
+    int np = prk::SHMEM::size();
+    int me = prk::SHMEM::rank();
+
+    //////////////////////////////////////////////////////////////////////
+    /// Read and test input parameters
+    //////////////////////////////////////////////////////////////////////
+
+    int iterations;
+    size_t order, block_order, tile_size;
+
+    if (me == 0) {
+      std::cout << "Parallel Research Kernels" << std::endl;
+      std::cout << "C++11/SHMEM Matrix transpose (PUT-based): B = A^T" << std::endl;
+
+      try {
+        if (argc < 3) {
+          throw "Usage: <# iterations> <matrix order> [tile size]";
+        }
+
+        iterations = std::atoi(argv[1]);
+        if (iterations < 1) {
+          throw "ERROR: iterations must be >= 1";
+        }
+
+        order = std::atol(argv[2]);
+        if (order <= 0) {
+          throw "ERROR: Matrix Order must be greater than 0";
+        }
+
+        if (order % np != 0) {
+          throw "ERROR: Matrix order must be an integer multiple of the number of SHMEM PEs";
+        }
+
+        // default tile size for tiling of local transpose
+        tile_size = (argc>3) ? std::atoi(argv[3]) : 32;
+        // a negative tile size means no tiling of the local transpose
+        if (tile_size <= 0) tile_size = order;
+      }
+      catch (const char * e) {
+        std::cout << e << std::endl;
+        prk::SHMEM::abort(1);
+        return 1;
+      }
+
+      std::cout << "Number of iterations = " << iterations << std::endl;
+      std::cout << "Matrix order         = " << order << std::endl;
+      std::cout << "Tile size            = " << tile_size << std::endl;
+    }
+
+    prk::SHMEM::broadcast(&iterations);
+    prk::SHMEM::broadcast(&order);
+    prk::SHMEM::broadcast(&tile_size);
+
+    block_order = order / np;
+
+    //////////////////////////////////////////////////////////////////////
+    // Allocate space for the input and transpose matrix
+    //////////////////////////////////////////////////////////////////////
+
+    double trans_time{0};
+
+    // A[order][block_order] - source matrix in symmetric memory
+    auto LA = prk::SHMEM::allocate<double>(order * block_order);
+    prk::vector<double> A(LA, order * block_order, 0.0);
+
+    // B[order][block_order] - destination matrix in symmetric memory for PUT operations
+    auto LB = prk::SHMEM::allocate<double>(order * block_order);
+    prk::vector<double> B(LB, order * block_order, 0.0);
+
+    // T[block_order][block_order] - temporary workspace for local transpose
+    prk::vector<double> T(block_order * block_order, 0.0);
+
+    // fill A with the sequence 0 to order^2-1 as doubles
+    for (size_t i=0; i
+template <typename T>
+void curand_init_matrices(T* d_a, T* d_b, size_t nelems) {
+  std::cerr << "No valid CURAND template match for type T" << std::endl;
+  std::abort();
+}
+
+template <>
+void curand_init_matrices<float>(float* d_a, float* d_b, size_t nelems) {
+  curandGenerator_t gen;
+  prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+  prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+  prk::check( curandGenerateUniform(gen, d_a, nelems) );
+  prk::check( curandGenerateUniform(gen, d_b, nelems) );
+  prk::check( curandDestroyGenerator(gen) );
+}
+
+template <>
+void curand_init_matrices<double>(double* d_a, double* d_b, size_t nelems) {
+  curandGenerator_t gen;
+  prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+  prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+  prk::check( curandGenerateUniformDouble(gen, d_a, nelems) );
+  prk::check( curandGenerateUniformDouble(gen, d_b, nelems) );
+  prk::check( curandDestroyGenerator(gen) );
+}
+
+template <>
+void curand_init_matrices<__half>(__half* d_a, __half* d_b, size_t nelems) {
+  // CURAND doesn't directly support __half, so generate float and convert
+  auto temp_a = prk::CUDA::malloc_device<float>(nelems);
+  auto temp_b = prk::CUDA::malloc_device<float>(nelems);
+
+  curandGenerator_t gen;
+  prk::check( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) );
+  prk::check( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) );
+  prk::check( curandGenerateUniform(gen, temp_a, nelems) );
+  prk::check( curandGenerateUniform(gen, temp_b, nelems) );
+  prk::check( curandDestroyGenerator(gen) );
+
+  // Convert float to __half
+  const int block_size = 256;
+  const int grid_size = (nelems + block_size - 1) / block_size;
+
+  auto convert_kernel = [] __device__ (int idx, float* src, __half* dst, size_t n) {
+      if (idx < n) dst[idx] = __float2half(src[idx]);
+  };
+
+  // Launch conversion kernels
+  thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
+                     [=] __device__ (int idx) { if (idx < nelems) d_a[idx] = __float2half(temp_a[idx]); });
+  thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), nelems,
+                     [=] __device__ (int idx) { if (idx < nelems) d_b[idx] = __float2half(temp_b[idx]); });
+
+  prk::CUDA::free(temp_a);
+  prk::CUDA::free(temp_b);
+}
+
 template
 void prk_gemm(const cublasHandle_t & h,
               const int order, const TC alpha, const TC beta,
@@ -141,7 +200,7 @@ void prk_gemm(const cublasHandle_t & h,
 }
 
 template <typename T>
-void run(const cublasHandle_t & h, int iterations, int order)
+void run(const cublasHandle_t & h, int iterations, int order, bool random_initialization)
 {
   double gemm_time{0};
@@ -156,7 +215,15 @@ void run(const cublasHandle_t & h, int iterations, int order)
   auto d_a = prk::CUDA::malloc_device<T>(nelems);
   auto d_b = prk::CUDA::malloc_device<T>(nelems);
   auto d_c = prk::CUDA::malloc_device<T>(nelems);
-  init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+
+  if (random_initialization) {
+    // Initialize matrices with CURAND uniform distribution [0,1]
+    curand_init_matrices(d_a, d_b, nelems);
+    // Initialize matrix C to zero
+    init<<<dimGrid, dimBlock>>>(order, d_c);
+  } else {
+    init<<<dimGrid, dimBlock>>>(order, d_a, d_b, d_c);
+  }
   prk::CUDA::sync();
   {
     for (int iter = 0; iter<=iterations; iter++) {
@@ -190,12 +257,14 @@
   }
   const double residuum = std::abs(checksum - reference) / reference;
   const double epsilon{1.0e-8};
-  if ((residuum < epsilon) || (sizeof(T) < 4)) {
+  if ((residuum < epsilon) || (sizeof(T) < 4) || random_initialization) {
 #if VERBOSE
     std::cout << "Reference checksum = " << reference << "\n"
               << "Actual checksum = " << checksum << std::endl;
 #endif
-    std::cout << "Solution validates" << std::endl;
+    if (residuum < epsilon) {
+      std::cout << "Solution validates" << std::endl;
+    }
     auto avgtime = gemm_time/iterations;
     auto nflops = 2.0 * prk::pow(forder,3);
     auto is_fp64 = (typeid(T) == typeid(double));
@@ -222,11 +291,11 @@ int main(int argc, char * argv[])
   /// Read and test input parameters
   //////////////////////////////////////////////////////////////////////
 
-  int iterations;
-  int order;
+  int iterations, order;
+  bool random_initialization = false;
   try {
       if (argc < 2) {
-        throw "Usage: <# iterations> <matrix order>";
+        throw "Usage: <# iterations> <matrix order> [<random initialization>]";
       }
 
       iterations  = std::atoi(argv[1]);
@@ -240,6 +309,10 @@ int main(int argc, char * argv[])
       } else if (order > prk::get_max_matrix_size()) {
        throw "ERROR: matrix dimension too large - overflow risk";
      }
+
+      if (argc > 3) {
+        random_initialization = prk::parse_boolean(std::string(argv[3]));
+      }
   }
   catch (const char * e) {
     std::cout << e << std::endl;
@@ -250,6 +323,7 @@ int main(int argc, char * argv[])
 
   std::cout << "Number of iterations = " << iterations << std::endl;
   std::cout << "Matrix order         = " << order << std::endl;
+  std::cout << "Randomized data      = " << (random_initialization ? "yes" : "no") << std::endl;
 
   //////////////////////////////////////////////////////////////////////
   /// Setup CUBLAS environment
@@ -257,9 +331,9 @@ int main(int argc, char * argv[])
   cublasHandle_t h;
   prk::check( cublasCreate(&h) );
 
-  run<__half>(h, iterations, order);
-  run<float>(h, iterations, order);
-  run<double>(h, iterations, order);
+  run<__half>(h, iterations, order, random_initialization);
+  run<float>(h, iterations, order, random_initialization);
+  run<double>(h, iterations, order, random_initialization);
 
   prk::check( cublasDestroy(h) );
   return 0;
diff --git a/common/make.defs.cuda b/common/make.defs.cuda
index 0fd828d7c..4419c03ed 100644
--- a/common/make.defs.cuda
+++ b/common/make.defs.cuda
@@ -148,6 +148,7 @@ RAJADIR=/opt/raja/gcc
 RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG}
 THRUSTDIR=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/21.11/compilers/include-stdpar
 THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG}
+THRUSTFLAG+=--extended-lambda
 #
 # CBLAS for C++ DGEMM
 #
diff --git a/transpose_access_pattern.gif b/transpose_access_pattern.gif
new file mode 100644
index 000000000..a4e39b697
Binary files /dev/null and b/transpose_access_pattern.gif differ
diff --git a/transpose_animation.gif b/transpose_animation.gif
new file mode 100644
index 000000000..f21bc77f5
Binary files /dev/null and b/transpose_animation.gif differ
diff --git a/transpose_code_explanation.pdf b/transpose_code_explanation.pdf
new file mode 100644
index 000000000..17643b69a
Binary files /dev/null and b/transpose_code_explanation.pdf differ
diff --git a/transpose_code_explanation.png b/transpose_code_explanation.png
new file mode 100644
index 000000000..e0be7bba9
Binary files /dev/null and b/transpose_code_explanation.png differ
diff --git a/transpose_matrix_layout.pdf b/transpose_matrix_layout.pdf
new file mode 100644
index 000000000..ddc3b8b02
Binary files /dev/null and b/transpose_matrix_layout.pdf differ
diff --git a/transpose_matrix_layout.png b/transpose_matrix_layout.png
new file mode 100644
index 000000000..896e40f4f
Binary files /dev/null and b/transpose_matrix_layout.png differ