diff --git a/docs/user_guide/errata.dox b/docs/user_guide/errata.dox
index 52ce8e9798..961a46102d 100644
--- a/docs/user_guide/errata.dox
+++ b/docs/user_guide/errata.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2019-2024 Arm Limited.
+/// Copyright (c) 2019-2025 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -30,6 +30,14 @@ namespace arm_compute
 
 @section S7_1_errata Errata
 
+- (COMPMID-8727) An issue has been identified with the FP16 MMUL Reshaped RHS kernel for small N values.
+  - Versions Affected: >= v52.5.0 && <= v52.7.0
+  - Conditions:
+    - One way to trigger the issue is to let the kernel be selected through its heuristic function (e.g. via the configure() call stack of the CLGemm operator class) when N at the kernel level is less than or equal to 32.
+      - In those cases the heuristic generates an invalid configuration in which, for the N dimension, the product of the MMUL block size (MMUL_N0) and the conventional block size (N0) is larger than N itself.
+    - Another way is to use the kernel directly with N less than or equal to MMUL_N0 * N0, since the validation in the kernel-level configure() function does not catch this case.
+  - Result: The FP16 MMUL Reshaped RHS kernel is incorrectly configured with block sizes that cause its operations to extend beyond the tensor dimensions.
+
 - (COMPMID-7536) NEDepthwiseConvolutionLayer and NEPoolingLayer may produce wrong results with OpenMP® scheduler
   - Versions: >= v24.05 && < v24.08
   - Oses: All
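To make the out-of-bounds condition described in the errata concrete, here is a minimal standalone sketch in plain C++, not Compute Library code. The block sizes n0 = 8 and mmul_n0 = 4 are assumptions, chosen only because their product matches the N <= 32 threshold quoted in the errata entry.

    // Sketch of the invalid configuration: one block tile in the N
    // dimension spans N0 * MMUL_N0 columns, so any tensor with N below
    // that product is overrun. Block sizes here are assumed values.
    #include <cstdio>

    int main()
    {
        const unsigned int n0      = 8;  // conventional block size (assumed)
        const unsigned int mmul_n0 = 4;  // MMUL block size (assumed)
        const unsigned int n       = 24; // an N that triggers the issue (n <= 32)

        const unsigned int block_n = n0 * mmul_n0; // columns one block covers: 32
        std::printf("block spans %u columns, tensor has %u: %u column(s) out of bounds\n",
                    block_n, n, block_n - n);
        return 0;
    }

The three code changes below close both entry points: the kernel-level validation now rejects such shapes, and the heuristic no longer proposes them.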
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index eb8af29efb..f251e34c43 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -100,7 +100,7 @@ Status validate_arguments(const ITensorInfo *src0,
     if (arm_matrix_multiply_fp16_supported(CLKernelLibrary::get().get_device()))
     {
         // These error messages are for FP16 acc.
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((n > rhs_info.n0 * mmul_n0),
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((n <= rhs_info.n0 * mmul_n0),
                                         "N must be greater than N0 * MMUL_N0 in the FP16 MMUL Kernel");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k % 4) != 0), "K must be multiple of 4 in the FP16 MMUL Kernel");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((m < 4), "M must be greater than or equal to 4 in the FP16 MMUL Kernel");
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
index 0c89f15bbd..ef8482e0d7 100644
--- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
@@ -198,7 +198,7 @@ bool is_mmul_kernel_preferred_fp16_acc(const unsigned int m,
         const unsigned int m_div_m0                          = ceil_to_multiple_m_m0 / best_m0;
         const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
         const unsigned int gws_y                             = ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0;
-        return ((k % mmul_k0) == 0) && (gws_y >= 4);
+        return ((k % mmul_k0) == 0) && (gws_y >= 4) && (n > best_n0 * mmul_n0);
     }
 
     return false;
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp
index 512565c5ee..3632df1524 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRhsMMUL.cpp
@@ -75,7 +75,7 @@ const auto m_values = make("M", {49});
 
 /** N values to test */
 const auto n_values = make("N", {257, 64, 48});
-const auto n_values_fp16 = make("N", {79, 32, 80});
+const auto n_values_fp16 = make("N", {79, 80});
 const auto n_values_texture_fp16 = make("N", {128, 96, 48});
 
 /** K values to test */
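For reference, a minimal sketch of the guard that both fixes introduce. This is illustrative plain C++ rather than Compute Library code; the default block sizes n0 = 8 and mmul_n0 = 4 are assumptions consistent with the N <= 32 threshold in the errata entry.

    #include <iostream>

    // Illustrative restatement (not the ACL API) of the condition that the
    // heuristic and the kernel validation now agree on: the kernel is only
    // usable when N exceeds N0 * MMUL_N0. Default values are assumed.
    bool n_large_enough_for_mmul(unsigned int n, unsigned int n0 = 8, unsigned int mmul_n0 = 4)
    {
        return n > n0 * mmul_n0;
    }

    int main()
    {
        std::cout << n_large_enough_for_mmul(32) << '\n'; // 0: rejected (was the bug)
        std::cout << n_large_enough_for_mmul(79) << '\n'; // 1: accepted
    }

With the flipped check in validate_arguments(), direct misuse of the kernel now fails validation instead of reading past the tensor, and the extra term in is_mmul_kernel_preferred_fp16_acc() stops the heuristic from selecting the kernel for such shapes; N = 32 is removed from the FP16 test values accordingly.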