Skip to content

Commit ab26084

Browse files
committed
Added SME1 support for int8 GEMM and IGEMM operations
1 parent 77eba01 commit ab26084

21 files changed

+6993
-914
lines changed

cmake/DownloadKleidiAI.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ ENDIF()
1818
# LINT.IfChange
1919
INCLUDE(ExternalProject)
2020
ExternalProject_Add(kleidiai
21-
URL https://github.com/ARM-software/kleidiai/archive/8ca226712975f24f13f71d04cda039a0ee9f9e2f.zip
22-
URL_HASH SHA256=42155cfc084bf1f80e9ef486470f949502ea8d1b845b2f1bebd58978a1b540aa
21+
URL https://github.com/ARM-software/kleidiai/archive/bd2e6ae060014035e25bf4986be682762c446c2d.zip
22+
URL_HASH SHA256=6a4a4e16b695fd6add6c361de1ebf3c7226f954ae103bc8d71fe6705a41cfd04
2323
SOURCE_DIR "${CMAKE_BINARY_DIR}/kleidiai-source"
2424
BINARY_DIR "${CMAKE_BINARY_DIR}/kleidiai"
2525
CONFIGURE_COMMAND ""

cmake/gen/neonsme2_microkernels.cmake

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ SET(PROD_NEONSME2_MICROKERNEL_SRCS
2020
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c
2121
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c
2222
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c
23-
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c
24-
src/x8-pack-lh/x8-packlh-igemm-neonsme2.c
25-
src/x8-pack-lh/x8-packlh-neonsme2.c
23+
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c
2624
src/x16-pack-lh/x16-packlh-neonsme2.c)
2725

2826
SET(NON_PROD_NEONSME2_MICROKERNEL_SRCS)

cmake/gen/neonsme_microkernels.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
SET(PROD_NEONSME_MICROKERNEL_SRCS
1313
src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme.c
1414
src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme.c
15+
src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme.c
16+
src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme.c
17+
src/x8-pack-lh/x8-packlh-neonsme.c
18+
src/x8-pack-lh/x8-packlh-igemm-neonsme.c
1519
src/x32-pack-lh/x32-packlh-neonsme.c)
1620

1721
SET(NON_PROD_NEONSME_MICROKERNEL_SRCS)

gen/neonsme2_microkernels.bzl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@ PROD_NEONSME2_MICROKERNEL_SRCS = [
1616
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c",
1717
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c",
1818
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c",
19-
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c",
20-
"src/x8-pack-lh/x8-packlh-igemm-neonsme2.c",
21-
"src/x8-pack-lh/x8-packlh-neonsme2.c",
19+
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c",
2220
"src/x16-pack-lh/x16-packlh-neonsme2.c",
2321
]
2422

gen/neonsme_microkernels.bzl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@ Auto-generated file. Do not edit!
88
PROD_NEONSME_MICROKERNEL_SRCS = [
99
"src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme.c",
1010
"src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme.c",
11+
"src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme.c",
12+
"src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme.c",
1113
"src/x32-pack-lh/x32-packlh-neonsme.c",
14+
"src/x8-pack-lh/x8-packlh-igemm-neonsme.c",
15+
"src/x8-pack-lh/x8-packlh-neonsme.c",
1216
]
1317

1418
NON_PROD_NEONSME_MICROKERNEL_SRCS = [

src/configs/gemm-config.c

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -412,11 +412,11 @@ static void init_pqs8_qc8w_gemm_config(void) {
412412
pqs8_qc8w_gemm_config.init.qs8_qc8w =
413413
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
414414
pqs8_qc8w_gemm_config.pack_weights_and_biases =
415-
xnn_pack_kai_qs8_qc8w_weights_and_biases_sme2;
415+
xnn_pack_kai_qs8_qc8w_weights_and_biases_sme;
416416
pqs8_qc8w_gemm_config.packed_stride_weights_and_biases =
417-
xnn_packed_stride_kai_qs8_qc8w_weights_and_biases_sme2;
417+
xnn_packed_stride_kai_qs8_qc8w_weights_and_biases_sme;
418418
pqs8_qc8w_gemm_config.pack_igemm_goki =
419-
(xnn_pack_conv_goki_w_fn)xnn_pack_kai_qs8_conv_goki_w_sme2;
419+
(xnn_pack_conv_goki_w_fn)xnn_pack_kai_qs8_conv_goki_w_sme;
420420
pqs8_qc8w_gemm_config.pack_igemm_kgo =
421421
(xnn_pack_conv_kgo_w_fn)xnn_pack_qs8_conv_kgo_w;
422422
pqs8_qc8w_gemm_config.pack_deconv_goki =
@@ -426,6 +426,39 @@ static void init_pqs8_qc8w_gemm_config(void) {
426426
pqs8_qc8w_gemm_config.nr = nr;
427427
pqs8_qc8w_gemm_config.log2_kr = 2;
428428
#endif // XNN_ENABLE_ARM_SME2
429+
} else if (XNN_ENABLE_ARM_SME && (hardware_config->arch_flags & xnn_arch_arm_sme)) {
430+
#if XNN_ENABLE_ARM_SME
431+
const size_t mr =
432+
xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme_get_mr();
433+
const size_t nr =
434+
xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme_get_nr();
435+
pqs8_qc8w_gemm_config.arch = xnn_arch_arm_sme;
436+
pqs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme);
437+
pqs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(mr)] =
438+
xnn_init_hmp_packed_igemm_ukernel(
439+
(xnn_packed_lhs_igemm_ukernel_fn)
440+
xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme);
441+
pqs8_qc8w_gemm_config.init.qs8_qc8w =
442+
xnn_init_qs8_qc8w_conv_minmax_fp32_scalar_params;
443+
pqs8_qc8w_gemm_config.pack_weights_and_biases =
444+
xnn_pack_kai_qs8_qc8w_weights_and_biases_sme;
445+
pqs8_qc8w_gemm_config.packed_stride_weights_and_biases =
446+
xnn_packed_stride_kai_qs8_qc8w_weights_and_biases_sme;
447+
pqs8_qc8w_gemm_config.pack_igemm_goki =
448+
(xnn_pack_conv_goki_w_fn)xnn_pack_kai_qs8_conv_goki_w_sme;
449+
pqs8_qc8w_gemm_config.pack_igemm_kgo =
450+
(xnn_pack_conv_kgo_w_fn)xnn_pack_qs8_conv_kgo_w;
451+
pqs8_qc8w_gemm_config.pack_deconv_goki =
452+
(xnn_pack_deconv_goki_w_fn)xnn_pack_qs8_deconv_goki_w;
453+
pqs8_qc8w_gemm_config.mr = mr;
454+
pqs8_qc8w_gemm_config.mr_packed = mr;
455+
pqs8_qc8w_gemm_config.nr = nr;
456+
pqs8_qc8w_gemm_config.log2_kr = 2;
457+
#endif // XNN_ENABLE_ARM_SME
458+
459+
}
460+
else {
461+
/* No action */
429462
}
430463
assert(pqs8_qc8w_gemm_config.mr <= XNN_MAX_MR);
431464
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI

src/configs/pack-lh-config.c

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,13 @@ const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
103103

104104
static void init_x8_pack_lh_config(void) {
105105
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
106-
#if XNN_ENABLE_ARM_SME2
106+
#if XNN_ENABLE_ARM_SME
107107
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
108108
assert(hardware_config != NULL);
109-
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
110-
x8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x8_pack_lh_ukernel__neonsme2;
111-
x8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x8_pack_lh_size__neonsme2;
112-
x8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x8_pack_lh_offset__neonsme2;
109+
if ((hardware_config->arch_flags & xnn_arch_arm_sme)) {
110+
x8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x8_pack_lh_ukernel__neonsme;
111+
x8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x8_pack_lh_size__neonsme;
112+
x8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x8_pack_lh_offset__neonsme;
113113
}
114114
#endif // XNN_ENABLE_ARM_SME
115115
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -129,13 +129,13 @@ const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
129129

130130
static void init_x8_igemm_pack_lh_config(void) {
131131
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
132-
#if XNN_ENABLE_ARM_SME2
132+
#if XNN_ENABLE_ARM_SME
133133
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
134134
assert(hardware_config != NULL);
135-
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
136-
x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = (xnn_pack_lh_igemm_ukernel_fn) xnn_x8_pack_lh_ukernel__igemm_neonsme2;
137-
x8_igemm_pack_lh_config.size_for_igemm_fn = (xnn_pack_lh_igemm_size_fn) xnn_x8_pack_lh_size__igemm_neonsme2;
138-
x8_igemm_pack_lh_config.offset_for_igemm_fn = (xnn_pack_lh_igemm_offset_fn) xnn_x8_pack_lh_offset__igemm_neonsme2;
135+
if ((hardware_config->arch_flags & xnn_arch_arm_sme)) {
136+
x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = (xnn_pack_lh_igemm_ukernel_fn) xnn_x8_pack_lh_ukernel__igemm_neonsme;
137+
x8_igemm_pack_lh_config.size_for_igemm_fn = (xnn_pack_lh_igemm_size_fn) xnn_x8_pack_lh_size__igemm_neonsme;
138+
x8_igemm_pack_lh_config.offset_for_igemm_fn = (xnn_pack_lh_igemm_offset_fn) xnn_x8_pack_lh_offset__igemm_neonsme;
139139
}
140140
#endif // XNN_ENABLE_ARM_SME2
141141
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Copyright 2025 Google LLC
2+
//
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
#include <stddef.h>
7+
8+
#include "src/xnnpack/microparams.h"
9+
10+
#if XNN_ENABLE_KLEIDIAI
11+
#include "kai/ukernels/matmul/matmul_clamp_qai8_qai8p_qsi8cxp/kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa.h"
12+
#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x8p2vlx4_x8p_sme.h"
13+
#endif // XNN_ENABLE_KLEIDIAI
14+
15+
size_t xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme_get_mr(void) {
16+
#if XNN_ENABLE_KLEIDIAI
17+
return kai_get_mr_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa();
18+
#else
19+
assert(
20+
"Calling wrapped KleidiAI function, but XNNPACK was compiled without "
21+
"`XNN_ENABLE_KLEIDIAI`." &&
22+
0);
23+
return 0;
24+
#endif // XNN_ENABLE_KLEIDIAI
25+
}
26+
27+
size_t xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme_get_nr(void) {
28+
#if XNN_ENABLE_KLEIDIAI
29+
return kai_get_nr_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa();
30+
#else
31+
assert(
32+
"Calling wrapped KleidiAI function, but XNNPACK was compiled without "
33+
"`XNN_ENABLE_KLEIDIAI`." &&
34+
0);
35+
return 0;
36+
#endif // XNN_ENABLE_KLEIDIAI
37+
}
38+
39+
void xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme(
40+
size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs,
41+
const void* restrict w, int8_t* restrict c, size_t cm_stride,
42+
const union xnn_qs8_qc8w_conv_minmax_params* params) {
43+
#if XNN_ENABLE_KLEIDIAI
44+
const size_t kai_kr = 4;
45+
const size_t k = ks * round_up(kc, kai_kr);
46+
47+
// Repackage the params.
48+
struct kai_matmul_requantize32_params kai_params;
49+
kai_params.output_zero_point = params->fp32_scalar.output_zero_point;
50+
kai_params.min_value = (int8_t)params->fp32_scalar.output_min;
51+
kai_params.max_value = (int8_t)params->fp32_scalar.output_max;
52+
53+
kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa(
54+
mr, nc, k, packed_lhs, w, c, cm_stride, sizeof(int8_t), &kai_params);
55+
#else
56+
assert(
57+
"Calling wrapped KleidiAI function, but XNNPACK was compiled without "
58+
"`XNN_ENABLE_KLEIDIAI`." &&
59+
0);
60+
#endif // XNN_ENABLE_KLEIDIAI
61+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
// Copyright 2024 Google LLC
2+
//
3+
// This source code is licensed under the BSD-style license found in the
4+
// LICENSE file in the root directory of this source tree.
5+
6+
#include <stddef.h>
7+
8+
#include "src/xnnpack/math.h"
9+
#include "src/xnnpack/microparams.h"
10+
11+
#if XNN_ENABLE_KLEIDIAI
12+
#include "kai/ukernels/matmul/matmul_clamp_qai8_qai8p_qsi8cxp/kai_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa.h"
13+
#endif // XNN_ENABLE_KLEIDIAI
14+
15+
16+
size_t xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme_get_mr() {
17+
#if XNN_ENABLE_KLEIDIAI
18+
return kai_get_mr_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa();
19+
#else
20+
assert(
21+
"Calling KleidiAI kai_get_mr wrapper, but XNNPACK was compiled without "
22+
"`XNN_ENABLE_KLEIDIAI`." && 0);
23+
return 0;
24+
#endif // XNN_ENABLE_KLEIDIAI
25+
}
26+
27+
size_t xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme_get_nr() {
28+
#if XNN_ENABLE_KLEIDIAI
29+
return kai_get_nr_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa();
30+
31+
#else
32+
assert(
33+
"Calling KleidiAI kai_get_nr wrapper, but XNNPACK was compiled without "
34+
"`XNN_ENABLE_KLEIDIAI`." && 0);
35+
return 0;
36+
#endif // XNN_ENABLE_KLEIDIAI
37+
}
38+
39+
// Wraps the `kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxpsb2vlx4_2vlx2vl_sme_mopa`
40+
// GEMM microkernel with a name that is compatible with our tooling.
41+
void xnn_pqs8_qc8w_gemm_minmax_ukernel_32x32c4__neonsme(
42+
size_t m, size_t n, size_t k, const void* lhs_packed,
43+
const void* rhs_packed, void* dst, size_t dst_stride_row,
44+
size_t dst_stride_col,
45+
const union xnn_qs8_qc8w_conv_minmax_params* minmax_params) {
46+
#if XNN_ENABLE_KLEIDIAI
47+
struct kai_matmul_requantize32_params kai_params;
48+
kai_params.output_zero_point = minmax_params->fp32_scalar.output_zero_point;
49+
kai_params.min_value = minmax_params->fp32_scalar.output_min;
50+
kai_params.max_value = minmax_params->fp32_scalar.output_max;
51+
52+
kai_run_matmul_clamp_qai8_qai8p2vlx4_qsi8cxp2vlx4sb_2vlx2vl_sme_mopa(
53+
m, n, k / sizeof(int8_t), lhs_packed, rhs_packed, dst, dst_stride_row,
54+
/*dst_stride_col=*/sizeof(int8_t), &kai_params);
55+
#else
56+
assert(
57+
"Calling KleidiAI microkernel wrapper, but XNNPACK was compiled without "
58+
"`XNN_ENABLE_KLEIDIAI`." && 0);
59+
#endif // XNN_ENABLE_KLEIDIAI
60+
}

src/reference/packing.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2156,7 +2156,7 @@ void xnn_pack_kai_qs4_weights_and_biases(
21562156
}
21572157
}
21582158

2159-
size_t xnn_packed_stride_kai_qs8_qc8w_weights_and_biases_sme2(
2159+
size_t xnn_packed_stride_kai_qs8_qc8w_weights_and_biases_sme(
21602160
const struct xnn_gemm_config* gemm_config, size_t k,
21612161
size_t unused_block_size, size_t unused_k_stride, size_t extra_bytes) {
21622162
size_t ret_val =
@@ -2175,7 +2175,7 @@ void transpose_weights_x8(const int8_t* in, int8_t* out, size_t height,
21752175
}
21762176
}
21772177

2178-
void xnn_pack_kai_qs8_qc8w_weights_and_biases_sme2(
2178+
void xnn_pack_kai_qs8_qc8w_weights_and_biases_sme(
21792179
uint32_t flags, const struct xnn_gemm_config* gemm_config,
21802180
size_t input_channels, size_t output_channels, size_t groups,
21812181
size_t unused_block_size, size_t k_stride, const void* accumulator_init,
@@ -2560,7 +2560,7 @@ void xnn_pack_kai_qb4_weights_and_biases(
25602560
}
25612561
}
25622562

2563-
void xnn_pack_kai_qs8_conv_goki_w_sme2(
2563+
void xnn_pack_kai_qs8_conv_goki_w_sme(
25642564
size_t g, size_t nc, size_t ks, size_t kc, size_t nr, size_t kr, size_t sr,
25652565
const int8_t* k, const int32_t* b, const float* scale, void* packed_weights,
25662566
size_t extra_bytes, const struct xnn_qs8_packing_params* params) {

0 commit comments

Comments
 (0)