google · gmiodice · Jul 8, 2025 · Jul 9, 2025 · Sep 8, 2025
diff --git a/build_srcs.bzl b/build_srcs.bzl
@@ -280,6 +280,7 @@ MICROKERNEL_DEFS = [
     "src/x64-transposec/x64-transposec.inc",
     "src/x8-pack-lh/x8-pack-lh.inc",
     "src/x8-pack-lh/x8-pack-lh-igemm.inc",
+    "src/x16-pack-lh/x16-pack-lh-igemm.inc",
     "src/x8-packq/x8-packq.inc",
     "src/x8-packw/x8-packw.inc",
     "src/x8-transposec/x8-transposec.inc",

diff --git a/cmake/gen/neonsme2_microkernels.cmake b/cmake/gen/neonsme2_microkernels.cmake
@@ -14,13 +14,15 @@ SET(PROD_NEONSME2_MICROKERNEL_SRCS
   src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c
   src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c
   src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme2.c
+  src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c
   src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme2.c
   src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-1x32c4-minmax-neonsme2.c
   src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme2.c
   src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c
   src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c
   src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c
   src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c
+  src/x16-pack-lh/x16-packlh-igemm-neonsme2.c
   src/x8-pack-lh/x8-packlh-igemm-neonsme2.c
   src/x8-pack-lh/x8-packlh-neonsme2.c
   src/x16-pack-lh/x16-packlh-neonsme2.c

diff --git a/gen/neonsme2_microkernels.bzl b/gen/neonsme2_microkernels.bzl
@@ -10,13 +10,15 @@ PROD_NEONSME2_MICROKERNEL_SRCS = [
     "src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c",
     "src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c",
     "src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme2.c",
+    "src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c",
     "src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme2.c",
     "src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-1x32c4-minmax-neonsme2.c",
     "src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme2.c",
     "src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c",
     "src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c",
     "src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c",
     "src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c",
+    "src/x16-pack-lh/x16-packlh-igemm-neonsme2.c",
     "src/x8-pack-lh/x8-packlh-igemm-neonsme2.c",
     "src/x8-pack-lh/x8-packlh-neonsme2.c",
     "src/x16-pack-lh/x16-packlh-neonsme2.c",

diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
@@ -50,6 +50,7 @@ tools/generate-gemm-test.py --spec test/qs8-qc4w-gemm-minmax-fp32.yaml --output-
 tools/generate-gemm-test.py --spec test/qs8-qc8w-gemm-minmax-fp32.yaml --output-test test/qs8-qc8w-gemm-minmax-fp32.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-2.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-3.cc --output-bench bench/qs8-qc8w-gemm-fp32.cc &
 
 ### Tests for IGEMM micro-kernels
+tools/generate-gemm-test.py --spec test/pf16-f16-igemm-minmax.yaml --output-test test/pf16-f16-igemm-minmax.cc &
 tools/generate-gemm-test.py --spec test/f16-igemm-minmax.yaml --output-test test/f16-igemm-minmax.cc &
 tools/generate-gemm-test.py --spec test/f16-f32acc-igemm-minmax.yaml --output-test test/f16-f32acc-igemm-minmax.cc &
 

diff --git a/src/configs/gemm-config.c b/src/configs/gemm-config.c
@@ -312,9 +312,17 @@ static void init_pf16_gemm_config(void) {
       pf16_gemm_config.arch = xnn_arch_arm_sme2;
       pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_1x32c2__neonsme2);
       pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2);
+      pf16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(mr)] =
+        xnn_init_hmp_packed_igemm_ukernel(
+            (xnn_packed_lhs_igemm_ukernel_fn)
+                xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2);
       pf16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
       pf16_gemm_config.pack_weights_and_biases = xnn_pack_kai_f16_weights_and_biases;
       pf16_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f16_weights_and_biases;
+      pf16_gemm_config.pack_igemm_goki =
+        (xnn_pack_conv_goki_w_fn)xnn_pack_kai_f16_conv_goki_w_sme2;
+      pf16_gemm_config.pack_igemm_kgo =
+        (xnn_pack_conv_kgo_w_fn)xnn_pack_f16_conv_kgo_w;
       pf16_gemm_config.mr = mr;
       pf16_gemm_config.mr_packed = mr;
       pf16_gemm_config.nr = nr;
@@ -5028,6 +5036,7 @@ const struct xnn_gemm_config* xnn_init_pf16_gemm_config() {
     return NULL;
   }
   XNN_INIT_ONCE(pf16_gemm);
+
   return pf16_gemm_config.mr ? &pf16_gemm_config : NULL;
 }
 

diff --git a/src/configs/pack-lh-config.c b/src/configs/pack-lh-config.c
@@ -24,6 +24,7 @@ static struct xnn_pack_lh_config x8_pack_lh_config = {0};
 static struct xnn_pack_lh_config x16_pack_lh_config = {0};
 static struct xnn_pack_lh_config x32_pack_lh_config = {0};
 static struct xnn_pack_lh_config x8_igemm_pack_lh_config = {0};
+static struct xnn_pack_lh_config x16_igemm_pack_lh_config = {0};
 
 XNN_INIT_ONCE_GUARD(f16_qdint8_pack_lh);
 XNN_INIT_ONCE_GUARD(f16_qduint8_pack_lh);
@@ -34,11 +35,15 @@ XNN_INIT_ONCE_GUARD(x8_pack_lh);
 XNN_INIT_ONCE_GUARD(x16_pack_lh);
 XNN_INIT_ONCE_GUARD(x32_pack_lh);
 XNN_INIT_ONCE_GUARD(x8_igemm_pack_lh);
+XNN_INIT_ONCE_GUARD(x16_igemm_pack_lh);
 
 static void init_f16_qdint8_pack_lh_config(void) {
-  f16_qdint8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f16_qdint8;
-  f16_qdint8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
-  f16_qdint8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
+  f16_qdint8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f16_qdint8;
+  f16_qdint8_pack_lh_config.size_fn =
+      (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
+  f16_qdint8_pack_lh_config.offset_fn =
+      (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
   f16_qdint8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_HALF;
   f16_qdint8_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -54,9 +59,12 @@ const struct xnn_pack_lh_config* xnn_init_f16_qdint8_pack_lh_config() {
 }
 
 static void init_f16_qduint8_pack_lh_config(void) {
-  f16_qduint8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f16_qduint8;
-  f16_qduint8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
-  f16_qduint8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
+  f16_qduint8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f16_qduint8;
+  f16_qduint8_pack_lh_config.size_fn =
+      (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
+  f16_qduint8_pack_lh_config.offset_fn =
+      (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
   f16_qduint8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_HALF;
   f16_qduint8_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -72,9 +80,12 @@ const struct xnn_pack_lh_config* xnn_init_f16_qduint8_pack_lh_config() {
 }
 
 static void init_f32_qdint8_pack_lh_config(void) {
-  f32_qdint8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f32_qdint8;
-  f32_qdint8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
-  f32_qdint8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
+  f32_qdint8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f32_qdint8;
+  f32_qdint8_pack_lh_config.size_fn =
+      (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
+  f32_qdint8_pack_lh_config.offset_fn =
+      (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
   f32_qdint8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
   f32_qdint8_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -90,9 +101,12 @@ const struct xnn_pack_lh_config* xnn_init_f32_qdint8_pack_lh_config() {
 }
 
 static void init_f32_qduint8_pack_lh_config(void) {
-  f32_qduint8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f32_qduint8;
-  f32_qduint8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
-  f32_qduint8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
+  f32_qduint8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_pack_lh_f32_qduint8;
+  f32_qduint8_pack_lh_config.size_fn =
+      (xnn_pack_lh_size_fn)xnn_pack_lh_fx_qd8_packed_size;
+  f32_qduint8_pack_lh_config.offset_fn =
+      (xnn_pack_lh_offset_fn)xnn_pack_lh_fx_qd8_packed_offset;
   f32_qduint8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
   f32_qduint8_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -109,12 +123,16 @@ const struct xnn_pack_lh_config* xnn_init_f32_qduint8_pack_lh_config() {
 
 static void init_qp8_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-  qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2;
+  qp8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2;
 #else
-  qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1;
+  qp8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1;
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-  qp8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size;
-  qp8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset;
+  qp8_pack_lh_config.size_fn =
+      (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size;
+  qp8_pack_lh_config.offset_fn =
+      (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset;
   qp8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
   qp8_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -132,12 +150,16 @@ const struct xnn_pack_lh_config* xnn_init_qp8_pack_lh_config() {
 static void init_x32_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
 #if XNN_ENABLE_ARM_SME2
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   assert(hardware_config != NULL);
   if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
-    x32_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x32_pack_lh_ukernel__neonsme2;
-    x32_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x32_pack_lh_size__neonsme2;
-    x32_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x32_pack_lh_offset__neonsme2;
+    x32_pack_lh_config.pack_lh_fn =
+        (xnn_pack_lh_ukernel_fn)xnn_x32_pack_lh_ukernel__neonsme2;
+    x32_pack_lh_config.size_fn =
+        (xnn_pack_lh_size_fn)xnn_x32_pack_lh_size__neonsme2;
+    x32_pack_lh_config.offset_fn =
+        (xnn_pack_lh_offset_fn)xnn_x32_pack_lh_offset__neonsme2;
   }
 #endif  // XNN_ENABLE_ARM_SME2
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -147,7 +169,8 @@ static void init_x32_pack_lh_config(void) {
 }
 
 const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   if (hardware_config == NULL) {
     return NULL;
   }
@@ -158,12 +181,16 @@ const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() {
 static void init_x16_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
 #if XNN_ENABLE_ARM_SME2
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   assert(hardware_config != NULL);
   if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
-    x16_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x16_pack_lh_ukernel__neonsme2;
-    x16_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x16_pack_lh_size__neonsme2;
-    x16_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x16_pack_lh_offset__neonsme2;
+    x16_pack_lh_config.pack_lh_fn =
+        (xnn_pack_lh_ukernel_fn)xnn_x16_pack_lh_ukernel__neonsme2;
+    x16_pack_lh_config.size_fn =
+        (xnn_pack_lh_size_fn)xnn_x16_pack_lh_size__neonsme2;
+    x16_pack_lh_config.offset_fn =
+        (xnn_pack_lh_offset_fn)xnn_x16_pack_lh_offset__neonsme2;
   }
 #endif  // XNN_ENABLE_ARM_SME2
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -173,7 +200,8 @@ static void init_x16_pack_lh_config(void) {
 }
 
 const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   if (hardware_config == NULL) {
     return NULL;
   }
@@ -184,12 +212,16 @@ const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
 static void init_x8_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
 #if XNN_ENABLE_ARM_SME2
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   assert(hardware_config != NULL);
   if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
-    x8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x8_pack_lh_ukernel__neonsme2;
-    x8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x8_pack_lh_size__neonsme2;
-    x8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x8_pack_lh_offset__neonsme2;
+    x8_pack_lh_config.pack_lh_fn =
+        (xnn_pack_lh_ukernel_fn)xnn_x8_pack_lh_ukernel__neonsme2;
+    x8_pack_lh_config.size_fn =
+        (xnn_pack_lh_size_fn)xnn_x8_pack_lh_size__neonsme2;
+    x8_pack_lh_config.offset_fn =
+        (xnn_pack_lh_offset_fn)xnn_x8_pack_lh_offset__neonsme2;
   }
 #endif  // XNN_ENABLE_ARM_SME2
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -199,7 +231,8 @@ static void init_x8_pack_lh_config(void) {
 }
 
 const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   if (hardware_config == NULL) {
     return NULL;
   }
@@ -208,17 +241,21 @@ const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
 }
 
 static void init_x8_igemm_pack_lh_config(void) {
-  #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-  #if XNN_ENABLE_ARM_SME2
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
-      x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = (xnn_pack_lh_igemm_ukernel_fn) xnn_x8_pack_lh_ukernel__igemm_neonsme2;
-      x8_igemm_pack_lh_config.size_for_igemm_fn = (xnn_pack_lh_igemm_size_fn) xnn_x8_pack_lh_size__igemm_neonsme2;
-      x8_igemm_pack_lh_config.offset_for_igemm_fn = (xnn_pack_lh_igemm_offset_fn) xnn_x8_pack_lh_offset__igemm_neonsme2;
-    }
-  #endif  // XNN_ENABLE_ARM_SME2
-  #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+#if XNN_ENABLE_ARM_SME2
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
+  assert(hardware_config != NULL);
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
+    x8_igemm_pack_lh_config.pack_lh_for_igemm_fn =
+        (xnn_pack_lh_igemm_ukernel_fn)xnn_x8_pack_lh_ukernel__igemm_neonsme2;
+    x8_igemm_pack_lh_config.size_for_igemm_fn =
+        (xnn_pack_lh_igemm_size_fn)xnn_x8_pack_lh_size__igemm_neonsme2;
+    x8_igemm_pack_lh_config.offset_for_igemm_fn =
+        (xnn_pack_lh_igemm_offset_fn)xnn_x8_pack_lh_offset__igemm_neonsme2;
+  }
+#endif  // XNN_ENABLE_ARM_SME2
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
   x8_igemm_pack_lh_config.log2_input_element_size = 0;
   x8_igemm_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -232,4 +269,41 @@ const struct xnn_pack_lh_config* xnn_init_x8_igemm_pack_lh_config() {
   XNN_INIT_ONCE(x8_igemm_pack_lh);
   return &x8_igemm_pack_lh_config;
 }
-
+
+// Initializes x16_igemm_pack_lh_config. The IGEMM pack, size and offset
+// function pointers are assigned only when the build enables ARM64, KleidiAI
+// and SME2 and the hardware config reports xnn_arch_arm_sme2; otherwise they
+// keep the zero values from the static initializer.
+static void init_x16_igemm_pack_lh_config(void) {
+#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+#if XNN_ENABLE_ARM_SME2
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
+  assert(hardware_config != NULL);
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
+    x16_igemm_pack_lh_config.pack_lh_for_igemm_fn =
+        (xnn_pack_lh_igemm_ukernel_fn)xnn_x16_pack_lh_ukernel__igemm_neonsme2;
+    x16_igemm_pack_lh_config.size_for_igemm_fn =
+        (xnn_pack_lh_igemm_size_fn)xnn_x16_pack_lh_size__igemm_neonsme2;
+    x16_igemm_pack_lh_config.offset_for_igemm_fn =
+        (xnn_pack_lh_igemm_offset_fn)xnn_x16_pack_lh_offset__igemm_neonsme2;
+  }
+#endif  // XNN_ENABLE_ARM_SME2
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+  // Input and packed elements both use the half-precision log2 size.
+  x16_igemm_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_HALF;
+  x16_igemm_pack_lh_config.log2_packed_element_size = XNN_LOG2_SIZEOF_HALF;
+}
+
+// Returns a pointer to x16_igemm_pack_lh_config after invoking
+// XNN_INIT_ONCE(x16_igemm_pack_lh), or NULL when xnn_init_hardware_config()
+// returns NULL.
+const struct xnn_pack_lh_config* xnn_init_x16_igemm_pack_lh_config() {
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
+  if (hardware_config == NULL) {
+    return NULL;
+  }
+  XNN_INIT_ONCE(x16_igemm_pack_lh);
+  return &x16_igemm_pack_lh_config;
+}