24 commits
c69ccdb  Prototype: Add support for fp16 iGEMM with SME2  (gmiodice, Jul 8, 2025)
a3537a1  Include missing files  (gmiodice, Jul 9, 2025)
232826c  Update FP16 iGEMM based on review comments  (gmiodice, Sep 8, 2025)
03bccaa  Updated FP16 iGemm Review with Fixes  (JonathanC-ARM, Oct 7, 2025)
9cd6e88  Fix rebase issues  (JonathanC-ARM, Oct 20, 2025)
7eb618d  Added multiple_of to handle all multiples in reductions simply.  (Aelphy, Oct 20, 2025)
e5cb8c0  Changed K1_1 strategy for f32 to go with single accumulator and maxim…  (Aelphy, Oct 20, 2025)
aeeca5d  Remove threadpool library and just build threadpool.cc as part of sub…  (dsharletg, Oct 20, 2025)
7304027  Disable SME when msan is enabled  (dsharletg, Oct 20, 2025)
89a72e3  Don't bother disabling KleidiAI if using YNNPACK  (dsharletg, Oct 20, 2025)
0c5edfc  Disable SME on older Apple compilers  (dsharletg, Oct 20, 2025)
9b29972  Fix usage of `sv{ld,st}1_hor_vnum_za32`  (dsharletg, Oct 20, 2025)
0d3dc09  Fix correctness of dot benchmarks for transpose_a kernels  (dsharletg, Oct 20, 2025)
4b73eb1  Update `pthreadpool` dependency.  (gonnet, Oct 20, 2025)
66d084b  Fix flaky quantize tests  (dsharletg, Oct 21, 2025)
6fc5696  Add missing `gemm_config` `.element_size` initializations.  (qukhan, Oct 21, 2025)
923b7f9  Fix build issues and guard against sme2 specific path  (JonathanC-ARM, Oct 21, 2025)
22beb50  Merge remote-tracking branch 'origin/master' into f16_igemm  (JonathanC-ARM, Oct 22, 2025)
06a44d2  Refactor Convolution to new structure and fix build failures  (JonathanC-ARM, Oct 23, 2025)
175903d  Remove unused gemm config structure init  (JonathanC-ARM, Oct 25, 2025)
9efa3d6  Merge branch 'google:master' into f16_igemm  (JonathanC-ARM, Oct 29, 2025)
999f4e3  Updated code with sme variants of kernels and fixed tests  (JonathanC-ARM, Oct 29, 2025)
892eee1  Merge branch 'f16_igemm' of github.com:JonathanC-ARM/XNNPACK into f16…  (JonathanC-ARM, Oct 29, 2025)
a2bd7aa  Updated ifdef guards and yml file  (JonathanC-ARM, Oct 29, 2025)
1 change: 1 addition & 0 deletions build_srcs.bzl
@@ -284,6 +284,7 @@ MICROKERNEL_DEFS = [
"src/x64-transposec/x64-transposec.inc",
"src/x8-pack-lh/x8-pack-lh.inc",
"src/x8-pack-lh/x8-pack-lh-igemm.inc",
"src/x16-pack-lh/x16-pack-lh-igemm.inc",
"src/x8-packq/x8-packq.inc",
"src/x8-packw/x8-packw.inc",
"src/x8-transposec/x8-transposec.inc",
2 changes: 2 additions & 0 deletions cmake/gen/neonsme2_microkernels.cmake
@@ -14,13 +14,15 @@ SET(PROD_NEONSME2_MICROKERNEL_SRCS
src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c
src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c
src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme2.c
src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c
src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme2.c
src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-1x32c4-minmax-neonsme2.c
src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme2.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c
src/x16-pack-lh/x16-packlh-igemm-neonsme2.c
src/x8-pack-lh/x8-packlh-igemm-neonsme2.c
src/x8-pack-lh/x8-packlh-neonsme2.c
src/x16-pack-lh/x16-packlh-neonsme2.c)
2 changes: 2 additions & 0 deletions gen/neonsme2_microkernels.bzl
@@ -10,13 +10,15 @@ PROD_NEONSME2_MICROKERNEL_SRCS = [
"src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c",
"src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c",
"src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme2.c",
"src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c",
"src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme2.c",
"src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-1x32c4-minmax-neonsme2.c",
"src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme2.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c",
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c",
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c",
"src/x16-pack-lh/x16-packlh-igemm-neonsme2.c",
"src/x8-pack-lh/x8-packlh-igemm-neonsme2.c",
"src/x8-pack-lh/x8-packlh-neonsme2.c",
"src/x16-pack-lh/x16-packlh-neonsme2.c",
40 changes: 40 additions & 0 deletions include/xnnpack.h
@@ -3049,6 +3049,46 @@ enum xnn_status xnn_create_convolution2d_nhwc_f16(
xnn_weights_cache_t weights_cache,
xnn_operator_t* convolution_op_out);

enum xnn_status xnn_create_convolution2d_nhwc_pf16(
uint32_t input_padding_top,
uint32_t input_padding_right,
uint32_t input_padding_bottom,
uint32_t input_padding_left,
uint32_t kernel_height,
uint32_t kernel_width,
uint32_t subsampling_height,
uint32_t subsampling_width,
uint32_t dilation_height,
uint32_t dilation_width,
uint32_t groups,
size_t group_input_channels,
size_t group_output_channels,
size_t input_channel_stride,
size_t output_channel_stride,
const void* kernel,
const void* bias,
float output_min,
float output_max,
uint32_t flags,
xnn_weights_cache_t weights_cache,
xnn_operator_t* convolution_op_out);

enum xnn_status xnn_reshape_convolution2d_nhwc_pf16(
xnn_operator_t convolution_op,
size_t batch_size,
size_t input_height,
size_t input_width,
size_t* workspace_size,
size_t* output_height_out,
size_t* output_width_out,
pthreadpool_t threadpool);

enum xnn_status xnn_setup_convolution2d_nhwc_pf16(
xnn_operator_t convolution_op,
void* workspace,
const void* input,
void* output);

enum xnn_status xnn_reshape_convolution2d_nhwc_f16(
xnn_operator_t convolution_op,
size_t batch_size,
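The three new entry points follow the same create/reshape/setup flow as the existing `xnn_*_convolution2d_nhwc_f16` operators, with `reshape` additionally reporting the workspace needed for the packed (pf16) left-hand side. A minimal usage sketch; the shapes, padding, and channel counts are illustrative only, and error checking, workspace alignment, and the SME2 availability check are omitted:

```c
#include <math.h>    // INFINITY
#include <stdlib.h>  // malloc, free
#include <xnnpack.h>

// Hypothetical 3x3, stride-1, same-padding fp16 convolution on a 1x56x56x64 input.
void run_pf16_conv(const void* kernel, const void* bias, const void* input,
                   void* output, pthreadpool_t threadpool) {
  xnn_initialize(/*allocator=*/NULL);

  xnn_operator_t conv = NULL;
  xnn_create_convolution2d_nhwc_pf16(
      /*input_padding_top=*/1, /*input_padding_right=*/1,
      /*input_padding_bottom=*/1, /*input_padding_left=*/1,
      /*kernel_height=*/3, /*kernel_width=*/3,
      /*subsampling_height=*/1, /*subsampling_width=*/1,
      /*dilation_height=*/1, /*dilation_width=*/1,
      /*groups=*/1, /*group_input_channels=*/64, /*group_output_channels=*/64,
      /*input_channel_stride=*/64, /*output_channel_stride=*/64,
      kernel, bias, /*output_min=*/-INFINITY, /*output_max=*/INFINITY,
      /*flags=*/0, /*weights_cache=*/NULL, &conv);

  // Reshape computes the output size and the scratch space for the packed LHS.
  size_t workspace_size = 0;
  size_t output_height = 0, output_width = 0;
  xnn_reshape_convolution2d_nhwc_pf16(conv, /*batch_size=*/1,
                                      /*input_height=*/56, /*input_width=*/56,
                                      &workspace_size, &output_height,
                                      &output_width, threadpool);

  void* workspace = malloc(workspace_size);  // real code should over-align this
  xnn_setup_convolution2d_nhwc_pf16(conv, workspace, input, output);
  xnn_run_operator(conv, threadpool);

  xnn_delete_operator(conv);
  free(workspace);
}
```

On hardware without SME2 the pf16 create call is expected to fail, so callers would normally fall back to `xnn_create_convolution2d_nhwc_f16`.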
1 change: 1 addition & 0 deletions scripts/generate-tests.sh
@@ -49,6 +49,7 @@ tools/generate-gemm-test.py --spec test/qs8-qc4w-gemm-minmax-fp32.yaml --output-
tools/generate-gemm-test.py --spec test/qs8-qc8w-gemm-minmax-fp32.yaml --output-test test/qs8-qc8w-gemm-minmax-fp32.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-2.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-3.cc --output-bench bench/qs8-qc8w-gemm-fp32.cc &

### Tests for IGEMM micro-kernels
tools/generate-gemm-test.py --spec test/pf16-f16-igemm-minmax.yaml --output-test test/pf16-f16-igemm-minmax.cc &
tools/generate-gemm-test.py --spec test/f16-igemm-minmax.yaml --output-test test/f16-igemm-minmax.cc &
tools/generate-gemm-test.py --spec test/f16-f32acc-igemm-minmax.yaml --output-test test/f16-f32acc-igemm-minmax.cc &

9 changes: 9 additions & 0 deletions src/configs/gemm-config.c
@@ -333,9 +333,17 @@ static void init_pf16_gemm_config(void) {
pf16_gemm_config.arch = xnn_arch_arm_sme2;
pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_1x32c2__neonsme2);
pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2);
pf16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(mr)] =
xnn_init_hmp_packed_igemm_ukernel(
(xnn_packed_lhs_igemm_ukernel_fn)
xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2);
pf16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
pf16_gemm_config.pack_weights_and_biases = xnn_pack_kai_f16_weights_and_biases;
pf16_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f16_weights_and_biases;
pf16_gemm_config.pack_igemm_goki =
(xnn_pack_conv_goki_w_fn)xnn_pack_kai_f16_conv_goki_w_sme2;
pf16_gemm_config.pack_igemm_kgo =
(xnn_pack_conv_kgo_w_fn)xnn_pack_f16_conv_kgo_w;
pf16_gemm_config.mr = mr;
pf16_gemm_config.mr_packed = mr;
pf16_gemm_config.nr = nr;
@@ -5586,6 +5594,7 @@ const struct xnn_gemm_config* xnn_init_pf16_gemm_config() {
return NULL;
}
XNN_INIT_ONCE(pf16_gemm);

return pf16_gemm_config.mr ? &pf16_gemm_config : NULL;
}

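The accessor keeps the existing init-once pattern and reports availability through `mr`: `xnn_init_pf16_gemm_config()` hands back the config only when an SME2 kernel was registered. A hedged sketch of how an internal caller would gate on the new packed-LHS IGEMM path (internal API; names taken from the diff above):

```c
// Sketch only: the result is NULL unless init succeeded and mr was set,
// i.e. the SME2 kernels registered above are actually usable.
const struct xnn_gemm_config* pf16 = xnn_init_pf16_gemm_config();
if (pf16 != NULL) {
  // The packed-LHS fp16 IGEMM ukernel added above lives at
  // XNN_MR_TO_INDEX(pf16->mr); mr_packed equals mr for the 32x32c2 SME2 tile.
}
```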
123 changes: 89 additions & 34 deletions src/configs/pack-lh-config.c
@@ -20,21 +20,27 @@ static struct xnn_pack_lh_config x8_pack_lh_config = {0};
static struct xnn_pack_lh_config x16_pack_lh_config = {0};
static struct xnn_pack_lh_config x32_pack_lh_config = {0};
static struct xnn_pack_lh_config x8_igemm_pack_lh_config = {0};
static struct xnn_pack_lh_config x16_igemm_pack_lh_config = {0};

XNN_INIT_ONCE_GUARD(qp8_pack_lh);
XNN_INIT_ONCE_GUARD(x8_pack_lh);
XNN_INIT_ONCE_GUARD(x16_pack_lh);
XNN_INIT_ONCE_GUARD(x32_pack_lh);
XNN_INIT_ONCE_GUARD(x8_igemm_pack_lh);
XNN_INIT_ONCE_GUARD(x16_igemm_pack_lh);

static void init_qp8_pack_lh_config(void) {
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2;
qp8_pack_lh_config.pack_lh_fn =
(xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2;
#else
qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1;
qp8_pack_lh_config.pack_lh_fn =
(xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1;
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
qp8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size;
qp8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset;
qp8_pack_lh_config.size_fn =
(xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size;
qp8_pack_lh_config.offset_fn =
(xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset;
qp8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
qp8_pack_lh_config.log2_packed_element_size = 0;
}
@@ -51,13 +57,17 @@ const struct xnn_pack_lh_config* xnn_init_qp8_pack_lh_config() {

static void init_x32_pack_lh_config(void) {
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
#if XNN_ENABLE_ARM_SME2
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_arm_sme) {
x32_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x32_pack_lh_ukernel__neonsme;
x32_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x32_pack_lh_size__neonsme;
x32_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x32_pack_lh_offset__neonsme;
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
x32_pack_lh_config.pack_lh_fn =
(xnn_pack_lh_ukernel_fn)xnn_x32_pack_lh_ukernel__neonsme;
x32_pack_lh_config.size_fn =
(xnn_pack_lh_size_fn)xnn_x32_pack_lh_size__neonsme;
x32_pack_lh_config.offset_fn =
(xnn_pack_lh_offset_fn)xnn_x32_pack_lh_offset__neonsme;
}
#endif // XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
Expand All @@ -67,7 +77,8 @@ static void init_x32_pack_lh_config(void) {
}

const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() {
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
if (hardware_config == NULL) {
return NULL;
}
@@ -78,12 +89,16 @@ const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() {
static void init_x16_pack_lh_config(void) {
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ENABLE_ARM_SME2
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_arm_sme2) {
x16_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x16_pack_lh_ukernel__neonsme2;
x16_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x16_pack_lh_size__neonsme2;
x16_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x16_pack_lh_offset__neonsme2;
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
x16_pack_lh_config.pack_lh_fn =
(xnn_pack_lh_ukernel_fn)xnn_x16_pack_lh_ukernel__neonsme2;
x16_pack_lh_config.size_fn =
(xnn_pack_lh_size_fn)xnn_x16_pack_lh_size__neonsme2;
x16_pack_lh_config.offset_fn =
(xnn_pack_lh_offset_fn)xnn_x16_pack_lh_offset__neonsme2;
}
#endif // XNN_ENABLE_ARM_SME2
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -93,7 +108,8 @@ static void init_x16_pack_lh_config(void) {
}

const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
if (hardware_config == NULL) {
return NULL;
}
@@ -104,12 +120,16 @@ const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
static void init_x8_pack_lh_config(void) {
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ENABLE_ARM_SME2
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_arm_sme2) {
x8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x8_pack_lh_ukernel__neonsme2;
x8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x8_pack_lh_size__neonsme2;
x8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x8_pack_lh_offset__neonsme2;
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
x8_pack_lh_config.pack_lh_fn =
(xnn_pack_lh_ukernel_fn)xnn_x8_pack_lh_ukernel__neonsme2;
x8_pack_lh_config.size_fn =
(xnn_pack_lh_size_fn)xnn_x8_pack_lh_size__neonsme2;
x8_pack_lh_config.offset_fn =
(xnn_pack_lh_offset_fn)xnn_x8_pack_lh_offset__neonsme2;
}
#endif // XNN_ENABLE_ARM_SME2
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -119,7 +139,8 @@ static void init_x8_pack_lh_config(void) {
}

const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
if (hardware_config == NULL) {
return NULL;
}
@@ -128,17 +149,21 @@ const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
}

static void init_x8_igemm_pack_lh_config(void) {
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ENABLE_ARM_SME2
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->arch_flags & xnn_arch_arm_sme2) {
x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = (xnn_pack_lh_igemm_ukernel_fn) xnn_x8_pack_lh_ukernel__igemm_neonsme2;
x8_igemm_pack_lh_config.size_for_igemm_fn = (xnn_pack_lh_igemm_size_fn) xnn_x8_pack_lh_size__igemm_neonsme2;
x8_igemm_pack_lh_config.offset_for_igemm_fn = (xnn_pack_lh_igemm_offset_fn) xnn_x8_pack_lh_offset__igemm_neonsme2;
}
#endif // XNN_ENABLE_ARM_SME2
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ENABLE_ARM_SME2
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
assert(hardware_config != NULL);
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
x8_igemm_pack_lh_config.pack_lh_for_igemm_fn =
(xnn_pack_lh_igemm_ukernel_fn)xnn_x8_pack_lh_ukernel__igemm_neonsme2;
x8_igemm_pack_lh_config.size_for_igemm_fn =
(xnn_pack_lh_igemm_size_fn)xnn_x8_pack_lh_size__igemm_neonsme2;
x8_igemm_pack_lh_config.offset_for_igemm_fn =
(xnn_pack_lh_igemm_offset_fn)xnn_x8_pack_lh_offset__igemm_neonsme2;
}
#endif // XNN_ENABLE_ARM_SME2
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
x8_igemm_pack_lh_config.log2_input_element_size = 0;
x8_igemm_pack_lh_config.log2_packed_element_size = 0;
}
@@ -152,3 +177,33 @@ const struct xnn_pack_lh_config* xnn_init_x8_igemm_pack_lh_config() {
XNN_INIT_ONCE(x8_igemm_pack_lh);
return &x8_igemm_pack_lh_config;
}

static void init_x16_igemm_pack_lh_config(void) {
#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
#if XNN_ENABLE_ARM_SME2
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
assert(hardware_config != NULL);
if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
x16_igemm_pack_lh_config.pack_lh_for_igemm_fn =
(xnn_pack_lh_igemm_ukernel_fn)xnn_x16_pack_lh_ukernel__igemm_neonsme2;
x16_igemm_pack_lh_config.size_for_igemm_fn =
(xnn_pack_lh_igemm_size_fn)xnn_x16_pack_lh_size__igemm_neonsme2;
x16_igemm_pack_lh_config.offset_for_igemm_fn =
(xnn_pack_lh_igemm_offset_fn)xnn_x16_pack_lh_offset__igemm_neonsme2;
}
#endif // XNN_ENABLE_ARM_SME2
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
x16_igemm_pack_lh_config.log2_input_element_size = 1;
x16_igemm_pack_lh_config.log2_packed_element_size = 1;
}

const struct xnn_pack_lh_config* xnn_init_x16_igemm_pack_lh_config() {
const struct xnn_hardware_config* hardware_config =
xnn_init_hardware_config();
if (hardware_config == NULL) {
return NULL;
}
XNN_INIT_ONCE(x16_igemm_pack_lh);
return &x16_igemm_pack_lh_config;
}
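
The new x16 config mirrors the x8 IGEMM pack-lh config above, with `log2_input_element_size` and `log2_packed_element_size` set to 1 for 2-byte (fp16) elements. A small sketch of how a caller would probe it (internal API; field names as declared in this file):

```c
// Sketch only: the accessor returns NULL only if hardware detection fails;
// the kernel pointers stay NULL when SME2 (or KleidiAI) is not enabled.
const struct xnn_pack_lh_config* cfg = xnn_init_x16_igemm_pack_lh_config();
if (cfg != NULL && cfg->pack_lh_for_igemm_fn != NULL) {
  // Packed-LHS fp16 IGEMM path is available; each input element is
  // (1 << cfg->log2_input_element_size) == 2 bytes.
}
```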