google
diff --git a/‎build_srcs.bzl‎
Lines changed: 1 addition & 0 deletions b/‎build_srcs.bzl‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cmake/gen/neonsme2_microkernels.cmake‎
Lines changed: 2 additions & 0 deletions b/‎cmake/gen/neonsme2_microkernels.cmake‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎gen/neonsme2_microkernels.bzl‎
Lines changed: 2 additions & 0 deletions b/‎gen/neonsme2_microkernels.bzl‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎scripts/generate-tests.sh‎
Lines changed: 1 addition & 0 deletions b/‎scripts/generate-tests.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/configs/gemm-config.c‎
Lines changed: 9 additions & 0 deletions b/‎src/configs/gemm-config.c‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/configs/pack-lh-config.c‎
Lines changed: 89 additions & 34 deletions b/‎src/configs/pack-lh-config.c‎
Lines changed: 89 additions & 34 deletions
diff --git a/‎src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c‎
Lines changed: 58 additions & 0 deletions b/‎src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c‎
Lines changed: 58 additions & 0 deletions
@@ -284,6 +284,7 @@ MICROKERNEL_DEFS = [
     "src/x64-transposec/x64-transposec.inc",
     "src/x8-pack-lh/x8-pack-lh.inc",
     "src/x8-pack-lh/x8-pack-lh-igemm.inc",
+    "src/x16-pack-lh/x16-pack-lh-igemm.inc",
     "src/x8-packq/x8-packq.inc",
     "src/x8-packw/x8-packw.inc",
     "src/x8-transposec/x8-transposec.inc",
 
@@ -14,13 +14,15 @@ SET(PROD_NEONSME2_MICROKERNEL_SRCS
   src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c
   src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c
   src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme2.c
+  src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c
   src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme2.c
   src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-1x32c4-minmax-neonsme2.c
   src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme2.c
   src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c
   src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c
   src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c
   src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c
+  src/x16-pack-lh/x16-packlh-igemm-neonsme2.c
   src/x8-pack-lh/x8-packlh-igemm-neonsme2.c
   src/x8-pack-lh/x8-packlh-neonsme2.c
   src/x16-pack-lh/x16-packlh-neonsme2.c)
 
@@ -10,13 +10,15 @@ PROD_NEONSME2_MICROKERNEL_SRCS = [
     "src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c",
     "src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c",
     "src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme2.c",
+    "src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c",
     "src/pqs8-f32-qc8w-igemm/pqs8-f32-qc8w-igemm-32x32c4-minmax-neonsme2.c",
     "src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-1x32c4-minmax-neonsme2.c",
     "src/pqs8-qc8w-gemm/pqs8-qc8w-gemm-32x32c4-minmax-neonsme2.c",
     "src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x64c4-neonsme2.c",
     "src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-16x64c4-neonsme2.c",
     "src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x64c4-neonsme2.c",
     "src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c",
+    "src/x16-pack-lh/x16-packlh-igemm-neonsme2.c",
     "src/x8-pack-lh/x8-packlh-igemm-neonsme2.c",
     "src/x8-pack-lh/x8-packlh-neonsme2.c",
     "src/x16-pack-lh/x16-packlh-neonsme2.c",
 
@@ -49,6 +49,7 @@ tools/generate-gemm-test.py --spec test/qs8-qc4w-gemm-minmax-fp32.yaml --output-
 tools/generate-gemm-test.py --spec test/qs8-qc8w-gemm-minmax-fp32.yaml --output-test test/qs8-qc8w-gemm-minmax-fp32.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-2.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-3.cc --output-bench bench/qs8-qc8w-gemm-fp32.cc &
 
 ### Tests for IGEMM micro-kernels
+tools/generate-gemm-test.py --spec test/pf16-f16-igemm-minmax.yaml --output-test test/pf16-f16-igemm-minmax.cc &
 tools/generate-gemm-test.py --spec test/f16-igemm-minmax.yaml --output-test test/f16-igemm-minmax.cc &
 tools/generate-gemm-test.py --spec test/f16-f32acc-igemm-minmax.yaml --output-test test/f16-f32acc-igemm-minmax.cc &
 
 
@@ -333,9 +333,17 @@ static void init_pf16_gemm_config(void) {
       pf16_gemm_config.arch = xnn_arch_arm_sme2;
       pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_1x32c2__neonsme2);
       pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2);
+      pf16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(mr)] =
+        xnn_init_hmp_packed_igemm_ukernel(
+            (xnn_packed_lhs_igemm_ukernel_fn)
+                xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2);
       pf16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params;
       pf16_gemm_config.pack_weights_and_biases = xnn_pack_kai_f16_weights_and_biases;
       pf16_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f16_weights_and_biases;
+      pf16_gemm_config.pack_igemm_goki =
+        (xnn_pack_conv_goki_w_fn)xnn_pack_kai_f16_conv_goki_w_sme2;
+      pf16_gemm_config.pack_igemm_kgo =
+        (xnn_pack_conv_kgo_w_fn)xnn_pack_f16_conv_kgo_w;
       pf16_gemm_config.mr = mr;
       pf16_gemm_config.mr_packed = mr;
       pf16_gemm_config.nr = nr;
@@ -5586,6 +5594,7 @@ const struct xnn_gemm_config* xnn_init_pf16_gemm_config() {
     return NULL;
   }
   XNN_INIT_ONCE(pf16_gemm);
+
   return pf16_gemm_config.mr ? &pf16_gemm_config : NULL;
 }
 
 
@@ -20,21 +20,27 @@ static struct xnn_pack_lh_config x8_pack_lh_config = {0};
 static struct xnn_pack_lh_config x16_pack_lh_config = {0};
 static struct xnn_pack_lh_config x32_pack_lh_config = {0};
 static struct xnn_pack_lh_config x8_igemm_pack_lh_config = {0};
+static struct xnn_pack_lh_config x16_igemm_pack_lh_config = {0};
 
 XNN_INIT_ONCE_GUARD(qp8_pack_lh);
 XNN_INIT_ONCE_GUARD(x8_pack_lh);
 XNN_INIT_ONCE_GUARD(x16_pack_lh);
 XNN_INIT_ONCE_GUARD(x32_pack_lh);
 XNN_INIT_ONCE_GUARD(x8_igemm_pack_lh);
+XNN_INIT_ONCE_GUARD(x16_igemm_pack_lh);
 
+// Populates qp8_pack_lh_config. The packing kernel is the aarch64 NEON variant
+// when built with XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI and the scalar variant
+// otherwise; the size/offset helpers and element-size fields are set
+// unconditionally.
 static void init_qp8_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-  qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2;
+  qp8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2;
 #else
-  qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1;
+  qp8_pack_lh_config.pack_lh_fn =
+      (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1;
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-  qp8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size;
-  qp8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset;
+  qp8_pack_lh_config.size_fn =
+      (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size;
+  qp8_pack_lh_config.offset_fn =
+      (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset;
+  // Input elements use the float log2 size; packed elements use log2 size 0.
   qp8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT;
   qp8_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -51,13 +57,17 @@ const struct xnn_pack_lh_config* xnn_init_qp8_pack_lh_config() {
 
 static void init_x32_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-#if XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+#if XNN_ENABLE_ARM_SME2
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   assert(hardware_config != NULL);
-  if (hardware_config->arch_flags & xnn_arch_arm_sme) {
-    x32_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x32_pack_lh_ukernel__neonsme;
-    x32_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x32_pack_lh_size__neonsme;
-    x32_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x32_pack_lh_offset__neonsme;
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
+    x32_pack_lh_config.pack_lh_fn =
+        (xnn_pack_lh_ukernel_fn)xnn_x32_pack_lh_ukernel__neonsme;
+    x32_pack_lh_config.size_fn =
+        (xnn_pack_lh_size_fn)xnn_x32_pack_lh_size__neonsme;
+    x32_pack_lh_config.offset_fn =
+        (xnn_pack_lh_offset_fn)xnn_x32_pack_lh_offset__neonsme;
   }
-#endif  // XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
+#endif  // XNN_ENABLE_ARM_SME2
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -67,7 +77,8 @@ static void init_x32_pack_lh_config(void) {
 }
 
 const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   if (hardware_config == NULL) {
     return NULL;
   }
@@ -78,12 +89,16 @@ const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() {
 static void init_x16_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
 #if XNN_ENABLE_ARM_SME2
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   assert(hardware_config != NULL);
-  if (hardware_config->arch_flags & xnn_arch_arm_sme2) {
-    x16_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x16_pack_lh_ukernel__neonsme2;
-    x16_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x16_pack_lh_size__neonsme2;
-    x16_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x16_pack_lh_offset__neonsme2;
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
+    x16_pack_lh_config.pack_lh_fn =
+        (xnn_pack_lh_ukernel_fn)xnn_x16_pack_lh_ukernel__neonsme2;
+    x16_pack_lh_config.size_fn =
+        (xnn_pack_lh_size_fn)xnn_x16_pack_lh_size__neonsme2;
+    x16_pack_lh_config.offset_fn =
+        (xnn_pack_lh_offset_fn)xnn_x16_pack_lh_offset__neonsme2;
   }
 #endif  // XNN_ENABLE_ARM_SME2
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -93,7 +108,8 @@ static void init_x16_pack_lh_config(void) {
 }
 
 const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   if (hardware_config == NULL) {
     return NULL;
   }
@@ -104,12 +120,16 @@ const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() {
 static void init_x8_pack_lh_config(void) {
 #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
 #if XNN_ENABLE_ARM_SME2
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   assert(hardware_config != NULL);
-  if (hardware_config->arch_flags & xnn_arch_arm_sme2) {
-    x8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x8_pack_lh_ukernel__neonsme2;
-    x8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x8_pack_lh_size__neonsme2;
-    x8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x8_pack_lh_offset__neonsme2;
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
+    x8_pack_lh_config.pack_lh_fn =
+        (xnn_pack_lh_ukernel_fn)xnn_x8_pack_lh_ukernel__neonsme2;
+    x8_pack_lh_config.size_fn =
+        (xnn_pack_lh_size_fn)xnn_x8_pack_lh_size__neonsme2;
+    x8_pack_lh_config.offset_fn =
+        (xnn_pack_lh_offset_fn)xnn_x8_pack_lh_offset__neonsme2;
   }
 #endif  // XNN_ENABLE_ARM_SME2
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
@@ -119,7 +139,8 @@ static void init_x8_pack_lh_config(void) {
 }
 
 const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
   if (hardware_config == NULL) {
     return NULL;
   }
@@ -128,17 +149,21 @@ const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() {
 }
 
+// Populates x8_igemm_pack_lh_config. The IGEMM pack/size/offset callbacks are
+// installed only in ARM64 + KleidiAI + SME2 builds, and only when the hardware
+// config reports the xnn_arch_arm_sme2 flag; the element-size fields are
+// always written.
 static void init_x8_igemm_pack_lh_config(void) {
-  #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
-  #if XNN_ENABLE_ARM_SME2
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->arch_flags & xnn_arch_arm_sme2) {
-      x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = (xnn_pack_lh_igemm_ukernel_fn) xnn_x8_pack_lh_ukernel__igemm_neonsme2;
-      x8_igemm_pack_lh_config.size_for_igemm_fn = (xnn_pack_lh_igemm_size_fn) xnn_x8_pack_lh_size__igemm_neonsme2;
-      x8_igemm_pack_lh_config.offset_for_igemm_fn = (xnn_pack_lh_igemm_offset_fn) xnn_x8_pack_lh_offset__igemm_neonsme2;
-    }
-  #endif  // XNN_ENABLE_ARM_SME2
-  #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+#if XNN_ENABLE_ARM_SME2
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
+  assert(hardware_config != NULL);
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) {
+    x8_igemm_pack_lh_config.pack_lh_for_igemm_fn =
+        (xnn_pack_lh_igemm_ukernel_fn)xnn_x8_pack_lh_ukernel__igemm_neonsme2;
+    x8_igemm_pack_lh_config.size_for_igemm_fn =
+        (xnn_pack_lh_igemm_size_fn)xnn_x8_pack_lh_size__igemm_neonsme2;
+    x8_igemm_pack_lh_config.offset_for_igemm_fn =
+        (xnn_pack_lh_igemm_offset_fn)xnn_x8_pack_lh_offset__igemm_neonsme2;
+  }
+#endif  // XNN_ENABLE_ARM_SME2
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI
+  // log2 element size 0 on both the input and the packed side.
   x8_igemm_pack_lh_config.log2_input_element_size = 0;
   x8_igemm_pack_lh_config.log2_packed_element_size = 0;
 }
@@ -152,3 +177,33 @@ const struct xnn_pack_lh_config* xnn_init_x8_igemm_pack_lh_config() {
   XNN_INIT_ONCE(x8_igemm_pack_lh);
   return &x8_igemm_pack_lh_config;
 }
+
+// Populates x16_igemm_pack_lh_config. The element-size fields are always
+// written; the IGEMM pack/size/offset callbacks are installed only in
+// ARM64 + KleidiAI + SME2 builds, and only when the hardware config reports
+// the xnn_arch_arm_sme2 flag.
+static void init_x16_igemm_pack_lh_config(void) {
+  // log2 element size 1 on both the input and the packed side.
+  x16_igemm_pack_lh_config.log2_input_element_size = 1;
+  x16_igemm_pack_lh_config.log2_packed_element_size = 1;
+#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI && XNN_ENABLE_ARM_SME2
+  const struct xnn_hardware_config* hardware_config =
+      xnn_init_hardware_config();
+  assert(hardware_config != NULL);
+  if ((hardware_config->arch_flags & xnn_arch_arm_sme2) == 0) {
+    // No SME2 reported: leave the callbacks unset.
+    return;
+  }
+  x16_igemm_pack_lh_config.pack_lh_for_igemm_fn =
+      (xnn_pack_lh_igemm_ukernel_fn)xnn_x16_pack_lh_ukernel__igemm_neonsme2;
+  x16_igemm_pack_lh_config.size_for_igemm_fn =
+      (xnn_pack_lh_igemm_size_fn)xnn_x16_pack_lh_size__igemm_neonsme2;
+  x16_igemm_pack_lh_config.offset_for_igemm_fn =
+      (xnn_pack_lh_igemm_offset_fn)xnn_x16_pack_lh_offset__igemm_neonsme2;
+#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI && XNN_ENABLE_ARM_SME2
+}
+
+// Returns a pointer to x16_igemm_pack_lh_config after running its
+// XNN_INIT_ONCE initialization, or NULL when xnn_init_hardware_config()
+// returns NULL.
+const struct xnn_pack_lh_config* xnn_init_x16_igemm_pack_lh_config() {
+  if (xnn_init_hardware_config() == NULL) {
+    return NULL;
+  }
+  XNN_INIT_ONCE(x16_igemm_pack_lh);
+  return &x16_igemm_pack_lh_config;
+}
@@ -0,0 +1,58 @@
+// Copyright 2025 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#include "src/xnnpack/microparams.h"
+
+#if XNN_ENABLE_KLEIDIAI
+#include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
+#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
+#endif  // XNN_ENABLE_KLEIDIAI
+
+// Returns the MR value reported by the wrapped KleidiAI kernel. In builds
+// without XNN_ENABLE_KLEIDIAI this asserts and returns 0.
+size_t xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_mr(void) {
+#if !XNN_ENABLE_KLEIDIAI
+  assert("Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+         "`XNN_ENABLE_KLEIDIAI`." && 0);
+  return 0;
+#else
+  return kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+#endif  // !XNN_ENABLE_KLEIDIAI
+}
+
+// Returns the NR value reported by the wrapped KleidiAI kernel. In builds
+// without XNN_ENABLE_KLEIDIAI this asserts and returns 0.
+size_t xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_nr(void) {
+#if !XNN_ENABLE_KLEIDIAI
+  assert("Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+         "`XNN_ENABLE_KLEIDIAI`." && 0);
+  return 0;
+#else
+  return kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+#endif  // !XNN_ENABLE_KLEIDIAI
+}
+
+// IGEMM min/max micro-kernel wrapper that forwards to the KleidiAI
+// matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa kernel.
+//
+// The k argument handed to KleidiAI is ks * round_up(kc, 2); the clamp bounds
+// are params->scalar.min/max passed through xnn_float16_to_float.
+// NOTE(review): cm_stride is multiplied by sizeof(xnn_float16) before being
+// forwarded, i.e. it is treated as an element count here - confirm that this
+// matches the units callers pass.
+// In builds without XNN_ENABLE_KLEIDIAI this function only asserts.
+void xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2(
+    size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs,
+    const void* restrict w, xnn_float16* restrict c, size_t cm_stride,
+    const struct xnn_f16_minmax_params* params) {
+#if !XNN_ENABLE_KLEIDIAI
+  assert("Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+         "`XNN_ENABLE_KLEIDIAI`." && 0);
+#else
+  // kc is passed through round_up with a factor of 2 before scaling by ks.
+  const size_t kr = 2;
+  const size_t padded_k = ks * round_up(kc, kr);
+  const size_t scaled_stride = cm_stride * sizeof(xnn_float16);
+
+  kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa(
+      mr, nc, padded_k, packed_lhs, w, c, scaled_stride, sizeof(xnn_float16),
+      xnn_float16_to_float(params->scalar.min),
+      xnn_float16_to_float(params->scalar.max));
+#endif  // !XNN_ENABLE_KLEIDIAI
+}