diff --git a/bench/gemm-benchmark.cc b/bench/gemm-benchmark.cc index 8c2f7383c53..c746e7c653e 100644 --- a/bench/gemm-benchmark.cc +++ b/bench/gemm-benchmark.cc @@ -1103,7 +1103,7 @@ void GEMMBenchmark(benchmark::State& state, const uint32_t mb = min(mc - m, mr); gemm(mb, nc, kc * sizeof(xnn_float16), input_packed.data() + - xnn_x16_pack_lh_offset__neonsme2(m, kc, mr_packed, kr, sr), + xnn_x16_pack_lh_offset__neonsme(m, kc, mr_packed, kr, sr), w.data() + packed_w_size * buffer_index, &c[c_elements * buffer_index], nc * sizeof(xnn_float16), sizeof(xnn_float16), &minmax_params); diff --git a/build_srcs.bzl b/build_srcs.bzl index ea6b571f37f..d311af86994 100644 --- a/build_srcs.bzl +++ b/build_srcs.bzl @@ -284,6 +284,7 @@ MICROKERNEL_DEFS = [ "src/x64-transposec/x64-transposec.inc", "src/x8-pack-lh/x8-pack-lh.inc", "src/x8-pack-lh/x8-pack-lh-igemm.inc", + "src/x16-pack-lh/x16-pack-lh-igemm.inc", "src/x8-packq/x8-packq.inc", "src/x8-packw/x8-packw.inc", "src/x8-transposec/x8-transposec.inc", diff --git a/cmake/gen/neonsme2_microkernels.cmake b/cmake/gen/neonsme2_microkernels.cmake index 4b5414b2d8c..20e91df6d46 100644 --- a/cmake/gen/neonsme2_microkernels.cmake +++ b/cmake/gen/neonsme2_microkernels.cmake @@ -10,6 +10,7 @@ SET(PROD_NEONSME2_MICROKERNEL_SRCS + src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c src/pf16-gemm/pf16-gemm-1x32c2-minmax-neonsme2.c src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c @@ -23,7 +24,9 @@ SET(PROD_NEONSME2_MICROKERNEL_SRCS src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c src/x8-pack-lh/x8-packlh-igemm-neonsme2.c src/x8-pack-lh/x8-packlh-neonsme2.c - src/x16-pack-lh/x16-packlh-neonsme2.c) + src/x16-pack-lh/x16-packlh-igemm-neonsme2.c + src/x16-pack-lh/x16-packlh-neonsme2.c + src/x32-pack-lh/x32-packlh-neonsme2.c) SET(NON_PROD_NEONSME2_MICROKERNEL_SRCS) diff --git a/cmake/gen/neonsme_microkernels.cmake b/cmake/gen/neonsme_microkernels.cmake index 1057a1bf071..7847cf815d0 100644 --- a/cmake/gen/neonsme_microkernels.cmake +++ b/cmake/gen/neonsme_microkernels.cmake @@ -12,8 +12,11 @@ SET(PROD_NEONSME_MICROKERNEL_SRCS src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme.c src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme.c + src/x16-pack-lh/x16-packlh-igemm-neonsme.c + src/x16-pack-lh/x16-packlh-neonsme.c src/x32-pack-lh/x32-packlh-neonsme.c) -SET(NON_PROD_NEONSME_MICROKERNEL_SRCS) +SET(NON_PROD_NEONSME_MICROKERNEL_SRCS + src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme.c) SET(ALL_NEONSME_MICROKERNEL_SRCS ${PROD_NEONSME_MICROKERNEL_SRCS} + ${NON_PROD_NEONSME_MICROKERNEL_SRCS}) diff --git a/gen/neonsme2_microkernels.bzl b/gen/neonsme2_microkernels.bzl index 3e7b533c1a7..7bffdbb4d0b 100644 --- a/gen/neonsme2_microkernels.bzl +++ b/gen/neonsme2_microkernels.bzl @@ -6,6 +6,7 @@ Auto-generated file. Do not edit! 
""" PROD_NEONSME2_MICROKERNEL_SRCS = [ + "src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c", "src/pf16-gemm/pf16-gemm-1x32c2-minmax-neonsme2.c", "src/pf16-gemm/pf16-gemm-32x32c2-minmax-neonsme2.c", "src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme2.c", @@ -19,7 +20,9 @@ PROD_NEONSME2_MICROKERNEL_SRCS = [ "src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x64c4-neonsme2.c", "src/x8-pack-lh/x8-packlh-igemm-neonsme2.c", "src/x8-pack-lh/x8-packlh-neonsme2.c", + "src/x16-pack-lh/x16-packlh-igemm-neonsme2.c", "src/x16-pack-lh/x16-packlh-neonsme2.c", + "src/x32-pack-lh/x32-packlh-neonsme2.c", ] NON_PROD_NEONSME2_MICROKERNEL_SRCS = [ diff --git a/gen/neonsme_microkernels.bzl b/gen/neonsme_microkernels.bzl index 998e7f3cf06..4a2c492c880 100644 --- a/gen/neonsme_microkernels.bzl +++ b/gen/neonsme_microkernels.bzl @@ -8,10 +8,13 @@ Auto-generated file. Do not edit! PROD_NEONSME_MICROKERNEL_SRCS = [ "src/pf32-gemm/pf32-gemm-1x32-minmax-neonsme.c", "src/pf32-gemm/pf32-gemm-32x32-minmax-neonsme.c", + "src/x16-pack-lh/x16-packlh-igemm-neonsme.c", + "src/x16-pack-lh/x16-packlh-neonsme.c", "src/x32-pack-lh/x32-packlh-neonsme.c", ] NON_PROD_NEONSME_MICROKERNEL_SRCS = [ + "src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme.c", ] ALL_NEONSME_MICROKERNEL_SRCS = PROD_NEONSME_MICROKERNEL_SRCS + NON_PROD_NEONSME_MICROKERNEL_SRCS diff --git a/include/xnnpack.h b/include/xnnpack.h index 25da2891f60..3adebca6791 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -3049,6 +3049,46 @@ enum xnn_status xnn_create_convolution2d_nhwc_f16( xnn_weights_cache_t weights_cache, xnn_operator_t* convolution_op_out); +enum xnn_status xnn_create_convolution2d_nhwc_pf16( + uint32_t input_padding_top, + uint32_t input_padding_right, + uint32_t input_padding_bottom, + uint32_t input_padding_left, + uint32_t kernel_height, + uint32_t kernel_width, + uint32_t subsampling_height, + uint32_t subsampling_width, + uint32_t dilation_height, + uint32_t dilation_width, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t input_channel_stride, + size_t output_channel_stride, + const void* kernel, + const void* bias, + float output_min, + float output_max, + uint32_t flags, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out); + +enum xnn_status xnn_reshape_convolution2d_nhwc_pf16( + xnn_operator_t convolution_op, + size_t batch_size, + size_t input_height, + size_t input_width, + size_t* workspace_size, + size_t* output_height_out, + size_t* output_width_out, + pthreadpool_t threadpool); + +enum xnn_status xnn_setup_convolution2d_nhwc_pf16( + xnn_operator_t convolution_op, + void* workspace, + const void* input, + void* output); + enum xnn_status xnn_reshape_convolution2d_nhwc_f16( xnn_operator_t convolution_op, size_t batch_size, diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh index a4e79da7ce4..c36940b4693 100755 --- a/scripts/generate-tests.sh +++ b/scripts/generate-tests.sh @@ -49,6 +49,7 @@ tools/generate-gemm-test.py --spec test/qs8-qc4w-gemm-minmax-fp32.yaml --output- tools/generate-gemm-test.py --spec test/qs8-qc8w-gemm-minmax-fp32.yaml --output-test test/qs8-qc8w-gemm-minmax-fp32.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-2.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-3.cc --output-bench bench/qs8-qc8w-gemm-fp32.cc & ### Tests for IGEMM micro-kernels +tools/generate-gemm-test.py --spec test/pf16-f16-igemm-minmax.yaml --output-test test/pf16-f16-igemm-minmax.cc & tools/generate-gemm-test.py --spec 
test/f16-igemm-minmax.yaml --output-test test/f16-igemm-minmax.cc & tools/generate-gemm-test.py --spec test/f16-f32acc-igemm-minmax.yaml --output-test test/f16-f32acc-igemm-minmax.cc & diff --git a/src/configs/gemm-config.c b/src/configs/gemm-config.c index 3c8bd4a995e..423631da4dd 100644 --- a/src/configs/gemm-config.c +++ b/src/configs/gemm-config.c @@ -324,27 +324,40 @@ static void init_pf16_gemm_config(void) { pf16_gemm_config.bias_element_size = sizeof(xnn_float16); #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI const struct xnn_hardware_config* hardware_config = - xnn_init_hardware_config(); + xnn_init_hardware_config(); assert(hardware_config != NULL); - if (XNN_ENABLE_ARM_SME2 && (hardware_config->arch_flags & xnn_arch_arm_sme2)) { - #if XNN_ENABLE_ARM_SME2 - const size_t mr = xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2_get_mr(); - const size_t nr = xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2_get_nr(); - pf16_gemm_config.arch = xnn_arch_arm_sme2; - pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_1x32c2__neonsme2); - pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2); - pf16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params; - pf16_gemm_config.pack_weights_and_biases = xnn_pack_kai_f16_weights_and_biases; - pf16_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f16_weights_and_biases; - pf16_gemm_config.mr = mr; - pf16_gemm_config.mr_packed = mr; - pf16_gemm_config.nr = nr; - pf16_gemm_config.log2_kr = 1; - #endif // XNN_ENABLE_ARM_SME2 +if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) { +#if XNN_ENABLE_ARM_SME2 + const size_t mr = xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2_get_mr(); + size_t nr = xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2_get_nr(); + const size_t nstep_min = 16; + pf16_gemm_config.arch = xnn_arch_arm_sme2; + pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_1x32c2__neonsme2); + pf16_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(mr)] = XNN_INIT_HMP_GEMM_UKERNEL(xnn_pf16_gemm_minmax_ukernel_32x32c2__neonsme2); + pf16_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(mr)] = + xnn_init_hmp_packed_igemm_ukernel( + (xnn_packed_lhs_igemm_ukernel_fn) + xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2); + pf16_gemm_config.init.f16 = xnn_init_f16_minmax_scalar_params; + pf16_gemm_config.pack_weights_and_biases = xnn_pack_kai_f16_weights_and_biases; + pf16_gemm_config.packed_stride_weights_and_biases = xnn_packed_stride_kai_f16_weights_and_biases; + pf16_gemm_config.pack_igemm_goki = + (xnn_pack_conv_goki_w_fn)xnn_pack_kai_f16_conv_goki_w_sme; // both sme and sme2 use the same packing kernel + pf16_gemm_config.pack_igemm_kgo = + (xnn_pack_conv_kgo_w_fn)xnn_pack_f16_conv_kgo_w; + pf16_gemm_config.mr = mr; + pf16_gemm_config.mr_packed = mr; + pf16_gemm_config.nr = nr < nstep_min ? nstep_min : nr; + pf16_gemm_config.log2_kr = 1; +#endif + } else { + /* no action */ } -#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI + assert(pf16_gemm_config.mr <= XNN_MAX_MR); + #endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI } + static void init_bf16_f32_gemm_config(void) { // Common parameters. bf16_f32_gemm_config.log2_input_element_size = XNN_LOG2_SIZEOF_BFLOAT16; @@ -5588,6 +5601,7 @@ const struct xnn_gemm_config* xnn_init_pf16_gemm_config() { return NULL; } XNN_INIT_ONCE(pf16_gemm); + return pf16_gemm_config.mr ? 
&pf16_gemm_config : NULL; } diff --git a/src/configs/pack-lh-config.c b/src/configs/pack-lh-config.c index 891e9c6ff6a..85dc0d27f85 100644 --- a/src/configs/pack-lh-config.c +++ b/src/configs/pack-lh-config.c @@ -20,21 +20,27 @@ static struct xnn_pack_lh_config x8_pack_lh_config = {0}; static struct xnn_pack_lh_config x16_pack_lh_config = {0}; static struct xnn_pack_lh_config x32_pack_lh_config = {0}; static struct xnn_pack_lh_config x8_igemm_pack_lh_config = {0}; +static struct xnn_pack_lh_config x16_igemm_pack_lh_config = {0}; XNN_INIT_ONCE_GUARD(qp8_pack_lh); XNN_INIT_ONCE_GUARD(x8_pack_lh); XNN_INIT_ONCE_GUARD(x16_pack_lh); XNN_INIT_ONCE_GUARD(x32_pack_lh); XNN_INIT_ONCE_GUARD(x8_igemm_pack_lh); +XNN_INIT_ONCE_GUARD(x16_igemm_pack_lh); static void init_qp8_pack_lh_config(void) { #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI - qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2; + qp8_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__aarch64_neon_u2; #else - qp8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1; + qp8_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x8_packq_f32qp8_ukernel__scalar_u1; #endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI - qp8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size; - qp8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset; + qp8_pack_lh_config.size_fn = + (xnn_pack_lh_size_fn)xnn_x8_packq_f32qp8_packed_size; + qp8_pack_lh_config.offset_fn = + (xnn_pack_lh_offset_fn)xnn_x8_packq_f32qp8_packed_offset; qp8_pack_lh_config.log2_input_element_size = XNN_LOG2_SIZEOF_FLOAT; qp8_pack_lh_config.log2_packed_element_size = 0; } @@ -51,15 +57,29 @@ const struct xnn_pack_lh_config* xnn_init_qp8_pack_lh_config() { static void init_x32_pack_lh_config(void) { #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI -#if XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->arch_flags & xnn_arch_arm_sme) { - x32_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x32_pack_lh_ukernel__neonsme; - x32_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x32_pack_lh_size__neonsme; - x32_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x32_pack_lh_offset__neonsme; + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + assert(hardware_config != NULL); +#if XNN_ENABLE_ARM_SME2 + if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) { + x32_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x32_pack_lh_ukernel__neonsme2; + x32_pack_lh_config.size_fn = + (xnn_pack_lh_size_fn)xnn_x32_pack_lh_size__neonsme2; + x32_pack_lh_config.offset_fn = + (xnn_pack_lh_offset_fn)xnn_x32_pack_lh_offset__neonsme2; + } +#endif // XNN_ENABLE_ARM_SME2 +#if XNN_ENABLE_ARM_SME + if ((hardware_config->arch_flags & xnn_arch_arm_sme)) { + x32_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x32_pack_lh_ukernel__neonsme; + x32_pack_lh_config.size_fn = + (xnn_pack_lh_size_fn)xnn_x32_pack_lh_size__neonsme; + x32_pack_lh_config.offset_fn = + (xnn_pack_lh_offset_fn)xnn_x32_pack_lh_offset__neonsme; } -#endif // XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME +#endif // XNN_ENABLE_ARM_SME #endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI x32_pack_lh_config.log2_input_element_size = 2; x32_pack_lh_config.log2_packed_element_size = 2; @@ -67,7 +87,8 @@ static void 
init_x32_pack_lh_config(void) { } const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; } @@ -77,13 +98,33 @@ const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config() { static void init_x16_pack_lh_config(void) { #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI +#if XNN_ENABLE_ARM_SME + { + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + assert(hardware_config != NULL); + if ((hardware_config->arch_flags & xnn_arch_arm_sme)) { + x16_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x16_pack_lh_ukernel__neonsme; + x16_pack_lh_config.size_fn = + (xnn_pack_lh_size_fn)xnn_x16_pack_lh_size__neonsme; + x16_pack_lh_config.offset_fn = + (xnn_pack_lh_offset_fn)xnn_x16_pack_lh_offset__neonsme; + } + } +#endif // XNN_ENABLE_ARM_SME #if XNN_ENABLE_ARM_SME2 - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->arch_flags & xnn_arch_arm_sme2) { - x16_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x16_pack_lh_ukernel__neonsme2; - x16_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x16_pack_lh_size__neonsme2; - x16_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x16_pack_lh_offset__neonsme2; + if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) { +/* IGEMM SME packer is not used for generic x16 pack_lh. */ + x16_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x16_pack_lh_ukernel__neonsme2; + x16_pack_lh_config.size_fn = + (xnn_pack_lh_size_fn)xnn_x16_pack_lh_size__neonsme2; + x16_pack_lh_config.offset_fn = + (xnn_pack_lh_offset_fn)xnn_x16_pack_lh_offset__neonsme2; } #endif // XNN_ENABLE_ARM_SME2 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI @@ -93,7 +134,8 @@ static void init_x16_pack_lh_config(void) { } const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; } @@ -104,12 +146,16 @@ const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config() { static void init_x8_pack_lh_config(void) { #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI #if XNN_ENABLE_ARM_SME2 - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); assert(hardware_config != NULL); - if (hardware_config->arch_flags & xnn_arch_arm_sme2) { - x8_pack_lh_config.pack_lh_fn = (xnn_pack_lh_ukernel_fn) xnn_x8_pack_lh_ukernel__neonsme2; - x8_pack_lh_config.size_fn = (xnn_pack_lh_size_fn) xnn_x8_pack_lh_size__neonsme2; - x8_pack_lh_config.offset_fn = (xnn_pack_lh_offset_fn) xnn_x8_pack_lh_offset__neonsme2; + if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) { + x8_pack_lh_config.pack_lh_fn = + (xnn_pack_lh_ukernel_fn)xnn_x8_pack_lh_ukernel__neonsme2; + x8_pack_lh_config.size_fn = + (xnn_pack_lh_size_fn)xnn_x8_pack_lh_size__neonsme2; + x8_pack_lh_config.offset_fn = + (xnn_pack_lh_offset_fn)xnn_x8_pack_lh_offset__neonsme2; } #endif // XNN_ENABLE_ARM_SME2 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI @@ -119,7 +165,8 @@ static void init_x8_pack_lh_config(void) { } const 
struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); if (hardware_config == NULL) { return NULL; } @@ -128,17 +175,21 @@ const struct xnn_pack_lh_config* xnn_init_x8_pack_lh_config() { } static void init_x8_igemm_pack_lh_config(void) { - #if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI - #if XNN_ENABLE_ARM_SME2 - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->arch_flags & xnn_arch_arm_sme2) { - x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = (xnn_pack_lh_igemm_ukernel_fn) xnn_x8_pack_lh_ukernel__igemm_neonsme2; - x8_igemm_pack_lh_config.size_for_igemm_fn = (xnn_pack_lh_igemm_size_fn) xnn_x8_pack_lh_size__igemm_neonsme2; - x8_igemm_pack_lh_config.offset_for_igemm_fn = (xnn_pack_lh_igemm_offset_fn) xnn_x8_pack_lh_offset__igemm_neonsme2; - } - #endif // XNN_ENABLE_ARM_SME2 - #endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI +#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI +#if XNN_ENABLE_ARM_SME2 + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + assert(hardware_config != NULL); + if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) { + x8_igemm_pack_lh_config.pack_lh_for_igemm_fn = + (xnn_pack_lh_igemm_ukernel_fn)xnn_x8_pack_lh_ukernel__igemm_neonsme2; + x8_igemm_pack_lh_config.size_for_igemm_fn = + (xnn_pack_lh_igemm_size_fn)xnn_x8_pack_lh_size__igemm_neonsme2; + x8_igemm_pack_lh_config.offset_for_igemm_fn = + (xnn_pack_lh_igemm_offset_fn)xnn_x8_pack_lh_offset__igemm_neonsme2; + } +#endif // XNN_ENABLE_ARM_SME2 +#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI x8_igemm_pack_lh_config.log2_input_element_size = 0; x8_igemm_pack_lh_config.log2_packed_element_size = 0; } @@ -152,3 +203,48 @@ const struct xnn_pack_lh_config* xnn_init_x8_igemm_pack_lh_config() { XNN_INIT_ONCE(x8_igemm_pack_lh); return &x8_igemm_pack_lh_config; } + +static void init_x16_igemm_pack_lh_config(void) { +#if XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI +#if XNN_ENABLE_ARM_SME2 + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + assert(hardware_config != NULL); + if ((hardware_config->arch_flags & xnn_arch_arm_sme2)) { + x16_igemm_pack_lh_config.pack_lh_for_igemm_fn = + (xnn_pack_lh_igemm_ukernel_fn)xnn_x16_pack_lh_ukernel__igemm_neonsme2; + x16_igemm_pack_lh_config.size_for_igemm_fn = + (xnn_pack_lh_igemm_size_fn)xnn_x16_pack_lh_size__igemm_neonsme2; + x16_igemm_pack_lh_config.offset_for_igemm_fn = + (xnn_pack_lh_igemm_offset_fn)xnn_x16_pack_lh_offset__igemm_neonsme2; + } +#endif // XNN_ENABLE_ARM_SME2 +#if XNN_ENABLE_ARM_SME + { + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + assert(hardware_config != NULL); + if ((hardware_config->arch_flags & xnn_arch_arm_sme)) { + x16_igemm_pack_lh_config.pack_lh_for_igemm_fn = + (xnn_pack_lh_igemm_ukernel_fn)xnn_x16_pack_lh_ukernel__igemm_neonsme; + x16_igemm_pack_lh_config.size_for_igemm_fn = + (xnn_pack_lh_igemm_size_fn)xnn_x16_pack_lh_size__igemm_neonsme; + x16_igemm_pack_lh_config.offset_for_igemm_fn = + (xnn_pack_lh_igemm_offset_fn)xnn_x16_pack_lh_offset__igemm_neonsme; + } + } +#endif // XNN_ENABLE_ARM_SME +#endif // XNN_ARCH_ARM64 && XNN_ENABLE_KLEIDIAI + x16_igemm_pack_lh_config.log2_input_element_size = 1; + x16_igemm_pack_lh_config.log2_packed_element_size = 1; +} + +const struct xnn_pack_lh_config* 
xnn_init_x16_igemm_pack_lh_config() { + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + if (hardware_config == NULL) { + return NULL; + } + XNN_INIT_ONCE(x16_igemm_pack_lh); + return &x16_igemm_pack_lh_config; +} diff --git a/src/operators/convolution-nhwc.c b/src/operators/convolution-nhwc.c index 8a535ad70df..d69cb43f830 100644 --- a/src/operators/convolution-nhwc.c +++ b/src/operators/convolution-nhwc.c @@ -1946,6 +1946,46 @@ enum xnn_status xnn_create_convolution2d_nhwc_f32( convolution_op_out); } +enum xnn_status xnn_create_convolution2d_nhwc_pf16( + uint32_t input_padding_top, uint32_t input_padding_right, + uint32_t input_padding_bottom, uint32_t input_padding_left, + uint32_t kernel_height, uint32_t kernel_width, uint32_t subsampling_height, + uint32_t subsampling_width, uint32_t dilation_height, + uint32_t dilation_width, uint32_t groups, size_t group_input_channels, + size_t group_output_channels, size_t input_channel_stride, + size_t output_channel_stride, const void* kernel, const void* bias, + float output_min, float output_max, uint32_t flags, + xnn_weights_cache_t weights_cache, + xnn_operator_t* convolution_op_out) { + struct convolution2d_nhwc_context context = { + .input_padding_top = input_padding_top, + .input_padding_right = input_padding_right, + .input_padding_bottom = input_padding_bottom, + .input_padding_left = input_padding_left, + .kernel_height = kernel_height, + .kernel_width = kernel_width, + .subsampling_height = subsampling_height, + .subsampling_width = subsampling_width, + .dilation_height = dilation_height, + .dilation_width = dilation_width, + .groups = groups, + .group_input_channels = group_input_channels, + .group_output_channels = group_output_channels, + .input_channel_stride = input_channel_stride, + .output_channel_stride = output_channel_stride, + .kernel = kernel, + .bias = bias, + .output_min = output_min, + .output_max = output_max, + .flags = flags, + .weights_cache = weights_cache, + .gemm_config = xnn_init_pf16_gemm_config(), + .operator_type = xnn_operator_type_convolution_nhwc_pf16, + }; + return create_convolution2d_nhwc_helper(&f16_variant, &context, + convolution_op_out); +} + enum xnn_status xnn_create_convolution2d_nhwc_f32_f16( uint32_t input_padding_top, uint32_t input_padding_right, uint32_t input_padding_bottom, uint32_t input_padding_left, @@ -2979,6 +3019,22 @@ enum xnn_status xnn_reshape_convolution2d_nhwc_f32( output_width_out, threadpool); } +enum xnn_status xnn_reshape_convolution2d_nhwc_pf16( + xnn_operator_t convolution_op, size_t batch_size, size_t input_height, + size_t input_width, size_t* workspace_size, size_t* output_height_out, + size_t* output_width_out, pthreadpool_t threadpool) { + return reshape_convolution2d_nhwc( + convolution_op, xnn_operator_type_convolution_nhwc_pf16, batch_size, + input_height, input_width, + /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT16, + /*log2_filter_element_size=*/XNN_LOG2_SIZEOF_FLOAT16, + /*log2_accumulator_element_size=*/XNN_LOG2_SIZEOF_FLOAT16, + /*extra_weights_elements_size=*/sizeof(uint16_t), + /*log2_output_element_size=*/XNN_LOG2_SIZEOF_FLOAT16, + /*dynamic_quantization=*/false, workspace_size, output_height_out, + output_width_out, threadpool); +} + static enum xnn_status setup_igemm(xnn_operator_t convolution_op, void* workspace, uint32_t log2_input_element_size) { @@ -3177,6 +3233,16 @@ enum xnn_status xnn_setup_convolution2d_nhwc_f16(xnn_operator_t convolution_op, /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT16); } +enum 
xnn_status xnn_setup_convolution2d_nhwc_pf16(xnn_operator_t convolution_op,
+                                             void* workspace,
+                                             const void* input,
+                                             void* output) {
+  return setup_convolution2d_nhwc(
+      convolution_op, xnn_operator_type_convolution_nhwc_pf16, workspace, input,
+      output, /*quantization_params=*/NULL,
+      /*log2_input_element_size=*/XNN_LOG2_SIZEOF_FLOAT16);
+}
+
 enum xnn_status xnn_setup_convolution2d_nhwc_f32(xnn_operator_t convolution_op,
                                                  void* workspace,
                                                  const float* input,
diff --git a/src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme.c b/src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme.c
new file mode 100644
index 00000000000..7f794ddeaa2
--- /dev/null
+++ b/src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme.c
@@ -0,0 +1,58 @@
+// Copyright 2025 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "src/xnnpack/microparams.h"
+
+#if XNN_ENABLE_KLEIDIAI
+#include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa.h"
+#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
+#endif // XNN_ENABLE_KLEIDIAI
+
+size_t xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme_get_mr(void) {
+#if XNN_ENABLE_KLEIDIAI
+  return kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa();
+#else
+  assert(
+      "Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+      "`XNN_ENABLE_KLEIDIAI`." &&
+      0);
+  return 0;
+#endif // XNN_ENABLE_KLEIDIAI
+}
+
+size_t xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme_get_nr(void) {
+#if XNN_ENABLE_KLEIDIAI
+  return kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa();
+#else
+  assert(
+      "Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+      "`XNN_ENABLE_KLEIDIAI`." &&
+      0);
+  return 0;
+#endif // XNN_ENABLE_KLEIDIAI
+}
+
+void xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme(
+    size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs,
+    const void* restrict w, xnn_float16* restrict c, size_t cm_stride,
+    const struct xnn_f16_minmax_params* params) {
+#if XNN_ENABLE_KLEIDIAI
+  const size_t kai_kr = 2;
+  const size_t k = ks * round_up(kc, kai_kr);
+
+  kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2b_2vlx2vl_sme_mopa(
+      mr, nc, k, packed_lhs, w, c, cm_stride * sizeof(xnn_float16),
+      sizeof(xnn_float16), xnn_float16_to_float(params->scalar.min),
+      xnn_float16_to_float(params->scalar.max));
+#else
+  assert(
+      "Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+      "`XNN_ENABLE_KLEIDIAI`." &&
+      0);
+#endif // XNN_ENABLE_KLEIDIAI
+}
diff --git a/src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c b/src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c
new file mode 100644
index 00000000000..8e0b09b79c1
--- /dev/null
+++ b/src/pf16-f16-f16-igemm/pf16-f16-f16-igemm-32x32c2-minmax-neonsme2.c
@@ -0,0 +1,58 @@
+// Copyright 2025 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
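+//
+// Accumulation-depth note: the packed LHS produced by the x16 igemm pack-lh
+// kernels stores ks gathered slices of kc channels each, with kc rounded up
+// to the packing granularity kr = 2, so the wrapped KleidiAI matmul runs
+// with k = ks * round_up(kc, 2). For example (illustrative values only), a
+// 3x3 convolution window (ks = 9) over kc = 3 channels gives
+// k = 9 * round_up(3, 2) = 36 elements of accumulation depth per output row.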
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "src/xnnpack/microparams.h"
+
+#if XNN_ENABLE_KLEIDIAI
+#include "kai/ukernels/matmul/matmul_clamp_f16_f16p_f16p/kai_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa.h"
+#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h"
+#endif // XNN_ENABLE_KLEIDIAI
+
+size_t xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_mr(void) {
+#if XNN_ENABLE_KLEIDIAI
+  return kai_get_mr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+#else
+  assert(
+      "Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+      "`XNN_ENABLE_KLEIDIAI`." &&
+      0);
+  return 0;
+#endif // XNN_ENABLE_KLEIDIAI
+}
+
+size_t xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_nr(void) {
+#if XNN_ENABLE_KLEIDIAI
+  return kai_get_nr_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa();
+#else
+  assert(
+      "Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+      "`XNN_ENABLE_KLEIDIAI`." &&
+      0);
+  return 0;
+#endif // XNN_ENABLE_KLEIDIAI
+}
+
+void xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2(
+    size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs,
+    const void* restrict w, xnn_float16* restrict c, size_t cm_stride,
+    const struct xnn_f16_minmax_params* params) {
+#if XNN_ENABLE_KLEIDIAI
+  const size_t kai_kr = 2;
+  const size_t k = ks * round_up(kc, kai_kr);
+
+  kai_run_matmul_clamp_f16_f16p2vlx2_f16p2vlx2_2vlx2vl_sme2_mopa(
+      mr, nc, k, packed_lhs, w, c, cm_stride * sizeof(xnn_float16),
+      sizeof(xnn_float16), xnn_float16_to_float(params->scalar.min),
+      xnn_float16_to_float(params->scalar.max));
+#else
+  assert(
+      "Calling wrapped KleidiAI function, but XNNPACK was compiled without "
+      "`XNN_ENABLE_KLEIDIAI`." &&
+      0);
+#endif // XNN_ENABLE_KLEIDIAI
+}
diff --git a/src/reference/packing.cc b/src/reference/packing.cc
index 33968cf6596..fc6311b18a1 100644
--- a/src/reference/packing.cc
+++ b/src/reference/packing.cc
@@ -25,6 +25,7 @@
 #if XNN_ENABLE_KLEIDIAI
 #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_qsi8cxp2vlx4sb_qs8cx_f32_i32_sme.h"
+#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p2vlx1biasf32_f32_f32_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4c32p_qsu4c32s1s0.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi4cxp_qs4cxs1s0.h"
@@ -2175,6 +2176,15 @@ void transpose_weights_x8(const int8_t* in, int8_t* out, size_t height,
   }
 }
 
+static void transpose_weights_x16(const uint16_t* in, uint16_t* out,
+                                  size_t height, size_t width) {
+  for (size_t j = 0; j < width; ++j) {
+    for (size_t i = 0; i < height; ++i) {
+      out[j * height + i] = in[i * width + j];
+    }
+  }
+}
+
 void xnn_pack_kai_qs8_qc8w_weights_and_biases_sme2(
     uint32_t flags, const struct xnn_gemm_config* gemm_config,
     size_t input_channels, size_t output_channels, size_t groups,
@@ -2340,7 +2350,7 @@ void xnn_pack_kai_f16_weights_and_biases(
   // initialized array as a workaround if bias is null.
   bool free_accumulator_init = false;
   if (accumulator_init == NULL) {
-    accumulator_init = calloc(output_channels, sizeof(xnn_float16));
+    accumulator_init = calloc(output_channels, sizeof(float));
     free_accumulator_init = true;
   }
 
@@ -2362,7 +2372,7 @@
         (const void*)((uintptr_t)weights + group * weights_group_stride),
         /*bias=*/
         free_accumulator_init
-            ? accumulator_init
+            ? 
(const float*)accumulator_init
             : (const float*)(accumulator_init) + group * output_channels,
         /*scale=*/NULL,
         /*rhs_packed=*/
@@ -2378,7 +2388,7 @@
         (const void*)((uintptr_t)weights + group * weights_group_stride),
         /*bias=*/
         free_accumulator_init
-            ? accumulator_init
+            ? (const float*)accumulator_init
             : (const float*)(accumulator_init) + group * output_channels,
         /*scale=*/NULL,
         /*rhs_packed=*/
@@ -2560,6 +2570,59 @@
   }
 }
 
+void xnn_pack_kai_f16_conv_goki_w_sme(size_t g, size_t nc, size_t ks,
+                                      size_t kc, size_t nr, size_t kr,
+                                      size_t sr, const uint16_t* k,
+                                      const uint16_t* b, const void* scale,
+                                      void* packed_weights, size_t extra_bytes,
+                                      const void* params) {
+
+  assert(g != 0);
+  assert(nr >= sr);
+  assert(k != nullptr);
+  assert(packed_weights != nullptr);
+
+  uint16_t* tmp_bias = NULL;
+
+  if (b == NULL) {
+    tmp_bias = (uint16_t*)xnn_allocate_zero_memory(g * nc * sizeof(uint16_t));
+    b = tmp_bias;
+  }
+
+  uint16_t* tmp_data =
+      (uint16_t*)xnn_allocate_memory(nc * ks * kc * sizeof(uint16_t));
+  const size_t rhs_row_stride = nc * sizeof(uint16_t);
+  const size_t packed_rhs_size =
+      kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(
+          nc, ks, kc);
+
+  for (size_t g_idx = 0; g_idx < g; ++g_idx) {
+
+    // TODO: Remove transpose_weights_x16 once KleidiAI releases an imatmul_pack_nxk packing variant.
+    transpose_weights_x16(k, tmp_data, nc, ks * kc);
+    // Pass the FP16 bias directly to the rhs_imatmul packer, which expects an
+    // FP16 bias for this kernel.
+    kai_run_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(
+        nc, ks, kc, rhs_row_stride, tmp_data, b, packed_weights);
+
+    k += nc * ks * kc;
+    b += nc;
+
+    packed_weights = (void*)((uintptr_t)packed_weights + packed_rhs_size);
+  }
+
+  xnn_release_memory(tmp_data);
+
+  if (tmp_bias != NULL) {
+    xnn_release_memory(tmp_bias);
+  }
+}
+
+size_t xnn_packed_size_kai_f16_conv_goki_w(size_t nc, size_t ks, size_t kc) {
+  return kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x16p2vlx2b_x16_x16_sme(
+      nc, ks, kc);
+}
+
 void xnn_pack_kai_qs8_conv_goki_w_sme2(
     size_t g, size_t nc, size_t ks, size_t kc, size_t nr, size_t kr, size_t sr,
     const int8_t* k, const int32_t* b, const float* scale, void* packed_weights,
diff --git a/src/subgraph.c b/src/subgraph.c
index dc1f42d9753..e50b398d429 100644
--- a/src/subgraph.c
+++ b/src/subgraph.c
@@ -1565,9 +1565,8 @@ enum xnn_status xnn_subgraph_fusion(xnn_subgraph_t subgraph) {
         (padding_datatype == xnn_datatype_fp32 && padding_value == 0) ||
         ((padding_datatype == xnn_datatype_qint8 ||
           padding_datatype == xnn_datatype_quint8) &&
-         padding_value ==
-             (uint32_t)subgraph->values[producer->outputs[0]]
-                 .quantization.zero_point);
+         padding_value == (uint32_t)subgraph->values[producer->outputs[0]]
+                              .quantization.zero_point);
     switch (consumer->type) {
       case xnn_node_type_convolution_2d:
         if (is_spatial_2d_padding && is_zero_padding &&
@@ -3294,6 +3293,19 @@ enum xnn_status xnn_subgraph_optimize_packed_lhs(xnn_subgraph_t subgraph,
           node->packed_input_datatype = xnn_datatype_pqint8;
           node->flags |= XNN_FLAG_INLINE_LHS_PACKING;
         }
+        if (input_datatype == xnn_datatype_fp16 &&
+            kernel_datatype == xnn_datatype_fp16 &&
+            output_datatype == xnn_datatype_fp16 &&
+            xnn_init_pf16_gemm_config() != NULL &&
+            !(optimization_flags & XNN_FLAG_NO_INLINED_LHS_PACKING)) {
+          // Note that there is currently no option to not use inlining for
+          // this iGEMM kernel.
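+          // When this branch fires, the convolution consumes a pfp16 packed
+          // LHS: the x16 pack-lh-for-igemm kernel runs inline in the
+          // operator's workspace on every invocation rather than as a
+          // separate pack operator.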
+ xnn_log_debug("Setting assumed_datatype=%s for node #%u (%s).", + xnn_datatype_to_string(xnn_datatype_pfp16), node_id, + xnn_node_type_to_string(node->type)); + node->packed_input_datatype = xnn_datatype_pfp16; + node->flags |= XNN_FLAG_INLINE_LHS_PACKING; + } } break; default: break; diff --git a/src/subgraph/convolution-2d.c b/src/subgraph/convolution-2d.c index a941efd745f..a7fe5b0ca95 100644 --- a/src/subgraph/convolution-2d.c +++ b/src/subgraph/convolution-2d.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -18,7 +19,6 @@ #include "src/xnnpack/requantization.h" #include "src/xnnpack/subgraph-validation.h" #include "src/xnnpack/subgraph.h" -#include enum xnn_status create_nchw_convolution( uint32_t input_padding_top, uint32_t input_padding_right, @@ -254,10 +254,10 @@ static enum xnn_status create_convolution_operator( node->params.convolution_2d.group_input_channels, node->params.convolution_2d.group_output_channels, /*input_channel_stride=*/ - node->params.convolution_2d.group_input_channels * + node->params.convolution_2d.group_input_channels * node->params.convolution_2d.groups, /*output_channel_stride=*/ - node->params.convolution_2d.group_output_channels * + node->params.convolution_2d.group_output_channels * node->params.convolution_2d.groups, values[filter_id].quantization.channelwise_scale, filter_data, bias_data, node->activation.output_min, @@ -280,10 +280,10 @@ static enum xnn_status create_convolution_operator( node->params.convolution_2d.group_input_channels, node->params.convolution_2d.group_output_channels, /*input_channel_stride=*/ - node->params.convolution_2d.group_input_channels * + node->params.convolution_2d.group_input_channels * node->params.convolution_2d.groups, /*output_channel_stride=*/ - node->params.convolution_2d.group_output_channels * + node->params.convolution_2d.group_output_channels * node->params.convolution_2d.groups, values[filter_id].quantization.channelwise_scale, filter_data, bias_data, node->activation.output_min, @@ -309,28 +309,55 @@ static enum xnn_status create_convolution_operator( switch (filter_datatype) { case xnn_datatype_fp16: case xnn_datatype_fp32: - status = xnn_create_convolution2d_nhwc_f16( - node->params.convolution_2d.input_padding_top, - node->params.convolution_2d.input_padding_right, - node->params.convolution_2d.input_padding_bottom, - node->params.convolution_2d.input_padding_left, - node->params.convolution_2d.kernel_height, - node->params.convolution_2d.kernel_width, - node->params.convolution_2d.subsampling_height, - node->params.convolution_2d.subsampling_width, - node->params.convolution_2d.dilation_height, - node->params.convolution_2d.dilation_width, - node->params.convolution_2d.groups, - node->params.convolution_2d.group_input_channels, - node->params.convolution_2d.group_output_channels, - node->params.convolution_2d.group_input_channels * - node->params.convolution_2d.groups /* input_pixel_stride */, - node->params.convolution_2d.group_output_channels * - node->params.convolution_2d - .groups /* output_pixel_stride */, - filter_data, bias_data, node->activation.output_min, - node->activation.output_max, flags, weights_cache, - &opdata->operator_objects[0]); + if (input_datatype == xnn_datatype_pfp16) { + status = xnn_create_convolution2d_nhwc_pf16( + node->params.convolution_2d.input_padding_top, + node->params.convolution_2d.input_padding_right, + node->params.convolution_2d.input_padding_bottom, + node->params.convolution_2d.input_padding_left, + node->params.convolution_2d.kernel_height, 
+ node->params.convolution_2d.kernel_width, + node->params.convolution_2d.subsampling_height, + node->params.convolution_2d.subsampling_width, + node->params.convolution_2d.dilation_height, + node->params.convolution_2d.dilation_width, + node->params.convolution_2d.groups, + node->params.convolution_2d.group_input_channels, + node->params.convolution_2d.group_output_channels, + node->params.convolution_2d.group_input_channels * + node->params.convolution_2d + .groups /* input_pixel_stride */, + node->params.convolution_2d.group_output_channels * + node->params.convolution_2d + .groups /* output_pixel_stride */, + filter_data, bias_data, node->activation.output_min, + node->activation.output_max, flags, weights_cache, + &opdata->operator_objects[0]); + } else { + status = xnn_create_convolution2d_nhwc_f16( + node->params.convolution_2d.input_padding_top, + node->params.convolution_2d.input_padding_right, + node->params.convolution_2d.input_padding_bottom, + node->params.convolution_2d.input_padding_left, + node->params.convolution_2d.kernel_height, + node->params.convolution_2d.kernel_width, + node->params.convolution_2d.subsampling_height, + node->params.convolution_2d.subsampling_width, + node->params.convolution_2d.dilation_height, + node->params.convolution_2d.dilation_width, + node->params.convolution_2d.groups, + node->params.convolution_2d.group_input_channels, + node->params.convolution_2d.group_output_channels, + node->params.convolution_2d.group_input_channels * + node->params.convolution_2d + .groups /* input_pixel_stride */, + node->params.convolution_2d.group_output_channels * + node->params.convolution_2d + .groups /* output_pixel_stride */, + filter_data, bias_data, node->activation.output_min, + node->activation.output_max, flags, weights_cache, + &opdata->operator_objects[0]); + } break; case xnn_datatype_qcint8: switch (input_datatype) { @@ -350,10 +377,10 @@ static enum xnn_status create_convolution_operator( node->params.convolution_2d.group_input_channels, node->params.convolution_2d.group_output_channels, /*input_channel_stride=*/ - node->params.convolution_2d.group_input_channels * + node->params.convolution_2d.group_input_channels * node->params.convolution_2d.groups, /*output_channel_stride=*/ - node->params.convolution_2d.group_output_channels * + node->params.convolution_2d.group_output_channels * node->params.convolution_2d.groups, values[filter_id].quantization.channelwise_scale, filter_data, bias_data, node->activation.output_min, @@ -376,10 +403,10 @@ static enum xnn_status create_convolution_operator( node->params.convolution_2d.group_input_channels, node->params.convolution_2d.group_output_channels, /*input_channel_stride=*/ - node->params.convolution_2d.group_input_channels * + node->params.convolution_2d.group_input_channels * node->params.convolution_2d.groups, /*output_channel_stride=*/ - node->params.convolution_2d.group_output_channels * + node->params.convolution_2d.group_output_channels * node->params.convolution_2d.groups, values[filter_id].quantization.channelwise_scale, filter_data, bias_data, node->activation.output_min, @@ -659,6 +686,11 @@ enum xnn_status reshape_convolution_operator(struct xnn_operator_data* opdata, opdata->operator_objects[0], batch_size, input_height, input_width, &opdata->workspace_size, &output_height, &output_width, threadpool); break; + case xnn_operator_type_convolution_nhwc_pf16: + status = xnn_reshape_convolution2d_nhwc_pf16( + opdata->operator_objects[0], batch_size, input_height, input_width, + 
&opdata->workspace_size, &output_height, &output_width, threadpool); + break; default: XNN_UNREACHABLE; } @@ -782,6 +814,11 @@ enum xnn_status setup_convolution_operator( opdata->operator_objects[0], opdata->workspace, input_data, output_data); break; + case xnn_operator_type_convolution_nhwc_pf16: + return xnn_setup_convolution2d_nhwc_pf16(opdata->operator_objects[0], + opdata->workspace, input_data, + output_data); + break; default: XNN_UNREACHABLE; } diff --git a/src/x16-pack-lh/x16-pack-lh-igemm.inc b/src/x16-pack-lh/x16-pack-lh-igemm.inc new file mode 100644 index 00000000000..47e46d526f0 --- /dev/null +++ b/src/x16-pack-lh/x16-pack-lh-igemm.inc @@ -0,0 +1,16 @@ +// clang-format off +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +// arch_flags, igemm_ukernel, igemm_size_fn, igemm_packed_offset_fn + +#if XNN_ENABLE_KLEIDIAI +XNN_UKERNEL(xnn_arch_arm_sme, xnn_x16_pack_lh_ukernel__igemm_neonsme, + xnn_x16_pack_lh_size__igemm_neonsme, + xnn_x16_pack_lh_offset__igemm_neonsme) +XNN_UKERNEL(xnn_arch_arm_sme2, xnn_x16_pack_lh_ukernel__igemm_neonsme2, + xnn_x16_pack_lh_size__igemm_neonsme2, + xnn_x16_pack_lh_offset__igemm_neonsme2) +#endif // XNN_ENABLE_KLEIDIAI diff --git a/src/x16-pack-lh/x16-pack-lh.inc b/src/x16-pack-lh/x16-pack-lh.inc index b475161e8fc..d4648f6eb57 100644 --- a/src/x16-pack-lh/x16-pack-lh.inc +++ b/src/x16-pack-lh/x16-pack-lh.inc @@ -9,6 +9,8 @@ // XNN_UKERNEL(arch, ukernel, size_fn, packed_offset_fn) #if XNN_ENABLE_KLEIDIAI -XNN_UKERNEL(xnn_arch_arm_sme, xnn_x16_pack_lh_ukernel__neonsme2, +XNN_UKERNEL(xnn_arch_arm_sme, xnn_x16_pack_lh_ukernel__neonsme, + xnn_x16_pack_lh_size__neonsme, xnn_x16_pack_lh_offset__neonsme) +XNN_UKERNEL(xnn_arch_arm_sme2, xnn_x16_pack_lh_ukernel__neonsme2, xnn_x16_pack_lh_size__neonsme2, xnn_x16_pack_lh_offset__neonsme2) #endif // XNN_ENABLE_KLEIDIAI diff --git a/src/x16-pack-lh/x16-packlh-igemm-neonsme.c b/src/x16-pack-lh/x16-packlh-igemm-neonsme.c new file mode 100644 index 00000000000..0de657d56d5 --- /dev/null +++ b/src/x16-pack-lh/x16-packlh-igemm-neonsme.c @@ -0,0 +1,65 @@ +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// + +#include +#include +#include +#include +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/pack-lh.h" + +#if XNN_ENABLE_KLEIDIAI +#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h" +#endif // XNN_ENABLE_KLEIDIAI + +// This function just wraps KleidiAI's +// `kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme`, but with a name that is recognized +// by our tooling. 
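+//
+// Of the generic pack-lh-for-igemm arguments, only m, kc, ks, the
+// indirection pointers `a` (biased by `a_offset`, with `zero` standing in
+// for padded taps) and the destination are forwarded to KleidiAI;
+// `mr_packed` and `sr` exist only to match the generic signature, and `kr`
+// must be 2 (asserted below).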
+void xnn_x16_pack_lh_ukernel__igemm_neonsme(size_t m, size_t kc, size_t ks, + size_t mr_packed, size_t kr, + size_t sr, const void** restrict a, + size_t a_offset, const void* zero, + void* lhs_packed) { +#if XNN_ENABLE_KLEIDIAI + assert(kr == 2); + kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme(m, ks, kc, a, a_offset, zero, + lhs_packed); +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x16_pack_lh_size__igemm_neonsme(size_t m, size_t kc, size_t ks, + size_t mr_packed, size_t kr, + size_t sr) { +#if XNN_ENABLE_KLEIDIAI + assert(kr == 2); + + return kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(m, ks, kc); +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x16_pack_lh_offset__igemm_neonsme(size_t m, size_t kc, size_t ks, + size_t mr_packed, size_t kr, + size_t sr) { +#if XNN_ENABLE_KLEIDIAI + assert(kr == 2); + + return kai_get_lhs_packed_offset_lhs_imatmul_pack_x16p2vlx2_x16p_sme(m, ks, + kc); +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} diff --git a/src/x16-pack-lh/x16-packlh-igemm-neonsme2.c b/src/x16-pack-lh/x16-packlh-igemm-neonsme2.c new file mode 100644 index 00000000000..69f2cb1e800 --- /dev/null +++ b/src/x16-pack-lh/x16-packlh-igemm-neonsme2.c @@ -0,0 +1,65 @@ +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +// + +#include +#include +#include +#include +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/pack-lh.h" + +#if XNN_ENABLE_KLEIDIAI +#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme.h" +#endif // XNN_ENABLE_KLEIDIAI + +// This function just wraps KleidiAI's +// `kai_lhs_imatmul_pack_x16p2vlx2_x16p_sme`, but with a name that is recognized +// by our tooling. 
+void xnn_x16_pack_lh_ukernel__igemm_neonsme2(size_t m, size_t kc, size_t ks, + size_t mr_packed, size_t kr, + size_t sr, const void** restrict a, + size_t a_offset, const void* zero, + void* lhs_packed) { +#if XNN_ENABLE_KLEIDIAI + assert(kr == 2); + kai_run_lhs_imatmul_pack_x16p2vlx2_x16p_sme(m, ks, kc, a, a_offset, zero, + lhs_packed); +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x16_pack_lh_size__igemm_neonsme2(size_t m, size_t kc, size_t ks, + size_t mr_packed, size_t kr, + size_t sr) { +#if XNN_ENABLE_KLEIDIAI + assert(kr == 2); + + return kai_get_lhs_packed_size_lhs_imatmul_pack_x16p2vlx2_x16p_sme(m, ks, kc); +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x16_pack_lh_offset__igemm_neonsme2(size_t m, size_t kc, size_t ks, + size_t mr_packed, size_t kr, + size_t sr) { +#if XNN_ENABLE_KLEIDIAI + assert(kr == 2); + + return kai_get_lhs_packed_offset_lhs_imatmul_pack_x16p2vlx2_x16p_sme(m, ks, + kc); +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} diff --git a/src/x16-pack-lh/x16-packlh-neonsme.c b/src/x16-pack-lh/x16-packlh-neonsme.c new file mode 100644 index 00000000000..db55caee564 --- /dev/null +++ b/src/x16-pack-lh/x16-packlh-neonsme.c @@ -0,0 +1,69 @@ +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/pack-lh.h" + +#if XNN_ENABLE_KLEIDIAI +#include "kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.h" +#include "kai/ukernels/matmul/pack/kai_lhs_pack_x16p2vlx2_x16_sme.h" +#endif // XNN_ENABLE_KLEIDIAI + +// This function just wraps KleidiAI's `kai_run_lhs_pack_x16p2vlx2_x16_sme`, but +// with a name that is recognized by our tooling. 
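+//
+// When mr_packed == 1 there is nothing to interleave, so the row is copied
+// through unchanged via the memcpy fast path below and the KleidiAI packer
+// is bypassed.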
+void xnn_x16_pack_lh_ukernel__neonsme(size_t m, size_t k, size_t mr_packed, + size_t kr, size_t sr, size_t m_idx_start, + const xnn_float16* XNN_RESTRICT lhs, + size_t lhs_stride, + void* XNN_RESTRICT lhs_packed) { +#if XNN_ENABLE_KLEIDIAI + if (mr_packed == 1) { + memcpy(lhs_packed, lhs, sizeof(xnn_float16) * k); + } else { + kai_run_lhs_pack_x16p2vlx2_x16_sme(m, k, mr_packed, kr, sr, m_idx_start, + lhs, lhs_stride, lhs_packed); + } +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x16_pack_lh_size__neonsme(size_t m, size_t k, size_t mr_packed, + size_t kr, size_t sr) { +#if XNN_ENABLE_KLEIDIAI + if (mr_packed == 1) { + return m * sizeof(xnn_float16) * k; + } else { + return kai_get_lhs_packed_size_lhs_pack_x16p2vlx2_x16_sme(m, k, mr_packed, + kr, sr); + } +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x16_pack_lh_offset__neonsme(size_t m, size_t k, size_t mr_packed, + size_t kr, size_t sr) { +#if XNN_ENABLE_KLEIDIAI + if (mr_packed == 1) { + return m * sizeof(xnn_float16) * k; + } else { + return kai_get_lhs_packed_offset_lhs_pack_x16p2vlx2_x16_sme(m, k, mr_packed, + kr, sr); + } +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} diff --git a/src/x32-pack-lh/x32-pack-lh.inc b/src/x32-pack-lh/x32-pack-lh.inc index b0219220c5b..d6a958aa194 100644 --- a/src/x32-pack-lh/x32-pack-lh.inc +++ b/src/x32-pack-lh/x32-pack-lh.inc @@ -10,4 +10,5 @@ #if XNN_ENABLE_KLEIDIAI XNN_UKERNEL(xnn_arch_arm_sme, xnn_x32_pack_lh_ukernel__neonsme, xnn_x32_pack_lh_size__neonsme, xnn_x32_pack_lh_offset__neonsme) +XNN_UKERNEL(xnn_arch_arm_sme2, xnn_x32_pack_lh_ukernel__neonsme2, xnn_x32_pack_lh_size__neonsme2, xnn_x32_pack_lh_offset__neonsme2) #endif // XNN_ENABLE_KLEIDIAI diff --git a/src/x32-pack-lh/x32-packlh-neonsme2.c b/src/x32-pack-lh/x32-packlh-neonsme2.c new file mode 100644 index 00000000000..f2af42300e0 --- /dev/null +++ b/src/x32-pack-lh/x32-packlh-neonsme2.c @@ -0,0 +1,68 @@ +// Copyright 2025 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include +#include +#include +#include +#include +#include + +#include "src/xnnpack/common.h" +#include "src/xnnpack/math.h" +#include "src/xnnpack/pack-lh.h" + +#if XNN_ENABLE_KLEIDIAI +#include "kai/ukernels/matmul/pack/kai_lhs_pack_f32p2vlx1_f32_sme.h" +#endif // XNN_ENABLE_KLEIDIAI + +// This function just wraps KleidiAI's `kai_run_lhs_pack_f32p2vlx1_f32_sme`, but +// with a name that is recognized by our tooling. 
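+//
+// Although registered under an SME2 name, this wrapper calls the same SME
+// packing routine as the plain neonsme variant: the packed LHS layout is
+// shared between the SME and SME2 f32 matmul kernels, so only the dispatch
+// entry differs.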
+void xnn_x32_pack_lh_ukernel__neonsme2(size_t m, size_t k, size_t mr_packed, + size_t kr, size_t sr, size_t m_idx_start, + const float* XNN_RESTRICT lhs, + size_t lhs_stride, + void* XNN_RESTRICT lhs_packed) { +#if XNN_ENABLE_KLEIDIAI + if (mr_packed == 1) { + memcpy(lhs_packed, lhs, sizeof(float) * k); + } else { + kai_run_lhs_pack_f32p2vlx1_f32_sme(m, k, mr_packed, kr, sr, m_idx_start, + lhs, lhs_stride, lhs_packed); + } +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x32_pack_lh_size__neonsme2(size_t m, size_t k, size_t mr_packed, + size_t kr, size_t sr) { +#if XNN_ENABLE_KLEIDIAI + if (mr_packed == 1) { + return m * sizeof(float) * k; + } else { + return kai_get_lhs_packed_size_lhs_pack_f32p2vlx1_f32_sme(m, k, mr_packed, + kr, sr); + } +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} + +size_t xnn_x32_pack_lh_offset__neonsme2(size_t m, size_t k, size_t mr_packed, + size_t kr, size_t sr) { +#if XNN_ENABLE_KLEIDIAI + if (mr_packed == 1) { + return m * sizeof(float) * k; + } else { + return kai_get_lhs_packed_offset_lhs_pack_f32p2vlx1_f32_sme( + m, k, mr_packed, kr, sr); + } +#else + assert("Not compiled with XNN_ENABLE_KLEIDIAI" && 0); + return 0; +#endif // XNN_ENABLE_KLEIDIAI +} diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 6584c1da78c..75537989107 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -31,6 +31,8 @@ XNN_INTERNAL const struct xnn_pack_lh_config* xnn_init_x16_pack_lh_config(); XNN_INTERNAL const struct xnn_pack_lh_config* xnn_init_x32_pack_lh_config(); XNN_INTERNAL const struct xnn_pack_lh_config* xnn_init_x8_igemm_pack_lh_config(); +XNN_INTERNAL const struct xnn_pack_lh_config* +xnn_init_x16_igemm_pack_lh_config(); XNN_INTERNAL const struct xnn_binary_elementwise_config* xnn_init_f16_vadd_config(); diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h index 396918f68b1..ae022b7eea7 100644 --- a/src/xnnpack/gemm.h +++ b/src/xnnpack/gemm.h @@ -362,6 +362,22 @@ size_t xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme2_get_nr(); DECLARE_PQS8_QC8W_PACKED_IGEMM_MINMAX_UKERNEL_FUNCTION( xnn_pqs8_qc8w_igemm_minmax_fp32_ukernel_32x32c4__neonsme2) +#define DECLARE_PF16_F16_PACKED_IGEMM_MINMAX_UKERNEL_FUNCTION(fn_name) \ + \ + XNN_INTERNAL size_t fn_name##_get_mr(); \ + XNN_INTERNAL size_t fn_name##_get_nr(); \ + \ + XNN_INTERNAL void fn_name(size_t mr, size_t nc, size_t kc, size_t ks, \ + const void* packed_lhs, const void* w, \ + xnn_float16* c, size_t cm_stride, \ + const struct xnn_f16_minmax_params* params); + +DECLARE_PF16_F16_PACKED_IGEMM_MINMAX_UKERNEL_FUNCTION( + xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2) + +DECLARE_PF16_F16_PACKED_IGEMM_MINMAX_UKERNEL_FUNCTION( + xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme) + #define DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name(size_t mr, size_t nc, size_t kc, const float* a, \ size_t a_stride, const float* w, float* c, \ @@ -1260,34 +1276,62 @@ DECLARE_F32_GEMM_RELU_UKERNEL_FUNCTION(xnn_f32_gemm_relu_ukernel_7x4v__rvv) DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4v__rvv) DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_7x4v__rvv) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_1x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_2x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_3x32__hvx_broadcast) 
-DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_5x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_7x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_8x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_9x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_10x32__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_1x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_2x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_3x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_5x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_7x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_8x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_9x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_10x64__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_1x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_2x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_3x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_5x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_7x128__hvx_broadcast) -DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_8x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_1x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_2x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_3x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_4x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_5x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_6x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_7x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_8x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_9x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_10x32__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_1x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_2x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_3x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + 
xnn_f32_gemm_minmax_ukernel_4x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_5x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_6x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_7x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_8x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_9x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_10x64__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_1x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_2x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_3x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_4x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_5x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_6x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_7x128__hvx_broadcast) +DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION( + xnn_f32_gemm_minmax_ukernel_8x128__hvx_broadcast) #define DECLARE_F32_QC4W_GEMM_MINMAX_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name(size_t mr, size_t nr, size_t k, const float* a, \ diff --git a/src/xnnpack/internal.h b/src/xnnpack/internal.h index 6af849cf0c3..f5f9e46f860 100644 --- a/src/xnnpack/internal.h +++ b/src/xnnpack/internal.h @@ -8,6 +8,7 @@ #ifndef XNNPACK_SRC_XNNPACK_INTERNAL_H_ #define XNNPACK_SRC_XNNPACK_INTERNAL_H_ +#include #include #include @@ -15,7 +16,6 @@ #include "src/xnnpack/config-types.h" #include "src/xnnpack/math.h" #include "src/xnnpack/subgraph.h" -#include // Runtime values marked with this flag should be cleaned up (i.e. deallocated) // by the runtime. 
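The microfnptr.h typedefs in the next hunk define the contract between the x16 pack-lh IGEMM helpers and the new pf16 kernels. For orientation, here is a minimal sketch of the intended call sequence, mirroring what Test_PF16 later in this patch does: size query, LHS packing, then the tiled matmul. The wrapper name run_pf16_igemm_once is illustrative only, not an XNNPACK API; the indirection table and packed RHS are assumed to be prepared by the caller, and tiling over m > mr, buffer alignment, and error handling are omitted.

// Sketch only (not part of the patch): drive the pf16 packed-LHS IGEMM path.
#include <cstdlib>
#include "src/xnnpack/gemm.h"
#include "src/xnnpack/microparams-init.h"
#include "src/xnnpack/pack-lh.h"

void run_pf16_igemm_once(size_t m, size_t nc, size_t kc, size_t ks,
                         const void** indirection, const void* packed_weights,
                         xnn_float16* output, size_t cm_stride,
                         xnn_float16 out_min, xnn_float16 out_max) {
  // The 32x32c2 kernel packs k in pairs (kr = 2, sr = 1).
  const size_t mr =
      xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_mr();
  const size_t kr = 2;
  const size_t sr = 1;

  // 1. Query the packed-LHS footprint in bytes and allocate it.
  const size_t lhs_bytes = xnn_x16_pack_lh_size__igemm_neonsme2(
      m, kc, ks, /*mr_packed=*/mr, kr, sr);
  void* packed_lhs = std::malloc(lhs_bytes);

  // 2. Gather the ks x mr indirection pointers into the packed layout.
  xnn_x16_pack_lh_ukernel__igemm_neonsme2(m, kc, ks, /*mr_packed=*/mr, kr, sr,
                                          indirection, /*a_offset=*/0,
                                          /*zero=*/nullptr, packed_lhs);

  // 3. One tiled matmul over the packed operands. kc and cm_stride are in
  //    elements here; per the comment in Test_PF16, the neonsme2 wrapper
  //    converts cm_stride to bytes before calling into KleidiAI.
  struct xnn_f16_minmax_params params;
  xnn_init_f16_minmax_scalar_params(&params, out_min, out_max);
  xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2(
      m, nc, kc, ks, packed_lhs, packed_weights, output, cm_stride, &params);

  std::free(packed_lhs);
}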
diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h index ed19f0a6445..542e3a3d9c5 100644 --- a/src/xnnpack/microfnptr.h +++ b/src/xnnpack/microfnptr.h @@ -233,6 +233,16 @@ typedef void (*xnn_packed_lhs_igemm_ukernel_fn)( size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs, const void* w, int8_t* c, size_t cm_stride, const void* params); +typedef void (*xnn_packed_f16_lhs_igemm_ukernel_fn)( + size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs, + const void* w, void* c, size_t cm_stride, + const struct xnn_f16_minmax_params* params); + +typedef void (*xnn_pf16_f16_packed_igemm_minmax_ukernel_fn)( + size_t mr, size_t nc, size_t kc, size_t ks, const void* packed_lhs, + const void* w, xnn_float16* c, size_t cm_stride, + const struct xnn_f16_minmax_params* params); + typedef void (*xnn_f32_igemm_ukernel_fn)( size_t mr, size_t nr, size_t kc, size_t ks, const float** a, const float* w, float* c, size_t cm_stride, size_t cn_stride, size_t a_offset, diff --git a/src/xnnpack/operator-type-defs.inc b/src/xnnpack/operator-type-defs.inc index daf7166bef5..9aa1f2a99e7 100644 --- a/src/xnnpack/operator-type-defs.inc +++ b/src/xnnpack/operator-type-defs.inc @@ -47,6 +47,8 @@ XNN_ENUM_ITEM(xnn_operator_type_convolution_nchw_f16, "Convolution (NCHW, F16)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nchw_f32, "Convolution (NCHW, F32)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_f16, "Convolution (NHWC, F16)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_f32, "Convolution (NHWC, F32)") +XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_pf16, + "Convolution (NHWC, PF16)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qdu8_f16_qc8w, "Convolution (NHWC, QD8, F16, QC8W)") XNN_ENUM_ITEM(xnn_operator_type_convolution_nhwc_qd8_f16_qc8w, diff --git a/src/xnnpack/pack-lh.h b/src/xnnpack/pack-lh.h index 62115605629..a239ab879a1 100644 --- a/src/xnnpack/pack-lh.h +++ b/src/xnnpack/pack-lh.h @@ -73,6 +73,7 @@ extern "C" { XNN_INTERNAL size_t igemm_packed_offset_fn( \ size_t m, size_t kc, size_t ks, size_t mr_packed, size_t kr, size_t sr); +#include "src/x16-pack-lh/x16-pack-lh-igemm.inc" #include "src/x8-pack-lh/x8-pack-lh-igemm.inc" #undef XNN_UKERNEL diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h index 014a8a3bf75..07d9b524522 100644 --- a/src/xnnpack/pack.h +++ b/src/xnnpack/pack.h @@ -513,6 +513,20 @@ XNN_INTERNAL size_t xnn_packed_stride_kai_qb4_weights_and_biases( size_t k_stride, // size_t extra_bytes); +XNN_INTERNAL void xnn_pack_kai_f16_conv_goki_w_sme(size_t g, // + size_t nc, // + size_t ks, // + size_t kc, // + size_t nr, // + size_t kr, // + size_t sr, // + const uint16_t* k, // + const uint16_t* b, // + const void* scale, // + void* packed_weights, // + size_t extra_bytes, // + const void* params); + XNN_INTERNAL void xnn_pack_kai_qs8_conv_goki_w_sme2( size_t g, // size_t nc, // @@ -527,6 +541,13 @@ XNN_INTERNAL void xnn_pack_kai_qs8_conv_goki_w_sme2( void* packed_weights, // size_t extra_bytes, // const struct xnn_qs8_packing_params* params); + +// Returns the packed RHS size (in bytes) per group for KAI f16 conv_goki +// packer. 
+XNN_INTERNAL size_t xnn_packed_size_kai_f16_conv_goki_w(size_t nc,  //
+                                                        size_t ks,  //
+                                                        size_t kc   //
+);
 #endif  // XNN_ENABLE_KLEIDIAI
 
 XNN_INTERNAL void xnn_pack_qs8_to_qu8_gemm_gio_w(
@@ -577,6 +598,17 @@ typedef void (*xnn_pack_f16_igemm_fn)(size_t g, size_t nc, size_t ks, size_t kc,
                                       uint16_t* packed_weights,
                                       size_t extra_bytes, const void* params);
 
+/*
+ * KleidiAI variant: same signature as xnn_pack_f16_igemm_fn, except that the
+ * destination pointer type is void*. Some KleidiAI packers (e.g.
+ * xnn_pack_kai_f16_conv_goki_w_sme) use void* for the packed_weights
+ * parameter. Provide a typedef so that tests can pass these packers while
+ * keeping type safety at the call site.
+ */
+typedef void (*xnn_pack_f16_igemm_void_dst_fn)(
+    size_t g, size_t nc, size_t ks, size_t kc, size_t nr, size_t kr, size_t sr,
+    const uint16_t* kernel, const uint16_t* bias, const void* scale,
+    void* packed_weights, size_t extra_bytes, const void* params);
+
 XNN_INTERNAL void xnn_pack_f16_conv_goki_w(
     size_t g, size_t nc, size_t ks, size_t kc, size_t nr, size_t kr, size_t sr,
     const uint16_t* kernel, const uint16_t* bias, const void* scale,
diff --git a/test/BUILD.bazel b/test/BUILD.bazel
index 494324782db..b530049ef5a 100644
--- a/test/BUILD.bazel
+++ b/test/BUILD.bazel
@@ -383,6 +383,17 @@ xnnpack_unit_test(
     ],
 )
 
+xnnpack_unit_test(
+    name = "pf16_f16_igemm_minmax_test",
+    srcs = [
+        "pf16-f16-igemm-minmax.cc",
+    ],
+    defines = xnnpack_kleidiai_defines(),
+    deps = MICROKERNEL_TEST_DEPS + [
+        ":gemm_microkernel_tester",
+    ],
+)
+
 xnnpack_unit_test(
     name = "f16_f32acc_igemm_minmax_test",
     srcs = [
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index cce34cbb3b7..8046bb35eca 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -206,6 +206,7 @@ SET(MICROKERNEL_GEMM_UNIT_TESTS
   f32-qc8w-gemm-minmax
   f32-qc8w-gemm-relu
   pf16-gemm-minmax
+  pf16-f16-igemm-minmax
   pf32-gemm-minmax
   pqs8-qc8w-gemm-minmax
   qd8-f16-qb4w-gemm-minmax
diff --git a/test/gemm-microkernel-tester.cc b/test/gemm-microkernel-tester.cc
index 9ace2fcf71b..d8fe4bac9a6 100644
--- a/test/gemm-microkernel-tester.cc
+++ b/test/gemm-microkernel-tester.cc
@@ -1,5 +1,7 @@
 #include "test/gemm-microkernel-tester.h"
 
+#include 
+
 #include 
 #include 
 #include 
@@ -10,7 +12,6 @@
 #include 
 #include 
 
-#include 
 #include "include/xnnpack.h"
 #include "src/xnnpack/buffer.h"
 #include "src/xnnpack/common.h"
@@ -2382,7 +2383,7 @@ void GemmMicrokernelTester::Test_PF16(
                                       "input_f16");
   xnnpack::Buffer<xnn_float16> weights(n() * k(), /*extra_bytes=*/{0},
                                        "weights");
-  xnnpack::Buffer<xnn_float16> bias(n(), 0.0f);
+  xnnpack::Buffer<xnn_float16> bias(n(), xnn_float16_from_float(0.0f));
   xnnpack::Buffer<xnn_float16> c((m() - 1) * cm_stride() + n(),
                                  /*extra_bytes=*/{0}, "c");
   xnnpack::Buffer<float> c_ref(m() * n(), 0, /*extra_bytes=*/{0},
@@ -2410,7 +2411,9 @@ void GemmMicrokernelTester::Test_PF16(
   ASSERT_NE(pack_lh_config, nullptr);
 
   // Loop over the iterations.
-  std::generate(input_f16.begin(), input_f16.end(), std::ref(f32rng));
+  for (size_t idx = 0; idx < input_f16.size(); idx++) {
+    input_f16[idx] = xnn_float16_from_float(f32rng());
+  }
 
   // Pack the left-hand operand.
   const size_t input_packed_size =
@@ -2422,8 +2425,12 @@ void GemmMicrokernelTester::Test_PF16(
                   /*lhs_stride=*/k() * sizeof(xnn_float16),
                   input_packed.data());
 
-  std::generate(weights.begin(), weights.end(), std::ref(f32rng));
-  std::generate(bias.begin(), bias.end(), std::ref(f32rng));
+  for (size_t idx = 0; idx < weights.size(); idx++) {
+    weights[idx] = xnn_float16_from_float(f32rng());
+  }
+  for (size_t idx = 0; idx < bias.size(); idx++) {
+    bias[idx] = xnn_float16_from_float(f32rng());
+  }
 
   // RHS packing.
   pack(/*flags=*/0, &gemm_config, k(), n(),
@@ -2443,21 +2450,22 @@ void GemmMicrokernelTester::Test_PF16(
   // Compute 32-bit results and output quantization arguments.
   std::fill(c_ref.begin(), c_ref.end(), 0.0f);
   for (size_t m_index = 0; m_index < m(); m_index++) {
-    for (size_t n_index = 0; n_index < n(); n_index++) {
-      for (size_t k_index = 0; k_index < k(); k_index++) {
+      for (size_t n_index = 0; n_index < n(); n_index++) {
+        for (size_t k_index = 0; k_index < k(); k_index++) {
+          c_ref[m_index * n() + n_index] =
+              c_ref[m_index * n() + n_index] +
+              xnn_float16(input_f16[m_index * k() + k_index] *
+                          weights[n_index * k() + k_index]);
+        }
         c_ref[m_index * n() + n_index] =
-            c_ref[m_index * n() + n_index] +
-            xnn_float16(input_f16[m_index * k() + k_index] *
-                        weights[n_index * k() + k_index]);
+            c_ref[m_index * n() + n_index] + bias[n_index];
       }
-      c_ref[m_index * n() + n_index] =
-          c_ref[m_index * n() + n_index] + bias[n_index];
     }
-  }
 
   // Prepare parameters.
   xnn_f16_minmax_params minmax_params;
-  init_minmax_params(&minmax_params, min(), max());
+  init_minmax_params(&minmax_params, xnn_float16_from_float(min()),
+                     xnn_float16_from_float(max()));
 
   for (size_t m_index = 0; m_index < m(); m_index++) {
     for (size_t n_index = 0; n_index < n(); n_index++) {
@@ -2784,6 +2792,163 @@ void GemmMicrokernelTester::Test_PQS8(
       }
     }
   }
 }
+#if XNN_ENABLE_KLEIDIAI
+#if XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
+void GemmMicrokernelTester::Test_PF16(
+    xnn_pf16_f16_packed_igemm_minmax_ukernel_fn packed_igemm,
+    xnn_init_f16_minmax_params_fn init_minmax_params,
+    xnn_pack_lh_igemm_ukernel_fn pack_lh_for_igemm_fn,
+    xnn_pack_lh_igemm_size_fn size_for_igemm_fn,
+    xnn_pack_weights_and_biases_fn pack_wb,
+    xnn_packed_stride_weights_and_biases_fn packed_stride_wb) {
+  ASSERT_LE(m(), mr());
+  ASSERT_EQ(xnn_initialize(nullptr), xnn_status_success);
+
+  xnnpack::ReplicableRandomDevice rng;
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.f, 1.f),
+                          std::ref(rng));
+  const float max_abs_product = 1.0f;
+
+  // Inputs/weights/bias.
+  xnnpack::Buffer<xnn_float16> a_f16((mr() - 1) * a_stride() + k(),
+                                     xnnpack::XnnExtraBytes, "input_f16");
+  xnnpack::Buffer<xnn_float16> b_f16(n() * ks() * k(), /*extra_bytes=*/{0},
+                                     "weights_f16");
+  // Bias in FP16 for the conv_goki rhs_imatmul packer variant.
+  xnnpack::Buffer<xnn_float16> bias_f16(n(), xnn_float16_from_float(0.0f));
+
+  // The output buffer is FP16 as produced by the ukernel; we convert to float
+  // for the reference comparison.
+  xnnpack::Buffer<xnn_float16> c((m() - 1) * cm_stride() + n());
+  xnnpack::Buffer<float> c_ref(m() * n(), 0.0f);
+
+  // Fill inputs with proper fp16 conversion.
+  for (size_t idx = 0; idx < a_f16.size(); idx++) {
+    a_f16[idx] = xnn_float16_from_float(f32rng());
+  }
+  for (size_t idx = 0; idx < b_f16.size(); idx++) {
+    b_f16[idx] = xnn_float16_from_float(f32rng());
+  }
+  for (size_t idx = 0; idx < bias_f16.size(); idx++) {
+    bias_f16[idx] = xnn_float16_from_float(f32rng());
+  }
+
+  // Prepare im2col pointers (elements).
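+  // Layout note: im2col[ks_index * mr() + m_index] holds the input pointer
+  // for kernel tap ks_index and output row m_index. Rows at or beyond m()
+  // are pointed at junk data, and when zero_index() is set, that row of
+  // every tap is redirected to the zero pointer.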
+  xnnpack::Buffer<const xnn_float16*> im2col(mr() * ks());
+  // The junk data needs to be initialized for some kernels because msan will
+  // assert in functions like lrintf, etc.
+  xnnpack::Buffer<xnn_float16> junk(k(), 0, xnnpack::XnnExtraBytes);
+  // Zero row for the packer when zero_index is used.
+  xnnpack::Buffer<xnn_float16> zero_row(k(), 0);
+
+  for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
+    for (size_t m_index = 0; m_index < mr(); m_index++) {
+      im2col[ks_index * mr() + m_index] =
+          a_f16.data() + a_stride() * m_index - a_offset();
+    }
+  }
+  std::shuffle(im2col.begin(), im2col.end(), rng);
+  if (zero_index() != SIZE_MAX) {
+    for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
+      im2col[ks_index * mr() + zero_index()] = a_f16.data();
+    }
+  }
+  for (size_t ks_index = 0; ks_index < ks(); ks_index++) {
+    for (size_t m_index = m(); m_index < mr(); m_index++) {
+      im2col[ks_index * mr() + m_index] = junk.data();
+    }
+  }
+
+  // Compute the packed weights buffer size using the conv_goki packer
+  // directly; the size helper returns bytes.
+  const size_t packed_rhs_size =
+      xnn_packed_size_kai_f16_conv_goki_w(n(), ks(), k());
+  xnnpack::Buffer<char> packed_w(packed_rhs_size, /*extra_bytes=*/{0},
+                                 "packed_w_f16");
+  std::fill(packed_w.begin(), packed_w.end(), 0);
+
+  // Pack the RHS (weights + FP16 bias) for IGEMM using the conv_goki path.
+  xnn_pack_kai_f16_conv_goki_w_sme(
+      /*g=*/1,
+      /*nc=*/n(),
+      /*ks=*/ks(),
+      /*kc=*/k(),
+      /*nr=*/nr(),
+      /*kr=*/kr(),
+      /*sr=*/sr(),
+      /*k=*/reinterpret_cast<const uint16_t*>(b_f16.data()),
+      /*b=*/reinterpret_cast<const uint16_t*>(bias_f16.data()),
+      /*scale=*/nullptr,
+      /*packed_weights=*/packed_w.data(),
+      /*extra_bytes=*/0,
+      /*params=*/nullptr);
+
+  // Pack the LHS for IGEMM.
+  const size_t packed_lhs_size =
+      size_for_igemm_fn(m(), k(), ks(), mr_packed(), kr(), sr());
+  xnnpack::Buffer<char> packed_lhs(packed_lhs_size);
+  const void* zero_pointer = (zero_index() != SIZE_MAX)
+                                 ? static_cast<const void*>(a_f16.data())
+                                 : nullptr;
+  pack_lh_for_igemm_fn(
+      m(), k(), ks(), mr_packed(), kr(), sr(), (const void**)im2col.data(),
+      a_offset() * sizeof(xnn_float16), zero_pointer, packed_lhs.data());
+
+  // Reference computation (float accumulation).
+  std::fill(c_ref.begin(), c_ref.end(), 0.0f);
+  for (size_t m_index = 0; m_index < m(); m_index++) {
+    for (size_t n_index = 0; n_index < n(); n_index++) {
+      float acc = 0.0f;
+      const size_t k_eff = ks() * k();
+      for (size_t kk = 0; kk < k_eff; kk++) {
+        const size_t ks_index = kk / k();
+        const size_t k_index = kk % k();
+        const xnn_float16* base = im2col[ks_index * mr() + m_index];
+        const float a_val = xnn_float16_to_float(
+            (base == a_f16.data()) ? base[k_index]
+                                   : base[k_index + a_offset()]);
+        const float b_val = xnn_float16_to_float(
+            b_f16[(n_index * ks() + ks_index) * k() + k_index]);
+        acc += a_val * b_val;
+      }
+      acc += xnn_float16_to_float(bias_f16[n_index]);
+      c_ref[m_index * n() + n_index] = acc;
+    }
+  }
+
+  // MinMax clamp.
+  xnn_f16_minmax_params minmax_params;
+  init_minmax_params(&minmax_params, xnn_float16_from_float(min()),
+                     xnn_float16_from_float(max()));
+  for (size_t mi = 0; mi < m(); mi++) {
+    for (size_t ni = 0; ni < n(); ni++) {
+      c_ref[mi * n() + ni] =
+          std::max(std::min(c_ref[mi * n() + ni], max()), min());
+    }
+  }
+
+  // Run the kernel.
+  // Pass cm_stride in elements; the ukernel wrapper converts it to bytes when
+  // calling KAI.
+  packed_igemm(m(), n(), k(), ks(), packed_lhs.data(), packed_w.data(),
+               c.data(), cm_stride(), &minmax_params);
+
+  const float tolerance =
+      compute_sum_tolerance(max_abs_product, ks() * k(),
+                            xnnpack::NumericLimits<xnn_float16>::epsilon());
+  for (size_t i = 0; i < m(); i++) {
+    for (size_t j = 0; j < n(); j++) {
+      const float c_val = xnn_float16_to_float(c[i * cm_stride() + j]);
+      ASSERT_NEAR(c_val, c_ref[i * n() + j], tolerance)
+          << "at " << i << ", " << j << ": reference = " << c_ref[i * n() + j]
+          << ", optimized = " << c_val << ", Mr x Nr x Kr = " << mr() << " x "
+          << nr() << " x " << kr() << ", M x N x K = " << m() << " x " << n()
+          << " x " << k() << ", ks = " << ks()
+          << ", cm_stride = " << cm_stride();
+    }
+  }
+}
+#endif  // XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
+#endif  // XNN_ENABLE_KLEIDIAI
 
 void GemmMicrokernelTester::Test(
     xnn_qp8_f32_qb4w_gemm_minmax_ukernel_fn gemm,
diff --git a/test/gemm-microkernel-tester.h b/test/gemm-microkernel-tester.h
index 98c6e16d399..5db8546115c 100644
--- a/test/gemm-microkernel-tester.h
+++ b/test/gemm-microkernel-tester.h
@@ -9,6 +9,8 @@
 #ifndef XNNPACK_TEST_GEMM_MICROKERNEL_TESTER_H_
 #define XNNPACK_TEST_GEMM_MICROKERNEL_TESTER_H_
 
+#include 
+
 #include 
 #include 
 #include 
@@ -19,7 +21,6 @@
 #include 
 #include 
 
-#include 
 #include "src/xnnpack/math.h"
 #include "src/xnnpack/microfnptr.h"
 #include "src/xnnpack/pack.h"
@@ -350,6 +351,16 @@ class GemmMicrokernelTester {
                  xnn_pack_weights_and_biases_fn pack,
                  xnn_packed_stride_weights_and_biases_fn packed_stride);
 
+#if XNN_ENABLE_ARM_SME2 || XNN_ENABLE_ARM_SME
+  // PF16 packed-LHS IGEMM (weights_and_biases API + packed_stride).
+  void Test_PF16(xnn_pf16_f16_packed_igemm_minmax_ukernel_fn packed_igemm,
+                 xnn_init_f16_minmax_params_fn init_minmax_params,
+                 xnn_pack_lh_igemm_ukernel_fn pack_lh_for_igemm_fn,
+                 xnn_pack_lh_igemm_size_fn size_for_igemm_fn,
+                 xnn_pack_weights_and_biases_fn pack_wb,
+                 xnn_packed_stride_weights_and_biases_fn packed_stride_wb);
+#endif
+
   void Test_PQS8(xnn_pqs8_qc8w_gemm_minmax_ukernel_fn gemm,
                  xnn_init_qs8_qc8w_conv_minmax_params_fn init_minmax_params,
                  xnn_pack_weights_and_biases_fn pack,
diff --git a/test/pf16-f16-igemm-minmax.cc b/test/pf16-f16-igemm-minmax.cc
new file mode 100644
index 00000000000..6faa30c4bba
--- /dev/null
+++ b/test/pf16-f16-igemm-minmax.cc
@@ -0,0 +1,358 @@
+// clang-format off
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/pf16-f16-igemm-minmax.yaml
+//   Generator: tools/generate-gemm-test.py
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "src/xnnpack/allocator.h"
+#include "src/xnnpack/common.h"
+#include "src/xnnpack/gemm.h"
+#include "src/xnnpack/hardware-config.h"
+#include "src/xnnpack/igemm.h"
+#include "src/xnnpack/microparams-init.h"
+#include "src/xnnpack/pack-lh.h"
+#include "src/xnnpack/pack.h"
+#include "src/xnnpack/packw.h"
+#include "src/xnnpack/ppmm.h"
+#include "src/xnnpack/requantization.h"
+#include "test/gemm-microkernel-tester.h"
+#include "test/next_prime.h"
+
+namespace {
+
+struct ConstantOrFunction {
+  ConstantOrFunction(size_t x) : fn([x]() { return x; }) {}  //NOLINT
+  ConstantOrFunction(int x) : fn([x]() { return x; }) {}  //NOLINT
+  template <typename Fn>
+  ConstantOrFunction(Fn fn) : fn(std::move(fn)) {}  //NOLINT
+
+  std::function<size_t()> fn;
+
+  operator size_t() const { return fn(); }  //NOLINT
+};
+
+}  // namespace
+
+
+namespace {
+
+// NOLINTNEXTLINE(clang-diagnostic-unused-function)
+std::vector<GemmTestParams> CreateTests1(
+    size_t k_block, size_t adj_k_block,
+    ConstantOrFunction mr, ConstantOrFunction nr, size_t kr, size_t sr,
+    ConstantOrFunction mr_packed,
+    bool is_igemm,
+    bool unsigned_inputs,
+    uint8_t planes,
+    std::function<void(GemmMicrokernelTester& tester)> test_func,
+    uint64_t arch_flags = 0) {
+  std::string kbs = std::to_string(k_block);
+  std::string kb2s = std::to_string(k_block * 2);
+  std::string akbs = std::to_string(adj_k_block);
+  std::string nrs = std::to_string(nr);
+
+  const GemmMicrokernelTester tester = GemmMicrokernelTester()
+      .mr(mr).nr(nr).kr(kr).sr(sr).mr_packed(mr_packed).unsigned_inputs(unsigned_inputs).planes(planes);
+
+  std::vector<GemmTestParams> gemm_tests;
+  gemm_tests.reserve(42);
+
+  gemm_tests.push_back(GemmTestParams(
+      "k_eq_" + kbs,
+      tester.clone()
+          .m(mr).n(nr).k(k_block)
+    , test_func, arch_flags));
+  gemm_tests.push_back(GemmTestParams(
+      "k_eq_" + kbs + "_subtile",
+      tester.clone()
+          .k(k_block)
+    , test_func, arch_flags)
+    .loop_n(1, nr)
+    .loop_m(1, mr));
+  gemm_tests.push_back(GemmTestParams(
+      "k_eq_" + kbs + "_subtile_m",
+      tester.clone()
+          .n(nr).k(k_block)
+    , test_func, arch_flags)
+    .loop_m(1, mr));
+  gemm_tests.push_back(GemmTestParams(
+      "k_eq_" + kbs + "_subtile_n",
+      tester.clone()
+          .m(mr).k(k_block)
+    , test_func, arch_flags)
+    .loop_n(1, nr));
+  if (k_block > 1) {
+    gemm_tests.push_back(GemmTestParams(
+        "k_lt_" + akbs,
+        tester.clone()
+            .m(mr).n(nr)
+      , test_func, arch_flags)
+      .loop_k(1, adj_k_block - 1));
+    gemm_tests.push_back(GemmTestParams(
+        "k_lt_" + akbs + "_subtile",
+        tester.clone()
+      , test_func, arch_flags)
+      .loop_k(1, adj_k_block - 1)
+      .loop_n(1, nr)
+      .loop_m(1, mr));
+  }
+  gemm_tests.push_back(GemmTestParams(
+      "k_gt_" + akbs,
+      tester.clone()
+          .m(mr).n(nr)
+    , test_func, arch_flags)
+    .loop_k(adj_k_block + 1, adj_k_block * 2 - 1, k_block));
+  gemm_tests.push_back(GemmTestParams(
+      "k_gt_" + akbs + "_subtile",
+      tester.clone()
+    , test_func, arch_flags)
+    .loop_k(adj_k_block + 1, adj_k_block * 2 - 1, k_block)
+    .loop_n(1, nr)
+    .loop_m(1, mr));
+  if (k_block > 1) {
+    gemm_tests.push_back(GemmTestParams(
+        "k_div_" + kbs,
+        tester.clone()
+            .m(mr).n(nr)
+      , test_func, arch_flags)
+      .loop_k(adj_k_block + k_block, k_block * 5, k_block));
+    gemm_tests.push_back(GemmTestParams(
+        "k_div_" + kbs + "_subtile",
+        tester.clone()
+      , test_func, arch_flags)
+      .loop_k(adj_k_block + k_block, k_block * 5, k_block)
+      .loop_n(1, nr)
+      .loop_m(1, mr));
+  }
+  gemm_tests.push_back(GemmTestParams(
+      "n_gt_" + nrs,
+      tester.clone()
+          .m(mr)
+    ,
test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "n_gt_" + nrs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1) + .loop_k(1, k_block * 3, k_block + 1) + .loop_m(1, mr)); + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs, + tester.clone() + .m(mr) + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs + "_subtile", + tester.clone() + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block + 1) + .loop_m(1, mr)); + if (is_igemm) { + gemm_tests.push_back(GemmTestParams( + "small_kernel", + tester.clone() + .m(mr).n(nr).ks(3) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "small_kernel_subtile", + tester.clone() + .ks(3) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1) + .loop_n(1, nr) + .loop_m(1, mr)); + gemm_tests.push_back(GemmTestParams( + "n_gt_" + nrs + "_small_kernel", + tester.clone() + .m(mr).ks(3) + , test_func, arch_flags) + .loop_n(nr + 1, nr * 2 - 1) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "n_div_" + nrs + "_small_kernel", + tester.clone() + .m(mr).ks(3) + , test_func, arch_flags) + .loop_n(nr * 2, nr * 3, nr) + .loop_k(1, k_block * 3, k_block + 1)); + } + gemm_tests.push_back(GemmTestParams( + "strided_cm_subtile", + tester.clone() + .mr(mr).nr(nr).kr(kr).sr(sr) + .cm_stride(xnnpack::NextPrime(nr + 1)) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1) + .loop_n(1, nr) + .loop_m(1, mr)); + if (is_igemm) { + gemm_tests.push_back(GemmTestParams( + "a_offset", + tester.clone() + .m(mr).n(nr).ks(3) + .a_offset(xnnpack::NextPrime(mr * k_block * 3 + 1)) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1)); + gemm_tests.push_back(GemmTestParams( + "zero", + tester.clone() + .m(mr).n(nr).ks(3) + .a_offset(xnnpack::NextPrime(mr * k_block * 3 + 1)) + , test_func, arch_flags) + .loop_k(1, k_block * 3, k_block + 1) + .loop_zi(0, mr - 1)); + } + gemm_tests.push_back(GemmTestParams( + "min", + tester.clone() + .m(mr).n(nr).k(k_block).min(0.0f) + , test_func, arch_flags)); + gemm_tests.push_back(GemmTestParams( + "max", + tester.clone() + .m(mr).n(nr).k(k_block).max(0.0f) + , test_func, arch_flags)); + gemm_tests.push_back(GemmTestParams( + "strided_cm", + tester.clone() + .m(mr).n(nr).k(k_block) + .cm_stride(xnnpack::NextPrime(nr + 1)) + , test_func, arch_flags)); + + return gemm_tests; +} + +} // namespace + + +#if XNN_ENABLE_ARM_SME2 && XNN_ARCH_ARM64 + #if XNN_ENABLE_ARM_SME2 && XNN_ENABLE_KLEIDIAI + INSTANTIATE_TEST_SUITE_P( + PF16_F16_IGEMM_MINMAX_FP16_32X32C2__NEONSME2, GemmTest, + testing::ValuesIn(CreateTests1( + /*k_block=*/2, + /*adj_k_block=*/2, + /*mr=*/[]() -> size_t { + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + if (hardware_config != nullptr && (hardware_config->arch_flags & xnn_arch_arm_sme2) == xnn_arch_arm_sme2) { + return xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_mr(); + } else { + return 0; + } + } + , /*nr=*/[]() -> size_t { + const struct xnn_hardware_config* hardware_config = + xnn_init_hardware_config(); + if (hardware_config != nullptr && (hardware_config->arch_flags & xnn_arch_arm_sme2) == xnn_arch_arm_sme2) { + return xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_nr(); + } else { + 
+            return 0;
+          }
+        }
+      , /*kr=*/2, /*sr=*/1,
+      /*mr_packed=*/[]() -> size_t {
+          const struct xnn_hardware_config* hardware_config =
+              xnn_init_hardware_config();
+          if (hardware_config != nullptr && (hardware_config->arch_flags & xnn_arch_arm_sme2) == xnn_arch_arm_sme2) {
+            return xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2_get_mr();
+          } else {
+            return 0;
+          }
+        }
+      ,
+      /*is_igemm=*/true,
+      /*unsigned_inputs=*/false,
+      /*planes=*/1,
+      [](GemmMicrokernelTester& tester) {
+        tester.Test_PF16(xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2,
+                         xnn_init_f16_minmax_scalar_params,
+                         xnn_x16_pack_lh_ukernel__igemm_neonsme2,
+                         xnn_x16_pack_lh_size__igemm_neonsme2,
+                         xnn_pack_kai_f16_weights_and_biases,
+                         xnn_packed_stride_kai_f16_weights_and_biases);
+      },
+      xnn_arch_arm_sme2)),
+  [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+    return info.param.test_name;
+  });
+
+  #endif  // XNN_ENABLE_ARM_SME2 && XNN_ENABLE_KLEIDIAI
+#endif  // XNN_ENABLE_ARM_SME2 && XNN_ARCH_ARM64
+
+
+#if XNN_ENABLE_ARM_SME && XNN_ARCH_ARM64
+  #if XNN_ENABLE_ARM_SME && XNN_ENABLE_KLEIDIAI
+  INSTANTIATE_TEST_SUITE_P(
+      PF16_F16_IGEMM_MINMAX_FP16_32X32C2__NEONSME, GemmTest,
+      testing::ValuesIn(CreateTests1(
+          /*k_block=*/2,
+          /*adj_k_block=*/2,
+          /*mr=*/[]() -> size_t {
+              const struct xnn_hardware_config* hardware_config =
+                  xnn_init_hardware_config();
+              if (hardware_config != nullptr && (hardware_config->arch_flags & xnn_arch_arm_sme) == xnn_arch_arm_sme) {
+                return xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme_get_mr();
+              } else {
+                return 0;
+              }
+            }
+          , /*nr=*/[]() -> size_t {
+              const struct xnn_hardware_config* hardware_config =
+                  xnn_init_hardware_config();
+              if (hardware_config != nullptr && (hardware_config->arch_flags & xnn_arch_arm_sme) == xnn_arch_arm_sme) {
+                return xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme_get_nr();
+              } else {
+                return 0;
+              }
+            }
+          , /*kr=*/2, /*sr=*/1,
+          /*mr_packed=*/[]() -> size_t {
+              const struct xnn_hardware_config* hardware_config =
+                  xnn_init_hardware_config();
+              if (hardware_config != nullptr && (hardware_config->arch_flags & xnn_arch_arm_sme) == xnn_arch_arm_sme) {
+                return xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme_get_mr();
+              } else {
+                return 0;
+              }
+            }
+          ,
+          /*is_igemm=*/true,
+          /*unsigned_inputs=*/false,
+          /*planes=*/1,
+          [](GemmMicrokernelTester& tester) {
+            tester.Test_PF16(xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme,
+                             xnn_init_f16_minmax_scalar_params,
+                             xnn_x16_pack_lh_ukernel__igemm_neonsme,
+                             xnn_x16_pack_lh_size__igemm_neonsme,
+                             xnn_pack_kai_f16_weights_and_biases,
+                             xnn_packed_stride_kai_f16_weights_and_biases);
+          },
+          xnn_arch_arm_sme)),
+      [](const testing::TestParamInfo<GemmTest::ParamType>& info) {
+        return info.param.test_name;
+      });
+
+  #endif  // XNN_ENABLE_ARM_SME && XNN_ENABLE_KLEIDIAI
+#endif  // XNN_ENABLE_ARM_SME && XNN_ARCH_ARM64
+
diff --git a/test/pf16-f16-igemm-minmax.yaml b/test/pf16-f16-igemm-minmax.yaml
new file mode 100644
index 00000000000..5940d31d950
--- /dev/null
+++ b/test/pf16-f16-igemm-minmax.yaml
@@ -0,0 +1,24 @@
+# Copyright 2025 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+ +# Arm KleidiAI PF16 packed-LHS IGEMM kernels +- name: xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme2 + init: xnn_init_f16_minmax_scalar_params + pack-lh-fn: xnn_x16_pack_lh_ukernel__igemm_neonsme2 + pack-lh-size-fn: xnn_x16_pack_lh_size__igemm_neonsme2 + pack: xnn_pack_kai_f16_weights_and_biases + packed-stride: xnn_packed_stride_kai_f16_weights_and_biases + cpp-check: XNN_ENABLE_ARM_SME2 && XNN_ENABLE_KLEIDIAI + k-block: 2 + mr-packed: 32 +- name: xnn_pf16_f16_igemm_minmax_fp16_ukernel_32x32c2__neonsme + init: xnn_init_f16_minmax_scalar_params + pack-lh-fn: xnn_x16_pack_lh_ukernel__igemm_neonsme + pack-lh-size-fn: xnn_x16_pack_lh_size__igemm_neonsme + pack: xnn_pack_kai_f16_weights_and_biases + packed-stride: xnn_packed_stride_kai_f16_weights_and_biases + cpp-check: XNN_ENABLE_ARM_SME && XNN_ENABLE_KLEIDIAI + k-block: 2 + mr-packed: 32 \ No newline at end of file
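For readers wiring this path up outside the tester: the yaml above only registers the microkernel tests, but the indirection buffer those tests shuffle is the same ks x mr pointer table any IGEMM caller must provide. Below is a minimal sketch of building such a table under the convention Test_PF16 exercises; build_indirection and row_of are illustrative names, not XNNPACK API, and how taps map to rows depends entirely on the caller's im2col scheme.

// Sketch only: assemble the ks x mr indirection table consumed by
// xnn_x16_pack_lh_ukernel__igemm_neonsme{,2}. Entries are laid out
// ks-major ([s * mr + row]), matching how Test_PF16 fills `im2col`.
#include <cstddef>
#include <cstdint>
#include <vector>

template <typename RowOf>
std::vector<const void*> build_indirection(size_t mr, size_t ks,
                                           const void* input,
                                           size_t row_stride_bytes,
                                           const void* zero_row,
                                           RowOf row_of) {
  std::vector<const void*> indirection(ks * mr);
  for (size_t s = 0; s < ks; s++) {
    for (size_t row = 0; row < mr; row++) {
      // row_of(s, row) returns the source row index, or SIZE_MAX for a
      // padding tap that should read the zero row instead.
      const size_t src = row_of(s, row);
      indirection[s * mr + row] =
          (src == SIZE_MAX)
              ? zero_row
              : static_cast<const uint8_t*>(input) + src * row_stride_bytes;
    }
  }
  return indirection;
}

The pointer passed as `zero` to the pack-lh ukernel should compare equal to the zero-row entries: judging by the tester's reference loop, the packer identifies padding taps by pointer equality and does not apply a_offset to them.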