[clang] [llvm] [AArch64][clang][llvm] Add structured sparsity outer product (TMOP) intrinsics (PR #135145)

Fri Apr 11 03:46:39 PDT 2025

================
@@ -0,0 +1,112 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-tmop -target-feature +sme-f16f16 -target-feature +sme-b16b16 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_s8_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.stmopa.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_s8_s8(svint8x2_t zn, svint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_s8_s8,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_u8_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.utmopa.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_u8_u8(svuint8x2_t zn, svuint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_u8_u8,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_s8_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.sutmopa.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_s8_u8(svint8x2_t zn, svuint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_s8_u8,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_u8_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.ustmopa.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_u8_s8(svuint8x2_t zn, svint8_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_u8_s8,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_s16_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.stmopa.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_s16_s16(svint16x2_t zn, svint16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_s16_s16,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_u16_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.utmopa.nxv8i16(i32 1, <vscale x 8 x i16> [[ZN_COERCE0:%.*]], <vscale x 8 x i16> [[ZN_COERCE1:%.*]], <vscale x 8 x i16> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_u16_u16(svuint16x2_t zn, svuint16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_u16_u16,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_f16_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.tmopa.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_f16_f16,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za32_bf16_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.tmopa.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za32,_bf16_bf16,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za16_f16_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.tmopa.nxv8f16(i32 1, <vscale x 8 x half> [[ZN_COERCE0:%.*]], <vscale x 8 x half> [[ZN_COERCE1:%.*]], <vscale x 8 x half> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za16,_f16_f16,)(1, zn, zm, zk, 3);
+}
+
+// CHECK-LABEL: @test_svtmopa_lane_za16_bf16_bf16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.tmopa.nxv8bf16(i32 1, <vscale x 8 x bfloat> [[ZN_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_COERCE1:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_svtmopa_lane_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm, svuint8_t zk) __arm_streaming __arm_inout("za") {
+  SME_ACLE_FUNC(svtmopa_lane_za16,_bf16_bf16,)(1, zn, zm, zk, 3);
+}
+
+//void test_svtmopa_lane_za16_mf8_mf8_fpm(svfloat32x2_t zn, svfloat32_t zm, svuint8_t zk, fpm_t fpmr) __arm_streaming __arm_inout("za") {
----------------
jthackray wrote:

Thanks, now uncommented and running.

https://github.com/llvm/llvm-project/pull/135145