[clang] [llvm] [CLANG][LLVM][AArch64]Add SME2.1 intrinsics for MOVAZ tile to vector,… (PR #88499)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Apr 12 03:58:36 PDT 2024
https://github.com/CarolineConcatto created https://github.com/llvm/llvm-project/pull/88499
… single
According to the specification in
ARM-software/acle#309 this adds the intrinsics
// And similarly for u8.
svint8_t svreadz_hor_za8_s8(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za");
// And similarly for u16, bf16 and f16.
svint16_t svreadz_hor_za16_s16(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za");
// And similarly for u32 and f32.
svint32_t svreadz_hor_za32_s32(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za");
// And similarly for u64 and f64.
svint64_t svreadz_hor_za64_s64(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za");
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64 svint8_t svreadz_hor_za128_s8(uint64_t tile, uint32_t slice) __arm_streaming __arm_inout("za");
>From 0f41b6dd3381b95d69ee769ab2ea4a18e31614bd Mon Sep 17 00:00:00 2001
From: Caroline Concatto <caroline.concatto at arm.com>
Date: Thu, 11 Apr 2024 16:10:16 +0000
Subject: [PATCH] [CLANG][LLVM][AArch64]Add SME2.1 intrinsics for MOVAZ tile to
vector, single
According to the specification in
ARM-software/acle#309 this adds the intrinsics
// And similarly for u8.
svint8_t svreadz_hor_za8_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
// And similarly for u16, bf16 and f16.
svint16_t svreadz_hor_za16_s16(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
// And similarly for u32 and f32.
svint32_t svreadz_hor_za32_s32(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
// And similarly for u64 and f64.
svint64_t svreadz_hor_za64_s64(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
// And similarly for s16, s32, s64, u8, u16, u32, u64, bf16, f16, f32, f64
svint8_t svreadz_hor_za128_s8(uint64_t tile, uint32_t slice)
__arm_streaming __arm_inout("za");
---
clang/include/clang/Basic/arm_sme.td | 18 +
.../acle_sme2p1_movaz.c | 417 ++++++++++++++++
.../acle_sme2p1_imm.cpp | 21 +
llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 37 ++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 3 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 66 ++-
.../AArch64/sme2p1-intrinsics-movaz.ll | 445 ++++++++++++++++++
9 files changed, 1021 insertions(+), 3 deletions(-)
create mode 100644 clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
create mode 100644 clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp
create mode 100644 llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1ac6d5170ea283..10aa0d1709a74c 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -674,3 +674,21 @@ let TargetGuard = "sme2" in {
def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>;
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
}
+
+multiclass ZAReadz<string n_suffix, string t, string i_prefix, list<ImmCheck> ch> {
+ let TargetGuard = "sme2p1" in {
+ def NAME # _H : SInst<"svreadz_hor_" # n_suffix # "_{d}", "dim", t,
+ MergeNone, i_prefix # "_horiz",
+ [IsStreaming, IsInOutZA], ch>;
+
+ def NAME # _V : SInst<"svreadz_ver_" # n_suffix # "_{d}", "dim", t,
+ MergeNone, i_prefix # "_vert",
+ [IsStreaming, IsInOutZA], ch>;
+ }
+}
+
+defm SVREADZ_ZA8 : ZAReadz<"za8", "cUc", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_0>]>;
+defm SVREADZ_ZA16 : ZAReadz<"za16", "sUshb", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_1>]>;
+defm SVREADZ_ZA32 : ZAReadz<"za32", "iUif", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_3>]>;
+defm SVREADZ_ZA64 : ZAReadz<"za64", "lUld", "aarch64_sme_readz", [ImmCheck<0, ImmCheck0_7>]>;
+defm SVREADZ_ZA128 : ZAReadz<"za128", "csilUcUiUsUlbhfd", "aarch64_sme_readz_q", [ImmCheck<0, ImmCheck0_15>]>;
diff --git a/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
new file mode 100644
index 00000000000000..a0b5a882d53b21
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2p1-intrinsics/acle_sme2p1_movaz.c
@@ -0,0 +1,417 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2p1 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreadz_hor_za8_s8(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z23test_svreadz_hor_za8_s8j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svreadz_hor_za8_s8(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za8_s8(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreadz_hor_za8_u8(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z23test_svreadz_hor_za8_u8j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svreadz_hor_za8_u8(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za8_u8(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svreadz_hor_za16_s16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z25test_svreadz_hor_za16_s16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svreadz_hor_za16_s16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za16_s16(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svreadz_hor_za16_u16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z25test_svreadz_hor_za16_u16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svreadz_hor_za16_u16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za16_u16(1, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svreadz_hor_za16_f16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z25test_svreadz_hor_za16_f16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svreadz_hor_za16_f16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za16_f16(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svreadz_hor_za16_bf16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z26test_svreadz_hor_za16_bf16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svreadz_hor_za16_bf16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za16_bf16(1, slice);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svreadz_hor_za32_s32(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z25test_svreadz_hor_za32_s32j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+svint32_t test_svreadz_hor_za32_s32(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za32_s32(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svreadz_hor_za32_u32(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 2, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z25test_svreadz_hor_za32_u32j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 2, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+svuint32_t test_svreadz_hor_za32_u32(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za32_u32(2, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svreadz_hor_za32_f32(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z25test_svreadz_hor_za32_f32j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svreadz_hor_za32_f32(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za32_f32(3, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svreadz_hor_za64_s64(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z25test_svreadz_hor_za64_s64j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+svint64_t test_svreadz_hor_za64_s64(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za64_s64(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svreadz_hor_za64_u64(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 4, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z25test_svreadz_hor_za64_u64j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 4, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+svuint64_t test_svreadz_hor_za64_u64(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za64_u64(4, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x double> @test_svreadz_hor_za64_f64(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z25test_svreadz_hor_za64_f64j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+//
+svfloat64_t test_svreadz_hor_za64_f64(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za64_f64(7, slice);
+}
+
+// ZA128
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreadz_hor_za128_s8(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreadz_hor_za128_s8j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svint8_t test_svreadz_hor_za128_s8(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_s8(0, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svreadz_hor_za128_u8(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 1, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 16 x i8> @_Z25test_svreadz_hor_za128_u8j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 1, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svuint8_t test_svreadz_hor_za128_u8(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_u8(1, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svreadz_hor_za128_s16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 2, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z26test_svreadz_hor_za128_s16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 2, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svint16_t test_svreadz_hor_za128_s16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_s16(2, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x i16> @test_svreadz_hor_za128_u16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 3, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x i16> @_Z26test_svreadz_hor_za128_u16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 3, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
+//
+svuint16_t test_svreadz_hor_za128_u16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_u16(3, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svreadz_hor_za128_f16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 4, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z26test_svreadz_hor_za128_f16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 4, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svreadz_hor_za128_f16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_f16(4, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svreadz_hor_za128_bf16(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 5, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z27test_svreadz_hor_za128_bf16j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 5, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svreadz_hor_za128_bf16(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_bf16(5, slice);
+}
+
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svreadz_hor_za128_s32(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 6, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z26test_svreadz_hor_za128_s32j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 6, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+svint32_t test_svreadz_hor_za128_s32(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_s32(6, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svreadz_hor_za128_u32(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 7, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z26test_svreadz_hor_za128_u32j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 7, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]]
+//
+svuint32_t test_svreadz_hor_za128_u32(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_u32(7, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svreadz_hor_za128_f32(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 8, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z26test_svreadz_hor_za128_f32j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 8, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svreadz_hor_za128_f32(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_f32(8, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svreadz_hor_za128_s64(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 13, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z26test_svreadz_hor_za128_s64j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 13, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+svint64_t test_svreadz_hor_za128_s64(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_s64(13, slice);
+}
+// CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svreadz_hor_za128_u64(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 14, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z26test_svreadz_hor_za128_u64j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 14, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]]
+//
+svuint64_t test_svreadz_hor_za128_u64(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_u64(14, slice);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 2 x double> @test_svreadz_hor_za128_f64(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 [[SLICE]])
+// CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z26test_svreadz_hor_za128_f64j(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 [[SLICE]])
+// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
+//
+svfloat64_t test_svreadz_hor_za128_f64(uint32_t slice) __arm_streaming __arm_inout("za")
+{
+ return svreadz_hor_za128_f64(15, slice);
+}
diff --git a/clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp b/clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp
new file mode 100644
index 00000000000000..a97790d0be7f1e
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p1-intrinsics/acle_sme2p1_imm.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sve2 -target-feature +sme2p1 -target-feature +bf16 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void tests_readz_tile_to_vector_single(uint32_t slice) __arm_streaming __arm_inout("za") {
+ svreadz_hor_za8_s8(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 0]}}
+ svreadz_hor_za16_s16(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svreadz_hor_za32_s32(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svreadz_hor_za64_s64(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svreadz_hor_za128_s8(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ svreadz_hor_za128_s16(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ svreadz_hor_za128_s32(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ svreadz_hor_za128_s64(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ svreadz_hor_za128_bf16(-1, slice); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ return;
+}
+
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index bcaa37de74b630..c2984f2928c92e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2839,6 +2839,18 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_writeq_horiz : SME_VectorToTile_Intrinsic;
def int_aarch64_sme_writeq_vert : SME_VectorToTile_Intrinsic;
+
+ class SME_MOVAZ_TileToVector_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+
+ def int_aarch64_sme_readz_horiz : SME_MOVAZ_TileToVector_Intrinsic;
+ def int_aarch64_sme_readz_vert : SME_MOVAZ_TileToVector_Intrinsic;
+
+ def int_aarch64_sme_readz_q_horiz : SME_MOVAZ_TileToVector_Intrinsic;
+ def int_aarch64_sme_readz_q_vert : SME_MOVAZ_TileToVector_Intrinsic;
+
def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
class SME_OuterProduct_Intrinsic
@@ -3646,4 +3658,4 @@ def int_aarch64_sve_pmov_to_pred_lane_zero : SVE2_1VectorArg_Pred_Intrinsic;
def int_aarch64_sve_pmov_to_vector_lane_merging : SVE2_Pred_1VectorArgIndexed_Intrinsic;
-def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
\ No newline at end of file
+def int_aarch64_sve_pmov_to_vector_lane_zeroing : SVE2_Pred_1VectorArg_Intrinsic;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 80181a77c9d238..f9436467a81e23 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2832,6 +2832,23 @@ AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitTileMovaz(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.add(MI.getOperand(0)); // Output ZPR
+ MIB.addReg(BaseReg + MI.getOperand(1).getImm(),
+ RegState::Define); // Output ZA Tile
+ MIB.addReg(BaseReg + MI.getOperand(1).getImm()); // Input Za Tile
+ MIB.add(MI.getOperand(2)); // slice index register
+ MIB.add(MI.getOperand(3)); // slice index offset
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *
AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
@@ -2992,6 +3009,26 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitZero(MI, BB);
case AArch64::ZERO_T_PSEUDO:
return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
+ case AArch64::MOVAZ_ZMI_H_B_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_H_B, AArch64::ZAB0, MI, BB);
+ case AArch64::MOVAZ_ZMI_H_H_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_H_H, AArch64::ZAH0, MI, BB);
+ case AArch64::MOVAZ_ZMI_H_S_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_H_S, AArch64::ZAS0, MI, BB);
+ case AArch64::MOVAZ_ZMI_H_D_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_H_D, AArch64::ZAD0, MI, BB);
+ case AArch64::MOVAZ_ZMI_H_Q_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_H_Q, AArch64::ZAQ0, MI, BB);
+ case AArch64::MOVAZ_ZMI_V_B_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_V_B, AArch64::ZAB0, MI, BB);
+ case AArch64::MOVAZ_ZMI_V_H_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_V_H, AArch64::ZAH0, MI, BB);
+ case AArch64::MOVAZ_ZMI_V_S_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_V_S, AArch64::ZAS0, MI, BB);
+ case AArch64::MOVAZ_ZMI_V_D_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_V_D, AArch64::ZAD0, MI, BB);
+ case AArch64::MOVAZ_ZMI_V_Q_PSEUDO:
+ return EmitTileMovaz(AArch64::MOVAZ_ZMI_V_Q, AArch64::ZAQ0, MI, BB);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 18439dc7f01020..8c86b47a701cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -635,6 +635,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitTileMovaz(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *EmitZAInstr(unsigned Opc, unsigned BaseReg,
MachineInstr &MI, MachineBasicBlock *BB,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa25343450..e6c800d55077b6 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -777,7 +777,8 @@ defm FSUB_VG4_M4Z_D : sme2_multivec_accum_add_sub_vg4<"fsub", 0b1001, MatrixOp64
}
let Predicates = [HasSME2p1] in {
-defm MOVAZ_ZMI : sme2p1_movaz_tile_to_vec<"movaz">;
+defm MOVAZ_ZMI : sme2p1_movaz_tile_to_vec<"movaz", int_aarch64_sme_readz_horiz, int_aarch64_sme_readz_vert,
+ int_aarch64_sme_readz_q_horiz, int_aarch64_sme_readz_q_vert>;
defm MOVAZ_2ZMI : sme2p1_movaz_tile_to_vec_vg2<"movaz">;
defm MOVAZ_4ZMI : sme2p1_movaz_tile_to_vec_vg4<"movaz">;
defm MOVAZ_VG2_2ZM : sme2_mova_array_to_vec_vg2_multi<0b010, "movaz">;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 3363aab4b093cc..db9b61f5c811c6 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -104,6 +104,13 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
let usesCustomInserter = 1;
}
+class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
+ let SMEMatrixType = za_flag;
+ let usesCustomInserter = 1;
+}
+
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -189,6 +196,11 @@ class SME2_Tile_VG4_Multi_Pat<string name, SDPatternOperator intrinsic, Operand
: Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>;
+
+class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType out_vt, Operand tile_imm, Operand index_ty, ComplexPattern tileslice>
+ : Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)))),
+ (!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;
+
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -4029,6 +4041,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi<string mnemonic>{
defm _V : sme2_mova_tile_to_vec_vg2_multi_inst<0b1, 0b000, mnemonic>;
}
+
// SME2p1 move tile to vector and zero tile, two registers
multiclass sme2p1_movaz_tile_to_vec_vg2<string mnemonic>{
defm _H : sme2_mova_tile_to_vec_vg2_multi_inst<0b0, 0b010, mnemonic>;
@@ -4737,9 +4750,60 @@ multiclass sme2p1_movaz_tile_to_vec_base<bit v, string mnemonic> {
}
}
-multiclass sme2p1_movaz_tile_to_vec<string mnemonic>{
+multiclass sme2p1_movaz_tile_to_vec<string mnemonic, SDPatternOperator intrinsic_horiz, SDPatternOperator intrinsic_vert,
+ SDPatternOperator intrinsic_horiz_q, SDPatternOperator intrinsic_vert_q>{
defm _H : sme2p1_movaz_tile_to_vec_base<0b0, mnemonic>;
defm _V : sme2p1_movaz_tile_to_vec_base<0b1, mnemonic>;
+
+ def NAME # _H_B_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_0, sme_elm_idx0_15, ZPR8, SMEMatrixTileB>;
+ def NAME # _H_H_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_1, sme_elm_idx0_7, ZPR16, SMEMatrixTileH>;
+ def NAME # _H_S_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_3, sme_elm_idx0_3, ZPR32, SMEMatrixTileS>;
+ def NAME # _H_D_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_7, sme_elm_idx0_1, ZPR64, SMEMatrixTileD>;
+ def NAME # _H_Q_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_15, sme_elm_idx0_0, ZPR128, SMEMatrixTileQ>;
+
+ def NAME # _V_B_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_0, sme_elm_idx0_15, ZPR8, SMEMatrixTileB>;
+ def NAME # _V_H_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_1, sme_elm_idx0_7, ZPR16, SMEMatrixTileH>;
+ def NAME # _V_S_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_3, sme_elm_idx0_3, ZPR32, SMEMatrixTileS>;
+ def NAME # _V_D_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_7, sme_elm_idx0_1, ZPR64, SMEMatrixTileD>;
+ def NAME # _V_Q_PSEUDO : sme2_movez_to_tile_pseudo<NAME, sme_elm_idx0_15, sme_elm_idx0_0, ZPR128, SMEMatrixTileQ>;
+
+ def : SME2_Tile_Movaz_Pat<NAME # _H_B, intrinsic_horiz, nxv16i8,sme_elm_idx0_0, sme_elm_idx0_15, tileslice8>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_H, intrinsic_horiz, nxv8i16, sme_elm_idx0_1, sme_elm_idx0_7, tileslice16>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_S, intrinsic_horiz, nxv4i32, sme_elm_idx0_3, sme_elm_idx0_3, tileslice32>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_D, intrinsic_horiz, nxv2i64, sme_elm_idx0_7, sme_elm_idx0_1, tileslice64>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_H, intrinsic_horiz, nxv8bf16, sme_elm_idx0_1, sme_elm_idx0_7, tileslice16>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_H, intrinsic_horiz, nxv8f16, sme_elm_idx0_1, sme_elm_idx0_7, tileslice16>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_S, intrinsic_horiz, nxv4f32, sme_elm_idx0_3, sme_elm_idx0_3, tileslice32>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_D, intrinsic_horiz, nxv2f64, sme_elm_idx0_7, sme_elm_idx0_1, tileslice64>;
+
+ def : SME2_Tile_Movaz_Pat<NAME # _V_B, intrinsic_vert, nxv16i8, sme_elm_idx0_0, sme_elm_idx0_15, tileslice8>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_H, intrinsic_vert, nxv8i16, sme_elm_idx0_1, sme_elm_idx0_7, tileslice16>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_S, intrinsic_vert, nxv4i32, sme_elm_idx0_3, sme_elm_idx0_3, tileslice32>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_D, intrinsic_vert, nxv2i64, sme_elm_idx0_7, sme_elm_idx0_1, tileslice64>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_H, intrinsic_vert, nxv8bf16, sme_elm_idx0_1, sme_elm_idx0_7, tileslice16>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_H, intrinsic_vert, nxv8f16, sme_elm_idx0_1, sme_elm_idx0_7, tileslice16>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_S, intrinsic_vert, nxv4f32, sme_elm_idx0_3, sme_elm_idx0_3, tileslice32>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_D, intrinsic_vert, nxv2f64, sme_elm_idx0_7, sme_elm_idx0_1, tileslice64>;
+
+ // H_Q
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv16i8, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv8i16, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv4i32, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv2i64, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv8bf16, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv8f16, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv4f32, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _H_Q, intrinsic_horiz_q, nxv2f64, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+
+ // _V_Q
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv16i8, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv8i16, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv4i32, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv2i64, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv8bf16, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv8f16, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv4f32, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
+ def : SME2_Tile_Movaz_Pat<NAME # _V_Q, intrinsic_vert_q, nxv2f64, sme_elm_idx0_15, sme_elm_idx0_0, tileslice128>;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
new file mode 100644
index 00000000000000..7c556e09c2e960
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2p1-intrinsics-movaz.ll
@@ -0,0 +1,445 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p1 -verify-machineinstrs < %s | FileCheck %s
+
+;MOVAZ (tile to vector, single)
+
+;;
+; Horiz
+;;
+define <vscale x 16 x i8> @test_readz_hor_z8_i8(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z8_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.b, za0h.b[w12, 0]
+; CHECK-NEXT: movaz z0.b, za0h.b[w12, 14]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 14
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32 0, i32 %slice.max)
+ ret <vscale x 16 x i8> %res2
+}
+
+define <vscale x 8 x i16> @test_readz_hor_z16_i16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.h, za0h.h[w12, 0]
+; CHECK-NEXT: movaz z0.h, za1h.h[w12, 7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32 1, i32 %slice.max)
+ ret <vscale x 8 x i16> %res2
+}
+
+define <vscale x 4 x i32> @test_readz_hor_z32_i32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z32_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.s, za0h.s[w12, 0]
+; CHECK-NEXT: movaz z0.s, za3h.s[w12, 3]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 3
+ %res2 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32 3, i32 %slice.max)
+ ret <vscale x 4 x i32> %res2
+}
+
+define <vscale x 2 x i64> @test_readz_hor_z64_i64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z64_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.d, za0h.d[w12, 0]
+; CHECK-NEXT: movaz z1.d, za7h.d[w12, 1]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 1
+ %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32 7, i32 %slice.max)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x bfloat> @test_readz_hor_z16_bf16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.h, za0h.h[w12, 0]
+; CHECK-NEXT: movaz z0.h, za1h.h[w12, 7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32 1, i32 %slice.max)
+ ret <vscale x 8 x bfloat> %res2
+}
+
+define <vscale x 8 x half> @test_readz_hor_z16_f16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.h, za0h.h[w12, 0]
+; CHECK-NEXT: movaz z0.h, za1h.h[w12, 7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32 1, i32 %slice.max)
+ ret <vscale x 8 x half> %res2
+}
+
+define <vscale x 4 x float> @test_readz_hor_z32_f32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.s, za0h.s[w12, 0]
+; CHECK-NEXT: movaz z0.s, za3h.s[w12, 3]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 3
+ %res2 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32 3, i32 %slice.max)
+ ret <vscale x 4 x float> %res2
+}
+
+define <vscale x 2 x double> @test_readz_hor_z64_f64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.d, za0h.d[w12, 0]
+; CHECK-NEXT: movaz z1.d, za7h.d[w12, 1]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 1
+ %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32 7, i32 %slice.max)
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 16 x i8> @test_readz_hor_z128_i8(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 0, i32 %slice)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32 15, i32 %slice)
+ ret <vscale x 16 x i8> %res2
+}
+
+define <vscale x 8 x i16> @test_readz_hor_z128_i16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 0, i32 %slice)
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32 15, i32 %slice)
+ ret <vscale x 8 x i16> %res2
+}
+
+define <vscale x 4 x i32> @test_readz_hor_z128_i32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 0, i32 %slice)
+ %res2 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32 15, i32 %slice)
+ ret <vscale x 4 x i32> %res2
+}
+
+define <vscale x 2 x i64> @test_readz_hor_z128_i64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z1.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 0, i32 %slice)
+ %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32 15, i32 %slice)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x bfloat> @test_readz_hor_z128_bf16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 0, i32 %slice)
+ %res2 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32 15, i32 %slice)
+ ret <vscale x 8 x bfloat> %res2
+}
+
+define <vscale x 8 x half> @test_readz_hor_z128_f16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 0, i32 %slice)
+ %res2 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32 15, i32 %slice)
+ ret <vscale x 8 x half> %res2
+}
+
+define <vscale x 4 x float> @test_readz_hor_z128_f32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 0, i32 %slice)
+ %res2 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32 15, i32 %slice)
+ ret <vscale x 4 x float> %res2
+}
+
+define <vscale x 2 x double> @test_readz_hor_z128_f64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_hor_z128_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0h.q[w12, 0]
+; CHECK-NEXT: movaz z1.q, za15h.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 0, i32 %slice)
+ %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32 15, i32 %slice)
+ ret <vscale x 2 x double> %res
+}
+
+;;
+; Vert
+;;
+define <vscale x 16 x i8> @test_readz_ver_z8_i8(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z8_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.b, za0v.b[w12, 0]
+; CHECK-NEXT: movaz z0.b, za0v.b[w12, 14]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 14
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32 0, i32 %slice.max)
+ ret <vscale x 16 x i8> %res2
+}
+
+define <vscale x 8 x i16> @test_readz_ver_z16_i16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.h, za0v.h[w12, 0]
+; CHECK-NEXT: movaz z0.h, za1v.h[w12, 7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32 1, i32 %slice.max)
+ ret <vscale x 8 x i16> %res2
+}
+
+define <vscale x 4 x i32> @test_readz_ver_z32_i32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z32_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.s, za0v.s[w12, 0]
+; CHECK-NEXT: movaz z0.s, za3v.s[w12, 3]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 3
+ %res2 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32 3, i32 %slice.max)
+ ret <vscale x 4 x i32> %res2
+}
+
+define <vscale x 2 x i64> @test_readz_ver_z64_i64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z64_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.d, za0v.d[w12, 0]
+; CHECK-NEXT: movaz z1.d, za7v.d[w12, 1]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 1
+ %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32 7, i32 %slice.max)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x bfloat> @test_readz_ver_z16_bf16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.h, za0v.h[w12, 0]
+; CHECK-NEXT: movaz z0.h, za1v.h[w12, 7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32 1, i32 %slice.max)
+ ret <vscale x 8 x bfloat> %res2
+}
+
+define <vscale x 8 x half> @test_readz_ver_z16_f16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z16_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.h, za0v.h[w12, 0]
+; CHECK-NEXT: movaz z0.h, za1v.h[w12, 7]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 7
+ %res2 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32 1, i32 %slice.max)
+ ret <vscale x 8 x half> %res2
+}
+
+define <vscale x 4 x float> @test_readz_ver_z32_f32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z32_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.s, za0v.s[w12, 0]
+; CHECK-NEXT: movaz z0.s, za3v.s[w12, 3]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 3
+ %res2 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32 3, i32 %slice.max)
+ ret <vscale x 4 x float> %res2
+}
+
+define <vscale x 2 x double> @test_readz_ver_z64_f64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z64_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.d, za0v.d[w12, 0]
+; CHECK-NEXT: movaz z1.d, za7v.d[w12, 1]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32 0, i32 %slice)
+ %slice.max = add i32 %slice, 1
+ %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32 7, i32 %slice.max)
+ ret <vscale x 2 x double> %res
+}
+
+define <vscale x 16 x i8> @test_readz_ver_z128_i8(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32 0, i32 %slice)
+ %res2 = call <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32 15, i32 %slice)
+ ret <vscale x 16 x i8> %res2
+}
+
+define <vscale x 8 x i16> @test_readz_ver_z128_i16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32 0, i32 %slice)
+ %res2 = call <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32 15, i32 %slice)
+ ret <vscale x 8 x i16> %res2
+}
+
+define <vscale x 4 x i32> @test_readz_ver_z128_i32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32 0, i32 %slice)
+ %res2 = call <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32 15, i32 %slice)
+ ret <vscale x 4 x i32> %res2
+}
+
+define <vscale x 2 x i64> @test_readz_ver_z128_i64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z1.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32 0, i32 %slice)
+ %res2 = call <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32 15, i32 %slice)
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 8 x bfloat> @test_readz_ver_z128_bf16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32 0, i32 %slice)
+ %res2 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32 15, i32 %slice)
+ ret <vscale x 8 x bfloat> %res2
+}
+
+define <vscale x 8 x half> @test_readz_ver_z128_f16(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32 0, i32 %slice)
+ %res2 = call <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32 15, i32 %slice)
+ ret <vscale x 8 x half> %res2
+}
+
+define <vscale x 4 x float> @test_readz_ver_z128_f32(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z0.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32 0, i32 %slice)
+ %res2 = call <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32 15, i32 %slice)
+ ret <vscale x 4 x float> %res2
+}
+
+define <vscale x 2 x double> @test_readz_ver_z128_f64(i32 %tile, i32 %slice) {
+; CHECK-LABEL: test_readz_ver_z128_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: movaz z0.q, za0v.q[w12, 0]
+; CHECK-NEXT: movaz z1.q, za15v.q[w12, 0]
+; CHECK-NEXT: ret
+ %res = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32 0, i32 %slice)
+ %res2 = call <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32 15, i32 %slice)
+ ret <vscale x 2 x double> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.horiz.nxv16i8(i32, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.horiz.nxv8i16(i32, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.horiz.nxv4i32(i32, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.horiz.nxv2i64(i32, i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.horiz.nxv8bf16(i32, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sme.readz.horiz.nxv8f16(i32, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sme.readz.horiz.nxv4f32(i32, i32)
+declare <vscale x 2 x double> @llvm.aarch64.sme.readz.horiz.nxv2f64(i32, i32)
+declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.horiz.nxv16i8(i32, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.horiz.nxv8i16(i32, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.horiz.nxv4i32(i32, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.horiz.nxv2i64(i32, i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.horiz.nxv8bf16(i32, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.horiz.nxv8f16(i32, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.horiz.nxv4f32(i32, i32)
+declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.horiz.nxv2f64(i32, i32)
+
+
+declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.vert.nxv16i8(i32, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.vert.nxv8i16(i32, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.vert.nxv4i32(i32, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.vert.nxv2i64(i32, i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.vert.nxv8bf16(i32, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sme.readz.vert.nxv8f16(i32, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sme.readz.vert.nxv4f32(i32, i32)
+declare <vscale x 2 x double> @llvm.aarch64.sme.readz.vert.nxv2f64(i32, i32)
+declare <vscale x 16 x i8> @llvm.aarch64.sme.readz.q.vert.nxv16i8(i32, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sme.readz.q.vert.nxv8i16(i32, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sme.readz.q.vert.nxv4i32(i32, i32)
+declare <vscale x 2 x i64> @llvm.aarch64.sme.readz.q.vert.nxv2i64(i32, i32)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sme.readz.q.vert.nxv8bf16(i32, i32)
+declare <vscale x 8 x half> @llvm.aarch64.sme.readz.q.vert.nxv8f16(i32, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sme.readz.q.vert.nxv4f32(i32, i32)
+declare <vscale x 2 x double> @llvm.aarch64.sme.readz.q.vert.nxv2f64(i32, i32)
More information about the cfe-commits
mailing list