[llvm] [AArch64] Model ZA array using inaccessible memory (PR #132058)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 11 08:21:13 PDT 2025
https://github.com/Lukacma updated https://github.com/llvm/llvm-project/pull/132058
From 1e85f2acb77d83678c21ad72f2d348f487f0518d Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Wed, 19 Mar 2025 15:35:59 +0000
Subject: [PATCH 1/4] [AArch64] Model ZA array using inaccessible memory
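
The gist of the change: the SME intrinsics and pseudo instructions that touch the
ZA array now declare those accesses as reads and/or writes of inaccessible memory
(and mayLoad/mayStore on the pseudos), rather than relying on
IntrNoMem + IntrHasSideEffects or on unmodelled memory effects. As a rough sketch
only, assuming the usual mapping of these intrinsic properties onto IR memory
attributes (the authoritative attribute sets are the IntrinsicsAArch64.td changes
below), a ZA tile read and the ZA zeroing intrinsic would come out roughly as:

  ; reads ZA only
  %z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(
            <vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %slice) #0
  ; writes ZA only
  call void @llvm.aarch64.sme.zero(i32 255) #1

  attributes #0 = { memory(inaccessiblemem: read) }
  attributes #1 = { memory(inaccessiblemem: write) }

Because only ZA-accessing intrinsics alias that inaccessible state, accesses to
ordinary memory can still move past them, while ZA reads and writes stay ordered
with respect to each other.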
---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 80 ++--
llvm/lib/Target/AArch64/SMEInstrFormats.td | 39 +-
.../AArch64/sme-intrinsics-mova-extract.ll | 259 ++++++++-----
.../AArch64/sme2-intrinsics-extract-mova.ll | 362 ++++++++++++------
4 files changed, 483 insertions(+), 257 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6dfc3c8f2a393..7648fc55d54ae 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2940,7 +2940,7 @@ def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;
let TargetPrefix = "aarch64" in {
class SME_Load_Store_Intrinsic<LLVMType pred_ty>
: DefaultAttrsIntrinsic<[],
- [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+ [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>]>;
// Loads
def int_aarch64_sme_ld1b_horiz : SME_Load_Store_Intrinsic<llvm_nxv16i1_ty>;
@@ -2968,18 +2968,18 @@ let TargetPrefix = "aarch64" in {
// Spill + fill
class SME_LDR_STR_ZA_Intrinsic
- : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
+ : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly]>;
def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic;
def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic;
class SME_TileToVector_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+ llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
class SME_VectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+ llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_read_horiz : SME_TileToVector_Intrinsic;
def int_aarch64_sme_read_vert : SME_TileToVector_Intrinsic;
@@ -2994,13 +2994,13 @@ let TargetPrefix = "aarch64" in {
class SME_MOVAZ_TileToVector_X2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
class SME_MOVAZ_TileToVector_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>,LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_readz_horiz_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
def int_aarch64_sme_readz_vert_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
@@ -3011,7 +3011,7 @@ let TargetPrefix = "aarch64" in {
class SME_MOVAZ_TileToVector_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_readz_horiz : SME_MOVAZ_TileToVector_Intrinsic;
def int_aarch64_sme_readz_vert : SME_MOVAZ_TileToVector_Intrinsic;
@@ -3022,12 +3022,12 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_readz_x2
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
def int_aarch64_sme_readz_x4
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
def int_aarch64_sme_write_lane_zt
: DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty],
@@ -3038,7 +3038,7 @@ let TargetPrefix = "aarch64" in {
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
- def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+ def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
class SME_OuterProduct_Intrinsic
@@ -3047,7 +3047,7 @@ let TargetPrefix = "aarch64" in {
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMMatchType<0>,
- llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+ llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic;
def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic;
@@ -3069,7 +3069,7 @@ let TargetPrefix = "aarch64" in {
[llvm_i32_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+ llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_addha : SME_AddVectorToTile_Intrinsic;
def int_aarch64_sme_addva : SME_AddVectorToTile_Intrinsic;
@@ -3189,56 +3189,56 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_Single_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty,
LLVMMatchType<0>, llvm_i32_ty],
- [ImmArg<ArgIndex<3>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty],
- [ImmArg<ArgIndex<4>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
class SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty],
- [ImmArg<ArgIndex<6>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
class SME2_VG2_Multi_Imm_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
@@ -3257,14 +3257,14 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- []>;
+ [IntrWriteMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrWriteMem, IntrInaccessibleMemOnly]>;
class SME2_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3353,50 +3353,50 @@ let TargetPrefix = "aarch64" in {
class SME2_ZA_ArrayVector_Read_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_ArrayVector_Read_VG4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_Matrix_TileVector_Read_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_Matrix_TileVector_Read_VG4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_ArrayVector_Write_VG2_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- []>;
+ [IntrWriteMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_ArrayVector_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrWriteMem, IntrInaccessibleMemOnly]>;
class SME2_Matrix_TileVector_Write_VG2_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty, llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- [ImmArg<ArgIndex<0>>]>;
+ [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
class SME2_Matrix_TileVector_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty, llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- [ImmArg<ArgIndex<0>>]>;
+ [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
class SME2_VG2_Multi_Single_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3562,7 +3562,7 @@ let TargetPrefix = "aarch64" in {
// Multi-vector zeroing
foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
- def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects]>;
+ def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly]>;
}
// Multi-vector signed saturating doubling multiply high
@@ -4002,57 +4002,57 @@ let TargetPrefix = "aarch64" in {
[llvm_i32_ty,
llvm_nxv16i1_ty, llvm_nxv16i1_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
- [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_LANE_VGx1_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<3>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
class SME_FP8_ZA_LANE_VGx2_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
class SME_FP8_ZA_LANE_VGx4_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
class SME_FP8_ZA_SINGLE_VGx1_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_SINGLE_VGx2_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_SINGLE_VGx4_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_MULTI_VGx2_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_MULTI_VGx4_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
//
// CVT from FP8 to half-precision/BFloat16 multi-vector
//
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4f6a413ba5e5c..ff850751acf48 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -102,6 +102,8 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
// Translated to the actual instructions in AArch64ISelLowering.cpp
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -110,6 +112,8 @@ class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, Regis
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -118,6 +122,8 @@ class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, Regist
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -126,6 +132,8 @@ class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, Regist
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, imm_ty:$i), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -133,6 +141,7 @@ class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -140,6 +149,7 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
Pseudo<(outs), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum za_flag>
@@ -147,6 +157,7 @@ class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
@@ -154,6 +165,8 @@ class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, R
Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -162,6 +175,8 @@ class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOpe
Pseudo<(outs multi_vector_ty:$Zd), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
//===----------------------------------------------------------------------===//
@@ -665,6 +680,8 @@ class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
// Translated to the actual instructions in AArch64ISelLowering.cpp
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
multiclass sme_add_vector_to_tile_u32<bit V, string mnemonic, SDPatternOperator op> {
@@ -1123,6 +1140,7 @@ class sme_mova_insert_pseudo<SMEMatrixTypeEnum za_flag>
// Translated to the actual instructions in AArch64ISelLowering.cpp
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
@@ -1317,6 +1335,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
is_col, sme_elm_idx0_15, mnemonic> {
bits<4> imm;
let Inst{8-5} = imm;
+ let mayLoad = 1;
}
def _H : sme_tile_to_vector_inst<0b0, 0b01, ZPR16, !if(is_col, TileVectorOpV16,
TileVectorOpH16),
@@ -1325,6 +1344,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
bits<3> imm;
let Inst{8} = ZAn;
let Inst{7-5} = imm;
+ let mayLoad = 1;
}
def _S : sme_tile_to_vector_inst<0b0, 0b10, ZPR32, !if(is_col, TileVectorOpV32,
TileVectorOpH32),
@@ -1333,6 +1353,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
bits<2> imm;
let Inst{8-7} = ZAn;
let Inst{6-5} = imm;
+ let mayLoad = 1;
}
def _D : sme_tile_to_vector_inst<0b0, 0b11, ZPR64, !if(is_col, TileVectorOpV64,
TileVectorOpH64),
@@ -1341,12 +1362,14 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
bits<1> imm;
let Inst{8-6} = ZAn;
let Inst{5} = imm;
+ let mayLoad = 1;
}
def _Q : sme_tile_to_vector_inst<0b1, 0b11, ZPR128, !if(is_col, TileVectorOpV128,
TileVectorOpH128),
is_col, sme_elm_idx0_0, mnemonic> {
bits<4> ZAn;
let Inst{8-5} = ZAn;
+ let mayLoad = 1;
}
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _B), ZPR8,
@@ -1817,7 +1840,9 @@ multiclass sme2_multivec_accum_add_sub_vg2<string mnemonic, bits<4> op,
def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
(!cast<Instruction>(NAME) matrix_ty:$ZAdn, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
- def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+ def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+ let mayLoad = 1;
+ }
def : SME2_ZA_VG1x2_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
}
@@ -1840,7 +1865,9 @@ multiclass sme2_multivec_accum_add_sub_vg4<string mnemonic, bits<4> op,
def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
(!cast<Instruction>(NAME) matrix_ty:$ZAdn, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
- def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+ def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+ let mayLoad = 1;
+ }
def : SME2_ZA_VG1x4_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
}
@@ -4349,6 +4376,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
bits<3> imm;
let Inst{7-5} = imm;
+ let mayLoad = 1;
}
def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r,
@@ -4359,6 +4387,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
bits<2> imm;
let Inst{7} = ZAn;
let Inst{6-5} = imm;
+ let mayLoad = 1;
}
def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r,
@@ -4369,6 +4398,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
bits<1> imm;
let Inst{7-6} = ZAn;
let Inst{5} = imm;
+ let mayLoad = 1;
}
def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r,
@@ -4377,6 +4407,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
uimm0s2range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
bits<3> ZAn;
let Inst{7-5} = ZAn;
+ let mayLoad = 1;
}
if !eq(mnemonic, "mova") then {
@@ -4491,6 +4522,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
bits<2> imm;
let Inst{6-5} = imm;
+ let mayLoad = 1;
}
def _H : sme2_mova_tile_to_vec_vg4_multi_base<0b01, v, {opc,0,?,?},
@@ -4502,6 +4534,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
bits<1> imm;
let Inst{6} = ZAn;
let Inst{5} = imm;
+ let mayLoad = 1;
}
def _S : sme2_mova_tile_to_vec_vg4_multi_base<0b10, v, {opc,0,?,?},
@@ -4511,6 +4544,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
bits<2> ZAn;
let Inst{6-5} = ZAn;
+ let mayLoad = 1;
}
def _D : sme2_mova_tile_to_vec_vg4_multi_base<0b11, v, {opc,?,?,?},
@@ -4520,6 +4554,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
bits<3> ZAn;
let Inst{7-5} = ZAn;
+ let mayLoad = 1;
}
if !eq(mnemonic, "mova") then {
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 48fbd14bd8540..5370b3e9dc9df 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,27 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
-define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_b:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.b, p0/m, za0h.b[w12, 0]
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 2]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.b, p0/m, za0h.b[w12, 0]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.b, p0/m, za0h.b[w12, 2]
; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 4]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 6]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 8]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 10]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 12]
-; CHECK-NEXT: mov z0.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.b, p0/m, za0h.b[w12, 6]
+; CHECK-NEXT: mov z4.b, p0/m, za0h.b[w12, 8]
+; CHECK-NEXT: mov z5.b, p0/m, za0h.b[w12, 10]
+; CHECK-NEXT: mov z6.b, p0/m, za0h.b[w12, 12]
+; CHECK-NEXT: mov z7.b, p0/m, za0h.b[w12, 14]
+; CHECK-NEXT: b dummy_use_8_nxv16i8
%z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -37,30 +36,33 @@ define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x
%z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.12)
%tileslice.14 = add i32 %tileslice, 14
%z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
- ret <vscale x 16 x i8> %z0
+
+ ; Force retention of z0..z7
+ tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+ <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+ ret void
}
-define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_col_b:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.b, p0/m, za0v.b[w12, 1]
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 3]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.b, p0/m, za0v.b[w12, 1]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.b, p0/m, za0v.b[w12, 3]
; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 5]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 7]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 9]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 11]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 13]
-; CHECK-NEXT: mov z0.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.b, p0/m, za0v.b[w12, 7]
+; CHECK-NEXT: mov z4.b, p0/m, za0v.b[w12, 9]
+; CHECK-NEXT: mov z5.b, p0/m, za0v.b[w12, 11]
+; CHECK-NEXT: mov z6.b, p0/m, za0v.b[w12, 13]
+; CHECK-NEXT: mov z7.b, p0/m, za0v.b[w12, 15]
+; CHECK-NEXT: b dummy_use_8_nxv16i8
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
@@ -77,22 +79,24 @@ define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x
%z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.13)
%tileslice.15 = add i32 %tileslice, 15
%z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
- ret <vscale x 16 x i8> %z0
+
+ tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+ <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+ ret void
}
-define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_h:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 2]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 2]
; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za0h.h[w12, 6]
+; CHECK-NEXT: b dummy_use_4_nxv8i16
%z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -100,22 +104,23 @@ define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
%z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
%tileslice.6 = add i32 %tileslice, 6
%z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
- ret <vscale x 8 x i16> %z0
+
+ tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
+ ret void
}
-define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_col_h:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 1]
-; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 3]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za1v.h[w12, 1]
+; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 3]
; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 5]
-; CHECK-NEXT: mov z0.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za1v.h[w12, 7]
+; CHECK-NEXT: b dummy_use_4_nxv8i16
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
@@ -124,30 +129,31 @@ define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
%z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.5)
%tileslice.7 = add i32 %tileslice, 7
%z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
- ret <vscale x 8 x i16> %z0
+
+ tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
+ ret void
}
-define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_f16:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT: mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT: mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT: mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT: mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT: b dummy_use_8_nxv8f16
%z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -163,30 +169,32 @@ define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i
%z6 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
%tileslice.7 = add i32 %tileslice, 7
%z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
- ret <vscale x 8 x half> %z0
+
+ tail call void @dummy_use_8_nxv8f16(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+ <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+ ret void
}
-define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
+define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
; CHECK-LABEL: extract_bf16:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT: mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT: mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT: mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT: mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT: b dummy_use_8_nxv8bf16
%z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -202,53 +210,57 @@ define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x
%z6 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
%tileslice.7 = add i32 %tileslice, 7
%z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
- ret <vscale x 8 x bfloat> %z0
+
+ tail call void @dummy_use_8_nxv8bf16(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+ <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+ ret void
}
-define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT: b dummy_use_2_nxv4i32
%z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
- ret <vscale x 4 x i32> %z0
+
+ tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+ ret void
}
-define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_col_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 1]
-; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 1]
+; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 3]
+; CHECK-NEXT: b dummy_use_2_nxv4i32
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
%z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
- ret <vscale x 4 x i32> %z0
+
+ tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+ ret void
}
-define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_f32:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 1]
; CHECK-NEXT: mov z2.s, p0/m, za0v.s[w12, 2]
-; CHECK-NEXT: mov z0.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.s, p0/m, za0v.s[w12, 3]
+; CHECK-NEXT: b dummy_use_4_nxv4f32
%z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -256,7 +268,9 @@ define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x
%z2 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
%tileslice.3 = add i32 %tileslice, 3
%z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
- ret <vscale x 4 x float> %z0
+
+ tail call void @dummy_use_4_nxv4f32(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+ ret void
}
define <vscale x 2 x i64> @extract_row_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
@@ -280,19 +294,20 @@ define <vscale x 2 x i64> @extract_col_d(<vscale x 2 x i64> %zd, <vscale x 2 x i
ret <vscale x 2 x i64> %z0
}
-define <vscale x 2 x double> @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
+define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.d, p0/m, za0h.d[w12, 0]
-; CHECK-NEXT: mov z0.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z0.d, p0/m, za0h.d[w12, 0]
+; CHECK-NEXT: mov z1.d, p0/m, za0v.d[w12, 1]
+; CHECK-NEXT: b dummy_use_2_nxv2f64
%z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
- ret <vscale x 2 x double> %z0
+
+ tail call void @dummy_use_2_nxv2f64(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+ ret void
}
define <vscale x 16 x i8> @extract_row_q_v16i18(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
@@ -507,3 +522,35 @@ declare <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i3
declare <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
+
+; ------------------------------------------------------------------------------
+; Dummy external functions to force code retention.
+; The compiler does not see their implementations, so it must keep the calls.
+; ------------------------------------------------------------------------------
+
+declare void @dummy_use_8_nxv16i8(
+ <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
+ <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>
+)
+
+declare void @dummy_use_4_nxv8i16(
+ <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>
+)
+
+declare void @dummy_use_8_nxv8f16(
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
+ <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>
+)
+
+declare void @dummy_use_8_nxv8bf16(
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
+ <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>
+)
+
+declare void @dummy_use_2_nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+
+declare void @dummy_use_4_nxv4f32(
+ <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>
+)
+
+declare void @dummy_use_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
index ca5399a0503e9..c01c96cc56975 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-b16b16 -verify-machineinstrs -force-streaming < %s | FileCheck %s
;
; Move Multi-Vector From Tile (Read) x2
@@ -7,82 +7,106 @@
; Horizontal
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b, z1.b }, za0h.b[w12, 0:1]
-; CHECK-NEXT: mov { z0.b, z1.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT: mov { z2.b, z3.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT: add z0.b, z0.b, z2.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
%slice.14 = add i32 %slice, 14
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_horiz_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_horiz_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
@@ -107,82 +131,106 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i
; Vertical
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b, z1.b }, za0v.b[w12, 0:1]
-; CHECK-NEXT: mov { z0.b, z1.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT: mov { z2.b, z3.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT: add z0.b, z0.b, z2.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
%slice.14 = add i32 %slice, 14
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vert_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vert_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
@@ -211,56 +259,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i3
; Horizontal
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b - z3.b }, za0h.b[w12, 0:3]
-; CHECK-NEXT: mov { z0.b - z3.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT: mov { z4.b - z7.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT: add z0.b, z0.b, z4.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
%slice.12 = add i32 %slice, 12
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: add z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
@@ -305,56 +369,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
; Vertical
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b - z3.b }, za0v.b[w12, 0:3]
-; CHECK-NEXT: mov { z0.b - z3.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT: mov { z4.b - z7.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT: add z0.b, z0.b, z4.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
%slice.12 = add i32 %slice, 12
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: add z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
@@ -399,214 +479,278 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
; Move Multi-Vector From ZA (Read) x2
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x2_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.b, z0.b, z2.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice.7)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x2_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice.7)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice.7)
- ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: bfadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice.7)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x2_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice.7)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice.7)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x2_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
- ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+ %val1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+ %sum = add <vscale x 2 x i64> %val1, %val2
+ ret <vscale x 2 x i64> %sum
}
-define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: fadd z0.d, z0.d, z2.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
- ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
+ %val1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+ %sum = fadd <vscale x 2 x double> %val1, %val2
+ ret <vscale x 2 x double> %sum
}
; Move Multi-Vector From ZA (Read) x4
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x4_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.b, z0.b, z4.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice.7)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x4_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice.7)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: fadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice.7)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: bfadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice.7)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x4_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x4_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.s, z0.s, z4.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice.7)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x4_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: fadd z0.s, z0.s, z4.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice.7)
- ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
-define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x4_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.d, z0.d, z4.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
- ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+ %val1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+ %sum = add <vscale x 2 x i64> %val1, %val2
+ ret <vscale x 2 x i64> %sum
}
-define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: fadd z0.d, z0.d, z4.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
- ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
+ %val1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+ %sum = fadd <vscale x 2 x double> %val1, %val2
+ ret <vscale x 2 x double> %sum
}
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)
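For context (not part of the patch): a minimal IR-level sketch of the aliasing relationship the new attributes are meant to model. The read intrinsic and its signature are taken from the tests above; the ZA function attribute and the exact attributes on the zeroing intrinsic are assumptions here. With the reads modelled as loads from inaccessible memory and the intervening call modelled as a write to that same memory, passes such as EarlyCSE/GVN can no longer merge the two reads or hoist one past the clobber, which is why the rewritten tests combine both results instead of returning only the second.

; Hypothetical example, not taken from the patch.
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32)
declare void @llvm.aarch64.sme.zero(i32)

define <vscale x 4 x i32> @za_read_not_mergeable(i32 %slice) "aarch64_inout_za" {
  ; First read of ZA (modelled as a read of inaccessible memory).
  %a = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
  ; Clobbers ZA (assumed to be modelled as a write to the same inaccessible memory).
  call void @llvm.aarch64.sme.zero(i32 255)
  ; Second read with identical operands; it must not be CSE'd with the first.
  %b = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
  %a0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %a, 0
  %b0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %b, 0
  %sum = add <vscale x 4 x i32> %a0, %b0
  ret <vscale x 4 x i32> %sum
}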
>From 9077c3a872e3b718c9b33b2a4f6e94fcc723bdfa Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Tue, 8 Apr 2025 16:57:03 +0000
Subject: [PATCH 2/4] Simplify dummy functions
---
.../AArch64/sme-intrinsics-mova-extract.ll | 109 ++++++++----------
1 file changed, 48 insertions(+), 61 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 5370b3e9dc9df..4324b6ec64f25 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -20,7 +20,7 @@ define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
; CHECK-NEXT: mov z5.b, p0/m, za0h.b[w12, 10]
; CHECK-NEXT: mov z6.b, p0/m, za0h.b[w12, 12]
; CHECK-NEXT: mov z7.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT: b dummy_use_8_nxv16i8
+; CHECK-NEXT: b use
%z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -38,8 +38,8 @@ define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
%z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
; Force retention of z0..z7
- tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
- <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+ tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+ <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
ret void
}
@@ -62,7 +62,7 @@ define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
; CHECK-NEXT: mov z5.b, p0/m, za0v.b[w12, 11]
; CHECK-NEXT: mov z6.b, p0/m, za0v.b[w12, 13]
; CHECK-NEXT: mov z7.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT: b dummy_use_8_nxv16i8
+; CHECK-NEXT: b use
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
@@ -80,8 +80,8 @@ define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %
%tileslice.15 = add i32 %tileslice, 15
%z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
- tail call void @dummy_use_8_nxv16i8(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
- <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+ tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+ <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
ret void
}
@@ -96,7 +96,7 @@ define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 2]
; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
; CHECK-NEXT: mov z3.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT: b dummy_use_4_nxv8i16
+; CHECK-NEXT: b use
%z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -105,7 +105,7 @@ define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
%tileslice.6 = add i32 %tileslice, 6
%z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
- tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
+ tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
ret void
}
@@ -120,7 +120,7 @@ define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 3]
; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 5]
; CHECK-NEXT: mov z3.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT: b dummy_use_4_nxv8i16
+; CHECK-NEXT: b use
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
@@ -130,7 +130,7 @@ define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %t
%tileslice.7 = add i32 %tileslice, 7
%z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
- tail call void @dummy_use_4_nxv8i16(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
+ tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
ret void
}
@@ -153,7 +153,7 @@ define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %ti
; CHECK-NEXT: mov z5.h, p0/m, za0h.h[w12, 5]
; CHECK-NEXT: mov z6.h, p0/m, za0v.h[w12, 6]
; CHECK-NEXT: mov z7.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT: b dummy_use_8_nxv8f16
+; CHECK-NEXT: b use
%z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -170,8 +170,8 @@ define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %ti
%tileslice.7 = add i32 %tileslice, 7
%z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
- tail call void @dummy_use_8_nxv8f16(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
- <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+ tail call void @use(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+ <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
ret void
}
@@ -194,7 +194,7 @@ define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32
; CHECK-NEXT: mov z5.h, p0/m, za0h.h[w12, 5]
; CHECK-NEXT: mov z6.h, p0/m, za0v.h[w12, 6]
; CHECK-NEXT: mov z7.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT: b dummy_use_8_nxv8bf16
+; CHECK-NEXT: b use
%z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -211,8 +211,8 @@ define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32
%tileslice.7 = add i32 %tileslice, 7
%z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
- tail call void @dummy_use_8_nxv8bf16(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
- <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+ tail call void @use(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+ <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
ret void
}
@@ -223,12 +223,12 @@ define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %t
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0]
; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT: b dummy_use_2_nxv4i32
+; CHECK-NEXT: b use
%z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
- tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+ tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
ret void
}
@@ -239,13 +239,13 @@ define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %t
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 1]
; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT: b dummy_use_2_nxv4i32
+; CHECK-NEXT: b use
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
%z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
- tail call void @dummy_use_2_nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+ tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
ret void
}
@@ -260,7 +260,7 @@ define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %t
; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 1]
; CHECK-NEXT: mov z2.s, p0/m, za0v.s[w12, 2]
; CHECK-NEXT: mov z3.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT: b dummy_use_4_nxv4f32
+; CHECK-NEXT: b use
%z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -269,7 +269,7 @@ define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %t
%tileslice.3 = add i32 %tileslice, 3
%z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
- tail call void @dummy_use_4_nxv4f32(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+ tail call void @use(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
ret void
}
@@ -301,12 +301,12 @@ define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov z0.d, p0/m, za0h.d[w12, 0]
; CHECK-NEXT: mov z1.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT: b dummy_use_2_nxv2f64
+; CHECK-NEXT: b use
%z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
- tail call void @dummy_use_2_nxv2f64(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+ tail call void @use(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
ret void
}
@@ -453,21 +453,33 @@ define <vscale x 2 x double> @extract_col_q_v2f64(<vscale x 2 x double> %zd, <vs
define <vscale x 4 x i32> @test_sink_offset_operand(<vscale x 4 x i1> %pg, i32 %base, i32 %N) {
; CHECK-LABEL: test_sink_offset_operand:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT: mov z3.s, #0 // =0x0
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: .LBB26_1: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.d, z3.d
+; CHECK-NEXT: mov z1.d, z3.d
; CHECK-NEXT: subs w1, w1, #3
-; CHECK-NEXT: mov z3.d, z0.d
-; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1]
-; CHECK-NEXT: mov z3.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT: mov z2.d, z3.d
+; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 1]
+; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 2]
; CHECK-NEXT: b.ne .LBB26_1
; CHECK-NEXT: // %bb.2: // %exit
-; CHECK-NEXT: add z0.s, z1.s, z2.s
-; CHECK-NEXT: add z0.s, z0.s, z3.s
+; CHECK-NEXT: add z3.s, z0.s, z1.s
+; CHECK-NEXT: add z8.s, z3.s, z2.s
+; CHECK-NEXT: bl use
+; CHECK-NEXT: mov z0.d, z8.d
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%add1 = add i32 %base, 1
@@ -486,6 +498,7 @@ for.body:
exit:
%tmp1 = add <vscale x 4 x i32> %z0, %z1
%res = add <vscale x 4 x i32> %tmp1, %z2
+ tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2)
ret <vscale x 4 x i32> %res
}
@@ -524,33 +537,7 @@ declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i6
declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
; ------------------------------------------------------------------------------
-; Dummy external functions to force code retention.
-; The compiler does not see their implementations, so it must keep the calls.
+; Dummy external function to force code retention.
; ------------------------------------------------------------------------------
-declare void @dummy_use_8_nxv16i8(
- <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
- <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>
-)
-
-declare void @dummy_use_4_nxv8i16(
- <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>
-)
-
-declare void @dummy_use_8_nxv8f16(
- <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
- <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>
-)
-
-declare void @dummy_use_8_nxv8bf16(
- <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
- <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>
-)
-
-declare void @dummy_use_2_nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-
-declare void @dummy_use_4_nxv4f32(
- <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>
-)
-
-declare void @dummy_use_2_nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare void @use(...)
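For context (not part of the patch): the single variadic @use declaration relies on the same property as the per-type dummies it replaces. Because the callee is external, the optimizer must assume it uses its arguments, so the values passed to it cannot be dead-code-eliminated. A minimal sketch of the idiom in isolation (hypothetical function name for the caller):

; Hypothetical example, not taken from the patch.
declare void @use(...)

define void @keep_alive(<vscale x 4 x i32> %v) {
  ; %v stays live because the body of @use is unknown.
  tail call void @use(<vscale x 4 x i32> %v)
  ret void
}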
>From b0900684e2fde319034a415c2c499e98353b2821 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Thu, 10 Apr 2025 13:26:18 +0000
Subject: [PATCH 3/4] Rename SME classes not accessing ZA to SVE
---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 201 +++++++++++-----------
1 file changed, 99 insertions(+), 102 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7648fc55d54ae..222a20d9e4044 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3098,9 +3098,9 @@ let TargetPrefix = "aarch64" in {
[IntrNoMem, IntrHasSideEffects]>;
def int_aarch64_sme_za_enable
- : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+ : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
def int_aarch64_sme_za_disable
- : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+ : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
// Clamp
//
@@ -3240,13 +3240,13 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>, llvm_i32_ty],
[IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
- class SME2_VG2_Multi_Imm_Intrinsic
+ class SVE2_VG2_Multi_Imm_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
- class SME2_VG4_Multi_Imm_Intrinsic
+ class SVE2_VG4_Multi_Imm_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3266,13 +3266,13 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>, LLVMMatchType<0>],
[IntrWriteMem, IntrInaccessibleMemOnly]>;
- class SME2_VG2_Multi_Single_Intrinsic
+ class SVE2_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG4_Multi_Single_Intrinsic
+ class SVE2_VG4_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3280,13 +3280,13 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG2_Multi_Multi_Intrinsic
+ class SVE2_VG2_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG4_Multi_Multi_Intrinsic
+ class SVE2_VG4_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3310,42 +3310,42 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>], [IntrNoMem]>;
- class SME2_CVT_VG2_SINGLE_Intrinsic
+ class SVE2_CVT_VG2_SINGLE_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_CVT_VG2_SINGLE_BF16_Intrinsic
+ class SVE2_CVT_VG2_SINGLE_BF16_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
[llvm_nxv4f32_ty, llvm_nxv4f32_ty],
[IntrNoMem]>;
- class SME2_CVT_WIDENING_VG2_Intrinsic
+ class SVE2_CVT_WIDENING_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
- class SME2_CVT_VG4_SINGLE_Intrinsic
+ class SVE2_CVT_VG4_SINGLE_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_CVT_X2_Intrinsic
+ class SVE2_CVT_X2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
- class SME2_CVT_X4_Intrinsic
+ class SVE2_CVT_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>],
[IntrNoMem]>;
- class SME2_BFMLS_Intrinsic
+ class SVE2_BFMLS_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
[llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
[IntrNoMem]>;
- class SME2_BFMLS_Lane_Intrinsic
+ class SVE2_BFMLS_Lane_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
[llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
@@ -3398,13 +3398,13 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>, LLVMMatchType<0>],
[IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
- class SME2_VG2_Multi_Single_Single_Intrinsic
+ class SVE2_VG2_Multi_Single_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG4_Multi_Single_Single_Intrinsic
+ class SVE2_VG4_Multi_Single_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3422,11 +3422,11 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class SME2_VG2_Unpk_Intrinsic
+ class SVE2_VG2_Unpk_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
- class SME2_VG4_Unpk_Intrinsic
+ class SVE2_VG4_Unpk_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
@@ -3467,33 +3467,33 @@ let TargetPrefix = "aarch64" in {
// Multi-vector rounding shift left intrinsics
//
- def int_aarch64_sve_srshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_urshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_srshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_urshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_srshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_urshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_srshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_urshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_srshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_urshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_srshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
- def int_aarch64_sve_urshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_srshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_urshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_srshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_urshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
// Multi-vector saturating rounding shift right intrinsics
- def int_aarch64_sve_sqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshru_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshru_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshru_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshru_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrun_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrun_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrun_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrun_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
//
// Multi-vector multiply-add/subtract long
@@ -3553,25 +3553,23 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
- def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic;
- def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic;
+ def int_aarch64_sve_bfmlslb : SVE2_BFMLS_Intrinsic;
+ def int_aarch64_sve_bfmlslb_lane : SVE2_BFMLS_Lane_Intrinsic;
- def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
- def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
+ def int_aarch64_sve_bfmlslt : SVE2_BFMLS_Intrinsic;
+ def int_aarch64_sve_bfmlslt_lane : SVE2_BFMLS_Lane_Intrinsic;
// Multi-vector zeroing
foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly]>;
}
-
// Multi-vector signed saturating doubling multiply high
+ def int_aarch64_sve_sqdmulh_single_vgx2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_sqdmulh_single_vgx4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_sqdmulh_single_vgx4 : SME2_VG4_Multi_Single_Intrinsic;
-
- def int_aarch64_sve_sqdmulh_vgx2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_sqdmulh_vgx4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_sqdmulh_vgx2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_sqdmulh_vgx4 : SVE2_VG4_Multi_Multi_Intrinsic;
// Multi-vector floating-point round to integral value
@@ -3586,11 +3584,11 @@ let TargetPrefix = "aarch64" in {
foreach ty = ["f", "s", "u"] in {
foreach instr = ["max", "min"] in {
- def int_aarch64_sve_ # ty # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # ty # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # ty # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_ # ty # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
}
}
@@ -3599,11 +3597,11 @@ let TargetPrefix = "aarch64" in {
//
foreach instr = ["fmaxnm", "fminnm"] in {
- def int_aarch64_sve_ # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
}
//
@@ -3611,8 +3609,8 @@ let TargetPrefix = "aarch64" in {
//
foreach instr = ["famax", "famin"] in {
- def int_aarch64_sme_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sme_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
}
//
@@ -3634,48 +3632,47 @@ let TargetPrefix = "aarch64" in {
//
//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
//
-
- def int_aarch64_sve_fcvtl_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
+ def int_aarch64_sve_fcvtl_widen_x2 : SVE2_CVT_WIDENING_VG2_Intrinsic;
//
// Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
//
- def int_aarch64_sve_fcvtn_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_bfcvtn_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
+ def int_aarch64_sve_fcvtn_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_bfcvtn_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
//
// Multi-vector convert to/from floating-point.
//
- def int_aarch64_sve_fcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
- def int_aarch64_sve_fcvtzs_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_fcvtzu_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_scvtf_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_ucvtf_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_fcvtzs_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_scvtf_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_ucvtf_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
+ def int_aarch64_sve_fcvt_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_bfcvt_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
+ def int_aarch64_sve_fcvtzs_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_fcvtzu_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_scvtf_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_ucvtf_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_fcvtzs_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_fcvtzu_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_scvtf_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_ucvtf_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_fcvt_widen_x2 : SVE2_CVT_WIDENING_VG2_Intrinsic;
//
// Multi-vector saturating extract narrow
//
- def int_aarch64_sve_sqcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtu_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvt_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvt_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtu_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvt_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvt_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtu_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvt_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvt_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtu_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
//
// Multi-vector saturating extract narrow and interleave
//
- def int_aarch64_sve_sqcvtn_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvtn_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtun_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtn_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvtn_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtun_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtn_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvtn_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtun_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtn_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvtn_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtun_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
//
// Multi-Single add/sub
@@ -3694,15 +3691,15 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
// Multi-vector clamps
- def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_uclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_fclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_bfclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_sclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_uclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_fclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_bfclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
//
// Multi-vector add/sub and accumulate into ZA
@@ -3739,8 +3736,8 @@ let TargetPrefix = "aarch64" in {
//
// Multi-Single Vector add
//
- def int_aarch64_sve_add_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_add_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_add_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_add_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
// 2-way and 4-way multi-vector signed/unsigned integer dot-product
foreach ty = ["s", "u"] in {
@@ -3798,10 +3795,10 @@ let TargetPrefix = "aarch64" in {
//
// Signed/unsigned multi-vector unpacks
//
- def int_aarch64_sve_sunpk_x2 : SME2_VG2_Unpk_Intrinsic;
- def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic;
- def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic;
- def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic;
+ def int_aarch64_sve_sunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+ def int_aarch64_sve_uunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+ def int_aarch64_sve_sunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
+ def int_aarch64_sve_uunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
// 2-way and 4-way vector selects
def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic;
@@ -4133,4 +4130,4 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
-}
+}
\ No newline at end of file
>From 77b2736152223b569cfb822cbd167e211d41afc3 Mon Sep 17 00:00:00 2001
From: Marian Lukac <Marian.Lukac at arm.com>
Date: Fri, 11 Apr 2025 15:20:00 +0000
Subject: [PATCH 4/4] Fix incorrect properties for some ZA intrinsics
---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 66 ++++++++++++++++-------
1 file changed, 48 insertions(+), 18 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 222a20d9e4044..dff343b85ef51 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3257,14 +3257,14 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- [IntrWriteMem, IntrInaccessibleMemOnly]>;
+ [IntrInaccessibleMemOnly]>;
class SME2_ZA_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrWriteMem, IntrInaccessibleMemOnly]>;
+ [IntrInaccessibleMemOnly]>;
class SVE2_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
@@ -3677,18 +3677,48 @@ let TargetPrefix = "aarch64" in {
//
// Multi-Single add/sub
//
- def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+ class SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ class SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
//
// Multi-Multi add/sub
//
- def int_aarch64_sme_add_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sme_add_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
- def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+ class SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ class SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ def int_aarch64_sme_add_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_add_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
// Multi-vector clamps
def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
@@ -3984,12 +4014,12 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sve_fp8_fmlalltt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
- class SME2_FP8_CVT_X2_Single_Intrinsic
+ class SVE2_FP8_CVT_X2_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
- class SME2_FP8_CVT_Single_X4_Intrinsic
+ class SVE2_FP8_CVT_Single_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
@@ -4053,14 +4083,14 @@ let TargetPrefix = "aarch64" in {
//
// CVT from FP8 to half-precision/BFloat16 multi-vector
//
- def int_aarch64_sve_fp8_cvt1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
- def int_aarch64_sve_fp8_cvt2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvt1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvt2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
//
// CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector
//
- def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
- def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvtl1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvtl2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
//
// CVT to FP8 from half-precision/BFloat16/single-precision multi-vector
@@ -4070,8 +4100,8 @@ let TargetPrefix = "aarch64" in {
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrReadMem, IntrInaccessibleMemOnly]>;
- def int_aarch64_sve_fp8_cvt_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
- def int_aarch64_sve_fp8_cvtn_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
+ def int_aarch64_sve_fp8_cvt_x4 : SVE2_FP8_CVT_Single_X4_Intrinsic;
+ def int_aarch64_sve_fp8_cvtn_x4 : SVE2_FP8_CVT_Single_X4_Intrinsic;
// FP8 outer product
def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic;
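For reference, a sketch of what the add_write_za declarations should end up looking like in IR once they carry [IntrInaccessibleMemOnly, IntrWriteMem] (assuming the nxv4i32 overload; the exact attribute group contents beyond the memory effects are the usual DefaultAttrsIntrinsic defaults and may differ slightly):

    ; write-only access, restricted to inaccessible memory (the ZA array)
    declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(
        i32, <vscale x 4 x i32>, <vscale x 4 x i32>,
        <vscale x 4 x i32>, <vscale x 4 x i32>) #0

    attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }

This is what lets alias analysis treat these calls as never touching user-visible memory while still ordering them against other ZA accesses.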