[llvm] efe9cb0 - [AArch64] Model ZA array using inaccessible memory (#132058)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 14 06:55:09 PDT 2025
Author: Lukacma
Date: 2025-04-14T14:55:06+01:00
New Revision: efe9cb0f79a074ab472ec51d8463aac6931d670a
URL: https://github.com/llvm/llvm-project/commit/efe9cb0f79a074ab472ec51d8463aac6931d670a
DIFF: https://github.com/llvm/llvm-project/commit/efe9cb0f79a074ab472ec51d8463aac6931d670a.diff
LOG: [AArch64] Model ZA array using inaccessible memory (#132058)
This patch changes how the ZA array is modelled at the LLVM-IR level.
Currently, accesses to ZA are represented at the LLVM-IR level as generic
memory reads and writes, and at the instruction level as unmodelled
side effects. This patch instead models them as purely inaccessible-memory
accesses, without any unmodelled side effects.
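For illustration, here is a rough sketch (not part of the patch) of what the new
TableGen attributes translate to at the IR level: IntrReadMem together with
IntrInaccessibleMemOnly roughly corresponds to memory(inaccessiblemem: read) on
the intrinsic declaration, so passes no longer need to assume the call may touch
arbitrary memory. The exact attribute list is generated from the .td definition
and may differ slightly:

declare <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i32 immarg, i32) #0

; Approximate attribute set produced by [IntrReadMem, IntrInaccessibleMemOnly]
; on a DefaultAttrsIntrinsic; with the old definition the declaration carried
; no memory(...) restriction and had to be treated conservatively.
attributes #0 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }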
Added:
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/SMEInstrFormats.td
llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 77ea0bcaa4b5f..2c6129cedebbf 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2940,7 +2940,7 @@ def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;
let TargetPrefix = "aarch64" in {
class SME_Load_Store_Intrinsic<LLVMType pred_ty>
: DefaultAttrsIntrinsic<[],
- [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+ [pred_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>]>;
// Loads
def int_aarch64_sme_ld1b_horiz : SME_Load_Store_Intrinsic<llvm_nxv16i1_ty>;
@@ -2968,18 +2968,18 @@ let TargetPrefix = "aarch64" in {
// Spill + fill
class SME_LDR_STR_ZA_Intrinsic
- : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
+ : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly]>;
def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic;
def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic;
class SME_TileToVector_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>]>;
+ llvm_i32_ty, llvm_i32_ty], [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<2>>]>;
class SME_VectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+ llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_read_horiz : SME_TileToVector_Intrinsic;
def int_aarch64_sme_read_vert : SME_TileToVector_Intrinsic;
@@ -2994,13 +2994,13 @@ let TargetPrefix = "aarch64" in {
class SME_MOVAZ_TileToVector_X2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
class SME_MOVAZ_TileToVector_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>,LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_readz_horiz_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
def int_aarch64_sme_readz_vert_x2 : SME_MOVAZ_TileToVector_X2_Intrinsic;
@@ -3011,7 +3011,7 @@ let TargetPrefix = "aarch64" in {
class SME_MOVAZ_TileToVector_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_readz_horiz : SME_MOVAZ_TileToVector_Intrinsic;
def int_aarch64_sme_readz_vert : SME_MOVAZ_TileToVector_Intrinsic;
@@ -3022,12 +3022,12 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_readz_x2
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
def int_aarch64_sme_readz_x4
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty],
- [IntrNoMem, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
def int_aarch64_sme_write_lane_zt
: DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty],
@@ -3038,7 +3038,7 @@ let TargetPrefix = "aarch64" in {
[ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrWriteMem]>;
- def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
+ def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_in_streaming_mode : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrNoMem]>, ClangBuiltin<"__builtin_arm_in_streaming_mode">;
class SME_OuterProduct_Intrinsic
@@ -3047,7 +3047,7 @@ let TargetPrefix = "aarch64" in {
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMMatchType<0>,
- llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+ llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_mopa : SME_OuterProduct_Intrinsic;
def int_aarch64_sme_mops : SME_OuterProduct_Intrinsic;
@@ -3112,7 +3112,7 @@ let TargetPrefix = "aarch64" in {
[llvm_i32_ty,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
- llvm_anyvector_ty], [ImmArg<ArgIndex<0>>]>;
+ llvm_anyvector_ty], [IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
def int_aarch64_sme_addha : SME_AddVectorToTile_Intrinsic;
def int_aarch64_sme_addva : SME_AddVectorToTile_Intrinsic;
@@ -3141,9 +3141,9 @@ let TargetPrefix = "aarch64" in {
[IntrNoMem, IntrHasSideEffects]>;
def int_aarch64_sme_za_enable
- : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+ : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
def int_aarch64_sme_za_disable
- : DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
+ : DefaultAttrsIntrinsic<[], [], [IntrWriteMem, IntrInaccessibleMemOnly]>;
// Clamp
//
@@ -3232,64 +3232,64 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_Matrix_ArrayVector_Single_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty,
LLVMMatchType<0>, llvm_i32_ty],
- [ImmArg<ArgIndex<3>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
class SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty],
- [ImmArg<ArgIndex<4>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
class SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, llvm_i32_ty],
- [ImmArg<ArgIndex<6>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
- class SME2_VG2_Multi_Imm_Intrinsic
+ class SVE2_VG2_Multi_Imm_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
- class SME2_VG4_Multi_Imm_Intrinsic
+ class SVE2_VG4_Multi_Imm_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3300,22 +3300,22 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
class SME2_ZA_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrInaccessibleMemOnly]>;
- class SME2_VG2_Multi_Single_Intrinsic
+ class SVE2_VG2_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG4_Multi_Single_Intrinsic
+ class SVE2_VG4_Multi_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3323,13 +3323,13 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG2_Multi_Multi_Intrinsic
+ class SVE2_VG2_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG4_Multi_Multi_Intrinsic
+ class SVE2_VG4_Multi_Multi_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3353,42 +3353,42 @@ let TargetPrefix = "aarch64" in {
LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>], [IntrNoMem]>;
- class SME2_CVT_VG2_SINGLE_Intrinsic
+ class SVE2_CVT_VG2_SINGLE_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide2VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_CVT_VG2_SINGLE_BF16_Intrinsic
+ class SVE2_CVT_VG2_SINGLE_BF16_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty],
[llvm_nxv4f32_ty, llvm_nxv4f32_ty],
[IntrNoMem]>;
- class SME2_CVT_WIDENING_VG2_Intrinsic
+ class SVE2_CVT_WIDENING_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
- class SME2_CVT_VG4_SINGLE_Intrinsic
+ class SVE2_CVT_VG4_SINGLE_Intrinsic
: DefaultAttrsIntrinsic<[LLVMSubdivide4VectorType<0>],
[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_CVT_X2_Intrinsic
+ class SVE2_CVT_X2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_anyvector_ty, LLVMMatchType<1>],
[IntrNoMem]>;
- class SME2_CVT_X4_Intrinsic
+ class SVE2_CVT_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>],
[IntrNoMem]>;
- class SME2_BFMLS_Intrinsic
+ class SVE2_BFMLS_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
[llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
[IntrNoMem]>;
- class SME2_BFMLS_Lane_Intrinsic
+ class SVE2_BFMLS_Lane_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
[llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<3>>]>;
@@ -3396,58 +3396,58 @@ let TargetPrefix = "aarch64" in {
class SME2_ZA_ArrayVector_Read_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_ArrayVector_Read_VG4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_Matrix_TileVector_Read_VG2_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_Matrix_TileVector_Read_VG4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_i32_ty],
- []>;
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_ArrayVector_Write_VG2_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- []>;
+ [IntrWriteMem, IntrInaccessibleMemOnly]>;
class SME2_ZA_ArrayVector_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- []>;
+ [IntrWriteMem, IntrInaccessibleMemOnly]>;
class SME2_Matrix_TileVector_Write_VG2_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty, llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>],
- [ImmArg<ArgIndex<0>>]>;
+ [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
class SME2_Matrix_TileVector_Write_VG4_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty, llvm_i32_ty,
llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
- [ImmArg<ArgIndex<0>>]>;
+ [IntrWriteMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>]>;
- class SME2_VG2_Multi_Single_Single_Intrinsic
+ class SVE2_VG2_Multi_Single_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[IntrNoMem]>;
- class SME2_VG4_Multi_Single_Single_Intrinsic
+ class SVE2_VG4_Multi_Single_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMMatchType<0>, LLVMMatchType<0>,
@@ -3465,11 +3465,11 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
- class SME2_VG2_Unpk_Intrinsic
+ class SVE2_VG2_Unpk_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
- class SME2_VG4_Unpk_Intrinsic
+ class SVE2_VG4_Unpk_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
LLVMMatchType<0>, LLVMMatchType<0>],
[LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
@@ -3510,33 +3510,33 @@ let TargetPrefix = "aarch64" in {
// Multi-vector rounding shift left intrinsics
//
- def int_aarch64_sve_srshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_urshl_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_srshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_urshl_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_srshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_urshl_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_srshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_urshl_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_srshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_urshl_x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_srshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
- def int_aarch64_sve_urshl_x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_srshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_urshl_x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_srshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_urshl_x4 : SVE2_VG4_Multi_Multi_Intrinsic;
// Multi-vector saturating rounding shift right intrinsics
- def int_aarch64_sve_sqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshr_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshr_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshr_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshr_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshrn_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_uqrshrn_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshrn_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_uqrshrn_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshru_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshru_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshru_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshru_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrun_x2 : SME2_VG2_Multi_Imm_Intrinsic;
- def int_aarch64_sve_sqrshrun_x4 : SME2_VG4_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrun_x2 : SVE2_VG2_Multi_Imm_Intrinsic;
+ def int_aarch64_sve_sqrshrun_x4 : SVE2_VG4_Multi_Imm_Intrinsic;
//
// Multi-vector multiply-add/subtract long
@@ -3596,25 +3596,23 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_usmla_za32_lane_vg4x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
def int_aarch64_sme_usmla_za32_lane_vg4x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
- def int_aarch64_sve_bfmlslb : SME2_BFMLS_Intrinsic;
- def int_aarch64_sve_bfmlslb_lane : SME2_BFMLS_Lane_Intrinsic;
+ def int_aarch64_sve_bfmlslb : SVE2_BFMLS_Intrinsic;
+ def int_aarch64_sve_bfmlslb_lane : SVE2_BFMLS_Lane_Intrinsic;
- def int_aarch64_sve_bfmlslt : SME2_BFMLS_Intrinsic;
- def int_aarch64_sve_bfmlslt_lane : SME2_BFMLS_Lane_Intrinsic;
+ def int_aarch64_sve_bfmlslt : SVE2_BFMLS_Intrinsic;
+ def int_aarch64_sve_bfmlslt_lane : SVE2_BFMLS_Lane_Intrinsic;
// Multi-vector zeroing
foreach vg = ["vg1x2", "vg1x4", "vg2x1", "vg2x2", "vg2x4", "vg4x1", "vg4x2", "vg4x4"] in {
- def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects]>;
+ def int_aarch64_sme_zero_za64_ # vg : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrWriteMem, IntrInaccessibleMemOnly]>;
}
-
// Multi-vector signed saturating doubling multiply high
+ def int_aarch64_sve_sqdmulh_single_vgx2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_sqdmulh_single_vgx4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_sqdmulh_single_vgx2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_sqdmulh_single_vgx4 : SME2_VG4_Multi_Single_Intrinsic;
-
- def int_aarch64_sve_sqdmulh_vgx2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_sqdmulh_vgx4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_sqdmulh_vgx2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_sqdmulh_vgx4 : SVE2_VG4_Multi_Multi_Intrinsic;
// Multi-vector floating-point round to integral value
@@ -3629,11 +3627,11 @@ let TargetPrefix = "aarch64" in {
foreach ty = ["f", "s", "u"] in {
foreach instr = ["max", "min"] in {
- def int_aarch64_sve_ # ty # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # ty # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # ty # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_ # ty # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # ty # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
}
}
@@ -3642,11 +3640,11 @@ let TargetPrefix = "aarch64" in {
//
foreach instr = ["fmaxnm", "fminnm"] in {
- def int_aarch64_sve_ # instr # _single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # instr # _single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # instr # _single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_ # instr # _single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sve_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sve_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sve_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
}
//
@@ -3654,8 +3652,8 @@ let TargetPrefix = "aarch64" in {
//
foreach instr = ["famax", "famin"] in {
- def int_aarch64_sme_ # instr # _x2 : SME2_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sme_ # instr # _x4 : SME2_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_ # instr # _x2 : SVE2_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_ # instr # _x4 : SVE2_VG4_Multi_Multi_Intrinsic;
}
//
@@ -3677,75 +3675,104 @@ let TargetPrefix = "aarch64" in {
//
//Multi-vector floating-point convert from half-precision to deinterleaved single-precision.
//
-
- def int_aarch64_sve_fcvtl_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
+ def int_aarch64_sve_fcvtl_widen_x2 : SVE2_CVT_WIDENING_VG2_Intrinsic;
//
// Multi-vector floating-point CVT from single-precision to interleaved half-precision/BFloat16
//
- def int_aarch64_sve_fcvtn_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_bfcvtn_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
+ def int_aarch64_sve_fcvtn_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_bfcvtn_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
//
// Multi-vector convert to/from floating-point.
//
- def int_aarch64_sve_fcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
- def int_aarch64_sve_fcvtzs_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_fcvtzu_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_scvtf_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_ucvtf_x2 : SME2_CVT_X2_Intrinsic;
- def int_aarch64_sve_fcvtzs_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_scvtf_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_ucvtf_x4 : SME2_CVT_X4_Intrinsic;
- def int_aarch64_sve_fcvt_widen_x2 : SME2_CVT_WIDENING_VG2_Intrinsic;
+ def int_aarch64_sve_fcvt_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_bfcvt_x2 : SVE2_CVT_VG2_SINGLE_BF16_Intrinsic;
+ def int_aarch64_sve_fcvtzs_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_fcvtzu_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_scvtf_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_ucvtf_x2 : SVE2_CVT_X2_Intrinsic;
+ def int_aarch64_sve_fcvtzs_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_fcvtzu_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_scvtf_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_ucvtf_x4 : SVE2_CVT_X4_Intrinsic;
+ def int_aarch64_sve_fcvt_widen_x2 : SVE2_CVT_WIDENING_VG2_Intrinsic;
//
// Multi-vector saturating extract narrow
//
- def int_aarch64_sve_sqcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtu_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvt_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvt_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtu_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvt_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvt_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtu_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvt_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvt_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtu_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
//
// Multi-vector saturating extract narrow and interleave
//
- def int_aarch64_sve_sqcvtn_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvtn_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtun_x2 : SME2_CVT_VG2_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtn_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_uqcvtn_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
- def int_aarch64_sve_sqcvtun_x4 : SME2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtn_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvtn_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtun_x2 : SVE2_CVT_VG2_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtn_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_uqcvtn_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
+ def int_aarch64_sve_sqcvtun_x4 : SVE2_CVT_VG4_SINGLE_Intrinsic;
//
// Multi-Single add/sub
//
- def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
- def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Single_Intrinsic;
+
+ class SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ class SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ def int_aarch64_sme_add_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_sub_write_single_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sme_add_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sme_sub_write_single_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Single_Intrinsic;
//
// Multi-Multi add/sub
//
- def int_aarch64_sme_add_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Multi_Intrinsic;
- def int_aarch64_sme_add_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
- def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Multi_Intrinsic;
+ class SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ class SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrInaccessibleMemOnly, IntrWriteMem]>;
+
+ def int_aarch64_sme_add_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_sub_write_za_vg1x2 : SME2_Add_Sub_Write_VG2_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_add_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
+ def int_aarch64_sme_sub_write_za_vg1x4 : SME2_Add_Sub_Write_VG4_Multi_Multi_Intrinsic;
// Multi-vector clamps
- def int_aarch64_sve_sclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_uclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_fclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_bfclamp_single_x2 : SME2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_sclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_uclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_fclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_bfclamp_single_x2 : SVE2_VG2_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_sclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_uclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_fclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
- def int_aarch64_sve_bfclamp_single_x4 : SME2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_sclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_uclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_fclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
+ def int_aarch64_sve_bfclamp_single_x4 : SVE2_VG4_Multi_Single_Single_Intrinsic;
//
// Multi-vector add/sub and accumulate into ZA
@@ -3782,8 +3809,8 @@ let TargetPrefix = "aarch64" in {
//
// Multi-Single Vector add
//
- def int_aarch64_sve_add_single_x2 : SME2_VG2_Multi_Single_Intrinsic;
- def int_aarch64_sve_add_single_x4 : SME2_VG4_Multi_Single_Intrinsic;
+ def int_aarch64_sve_add_single_x2 : SVE2_VG2_Multi_Single_Intrinsic;
+ def int_aarch64_sve_add_single_x4 : SVE2_VG4_Multi_Single_Intrinsic;
// 2-way and 4-way multi-vector signed/unsigned integer dot-product
foreach ty = ["s", "u"] in {
@@ -3841,10 +3868,10 @@ let TargetPrefix = "aarch64" in {
//
// Signed/unsigned multi-vector unpacks
//
- def int_aarch64_sve_sunpk_x2 : SME2_VG2_Unpk_Intrinsic;
- def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic;
- def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic;
- def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic;
+ def int_aarch64_sve_sunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+ def int_aarch64_sve_uunpk_x2 : SVE2_VG2_Unpk_Intrinsic;
+ def int_aarch64_sve_sunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
+ def int_aarch64_sve_uunpk_x4 : SVE2_VG4_Unpk_Intrinsic;
// 2-way and 4-way vector selects
def int_aarch64_sve_sel_x2 : SVE2_VG2_Sel_Intrinsic;
@@ -4030,12 +4057,12 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sve_fp8_fmlalltt : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
- class SME2_FP8_CVT_X2_Single_Intrinsic
+ class SVE2_FP8_CVT_X2_Single_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
- class SME2_FP8_CVT_Single_X4_Intrinsic
+ class SVE2_FP8_CVT_Single_X4_Intrinsic
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty, llvm_nxv4f32_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
@@ -4045,68 +4072,68 @@ let TargetPrefix = "aarch64" in {
[llvm_i32_ty,
llvm_nxv16i1_ty, llvm_nxv16i1_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
- [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_LANE_VGx1_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<3>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
class SME_FP8_ZA_LANE_VGx2_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<4>>]>;
class SME_FP8_ZA_LANE_VGx4_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty,
llvm_i32_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
+ [IntrInaccessibleMemOnly, ImmArg<ArgIndex<6>>]>;
class SME_FP8_ZA_SINGLE_VGx1_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_SINGLE_VGx2_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_SINGLE_VGx4_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_MULTI_VGx2_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
class SME_FP8_ZA_MULTI_VGx4_Intrinsic
: DefaultAttrsIntrinsic<[], [llvm_i32_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
- [IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+ [IntrInaccessibleMemOnly]>;
//
// CVT from FP8 to half-precision/BFloat16 multi-vector
//
- def int_aarch64_sve_fp8_cvt1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
- def int_aarch64_sve_fp8_cvt2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvt1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvt2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
//
// CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector
//
- def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
- def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvtl1_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvtl2_x2 : SVE2_FP8_CVT_X2_Single_Intrinsic;
//
// CVT to FP8 from half-precision/BFloat16/single-precision multi-vector
@@ -4116,8 +4143,8 @@ let TargetPrefix = "aarch64" in {
[llvm_anyvector_ty, LLVMMatchType<0>],
[IntrReadMem, IntrInaccessibleMemOnly]>;
- def int_aarch64_sve_fp8_cvt_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
- def int_aarch64_sve_fp8_cvtn_x4 : SME2_FP8_CVT_Single_X4_Intrinsic;
+ def int_aarch64_sve_fp8_cvt_x4 : SVE2_FP8_CVT_Single_X4_Intrinsic;
+ def int_aarch64_sve_fp8_cvtn_x4 : SVE2_FP8_CVT_Single_X4_Intrinsic;
// FP8 outer product
def int_aarch64_sme_fp8_fmopa_za16 : SME_FP8_OuterProduct_Intrinsic;
@@ -4176,4 +4203,4 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
-}
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index c008cda21cf05..b611dddb0b045 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -102,6 +102,8 @@ class sme_outer_product_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
// Translated to the actual instructions in AArch64ISelLowering.cpp
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_quarter_tile_outer_product_pseudo<RegisterOperand zn_ty, RegisterOperand zm_ty, SMEMatrixTypeEnum za_flag>
@@ -119,6 +121,8 @@ class sme2_za_array_2op_multi_single_pseudo<string name, Operand index_ty, Regis
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -127,6 +131,8 @@ class sme2_za_array_2op_multi_multi_pseudo<string name, Operand index_ty, Regist
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, multi_vector_ty:$Zm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -135,6 +141,8 @@ class sme2_za_array_2op_multi_index_pseudo<string name, Operand index_ty, Regist
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3, multi_vector_ty:$Zn, zpr_ty:$Zm, imm_ty:$i), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -142,6 +150,7 @@ class sme2_move_to_za_pseudo<string name, Operand imm_ty, RegisterOperand multi_
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand multi_vector_ty, SMEMatrixTypeEnum za_flag>
@@ -149,6 +158,7 @@ class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, Re
Pseudo<(outs), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum za_flag>
@@ -156,6 +166,7 @@ class sem2p1_zero_matrix_pseudo<string name, Operand index_ty, SMEMatrixTypeEnum
Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rs, index_ty:$imm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, RegisterOperand vector_ty, SMEMatrixTypeEnum za_flag>
@@ -163,6 +174,8 @@ class sme2_movez_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty, R
Pseudo<(outs vector_ty:$Zn), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOperand multi_vector_ty,
@@ -171,6 +184,8 @@ class sme2_movaz_array_to_tile_pseudo<string name, Operand index_ty, RegisterOpe
Pseudo<(outs multi_vector_ty:$Zd), (ins MatrixIndexGPR32Op8_11:$Rv, index_ty:$imm3), []> {
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
//===----------------------------------------------------------------------===//
@@ -757,6 +772,8 @@ class sme_add_vector_to_tile_pseudo<ZPRRegOp zpr_ty, SMEMatrixTypeEnum za_flag>
// Translated to the actual instructions in AArch64ISelLowering.cpp
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
}
multiclass sme_add_vector_to_tile_u32<bit V, string mnemonic, SDPatternOperator op> {
@@ -1215,6 +1232,7 @@ class sme_mova_insert_pseudo<SMEMatrixTypeEnum za_flag>
// Translated to the actual instructions in AArch64ISelLowering.cpp
let SMEMatrixType = za_flag;
let usesCustomInserter = 1;
+ let mayStore = 1;
}
multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
@@ -1409,6 +1427,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
is_col, sme_elm_idx0_15, mnemonic> {
bits<4> imm;
let Inst{8-5} = imm;
+ let mayLoad = 1;
}
def _H : sme_tile_to_vector_inst<0b0, 0b01, ZPR16, !if(is_col, TileVectorOpV16,
TileVectorOpH16),
@@ -1417,6 +1436,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
bits<3> imm;
let Inst{8} = ZAn;
let Inst{7-5} = imm;
+ let mayLoad = 1;
}
def _S : sme_tile_to_vector_inst<0b0, 0b10, ZPR32, !if(is_col, TileVectorOpV32,
TileVectorOpH32),
@@ -1425,6 +1445,7 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
bits<2> imm;
let Inst{8-7} = ZAn;
let Inst{6-5} = imm;
+ let mayLoad = 1;
}
def _D : sme_tile_to_vector_inst<0b0, 0b11, ZPR64, !if(is_col, TileVectorOpV64,
TileVectorOpH64),
@@ -1433,12 +1454,14 @@ multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
bits<1> imm;
let Inst{8-6} = ZAn;
let Inst{5} = imm;
+ let mayLoad = 1;
}
def _Q : sme_tile_to_vector_inst<0b1, 0b11, ZPR128, !if(is_col, TileVectorOpV128,
TileVectorOpH128),
is_col, sme_elm_idx0_0, mnemonic> {
bits<4> ZAn;
let Inst{8-5} = ZAn;
+ let mayLoad = 1;
}
defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _B), ZPR8,
@@ -1909,7 +1932,9 @@ multiclass sme2_multivec_accum_add_sub_vg2<string mnemonic, bits<4> op,
def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
(!cast<Instruction>(NAME) matrix_ty:$ZAdn, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
- def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+ def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+ let mayLoad = 1;
+ }
def : SME2_ZA_VG1x2_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
}
@@ -1932,7 +1957,9 @@ multiclass sme2_multivec_accum_add_sub_vg4<string mnemonic, bits<4> op,
def : InstAlias<mnemonic # "\t$ZAdn[$Rv, $imm3], $Zm",
(!cast<Instruction>(NAME) matrix_ty:$ZAdn, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3, vector_ty:$Zm), 0>;
- def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>;
+ def _PSEUDO : sme2_move_to_za_pseudo<NAME, sme_elm_idx0_7, vector_ty, SMEMatrixArray>{
+ let mayLoad = 1;
+ }
def : SME2_ZA_VG1x4_Multi_Pat<NAME, intrinsic, vty, sme_elm_idx0_7, tileslice16>;
}
@@ -4441,6 +4468,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
uimm3s2range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
bits<3> imm;
let Inst{7-5} = imm;
+ let mayLoad = 1;
}
def _H : sme2_mova_tile_to_vec_vg2_multi_base<0b01, v, opc, ZZ_h_mul_r,
@@ -4451,6 +4479,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
bits<2> imm;
let Inst{7} = ZAn;
let Inst{6-5} = imm;
+ let mayLoad = 1;
}
def _S : sme2_mova_tile_to_vec_vg2_multi_base<0b10, v, opc, ZZ_s_mul_r,
@@ -4461,6 +4490,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
bits<1> imm;
let Inst{7-6} = ZAn;
let Inst{5} = imm;
+ let mayLoad = 1;
}
def _D : sme2_mova_tile_to_vec_vg2_multi_base<0b11, v, opc, ZZ_d_mul_r,
@@ -4469,6 +4499,7 @@ multiclass sme2_mova_tile_to_vec_vg2_multi_inst<bit v, bits<3> opc, string mnemo
uimm0s2range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
bits<3> ZAn;
let Inst{7-5} = ZAn;
+ let mayLoad = 1;
}
if !eq(mnemonic, "mova") then {
@@ -4583,6 +4614,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
uimm2s4range, mnemonic>, SMEPseudo2Instr<NAME # _B, 1> {
bits<2> imm;
let Inst{6-5} = imm;
+ let mayLoad = 1;
}
def _H : sme2_mova_tile_to_vec_vg4_multi_base<0b01, v, {opc,0,?,?},
@@ -4594,6 +4626,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
bits<1> imm;
let Inst{6} = ZAn;
let Inst{5} = imm;
+ let mayLoad = 1;
}
def _S : sme2_mova_tile_to_vec_vg4_multi_base<0b10, v, {opc,0,?,?},
@@ -4603,6 +4636,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _S, 1> {
bits<2> ZAn;
let Inst{6-5} = ZAn;
+ let mayLoad = 1;
}
def _D : sme2_mova_tile_to_vec_vg4_multi_base<0b11, v, {opc,?,?,?},
@@ -4612,6 +4646,7 @@ multiclass sme2_mova_tile_to_vec_vg4_multi_base<bit v, bits<3> opc, string mnemo
uimm0s4range, mnemonic>, SMEPseudo2Instr<NAME # _D, 1> {
bits<3> ZAn;
let Inst{7-5} = ZAn;
+ let mayLoad = 1;
}
if !eq(mnemonic, "mova") then {
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 1960987cce4ce..12d945f575f68 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,27 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
-define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_b:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.b, p0/m, za0h.b[w12, 0]
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 2]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.b, p0/m, za0h.b[w12, 0]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.b, p0/m, za0h.b[w12, 2]
; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 4]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 6]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 8]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 10]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 12]
-; CHECK-NEXT: mov z0.b, p0/m, za0h.b[w12, 14]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.b, p0/m, za0h.b[w12, 6]
+; CHECK-NEXT: mov z4.b, p0/m, za0h.b[w12, 8]
+; CHECK-NEXT: mov z5.b, p0/m, za0h.b[w12, 10]
+; CHECK-NEXT: mov z6.b, p0/m, za0h.b[w12, 12]
+; CHECK-NEXT: mov z7.b, p0/m, za0h.b[w12, 14]
+; CHECK-NEXT: b use
%z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -37,30 +36,33 @@ define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x
%z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.12)
%tileslice.14 = add i32 %tileslice, 14
%z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.horiz.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.14)
- ret <vscale x 16 x i8> %z0
+
+ ; Force retention of z0..z7
+ tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+ <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+ ret void
}
-define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
+define void @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_col_b:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.b, p0/m, za0v.b[w12, 1]
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 3]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.b, p0/m, za0v.b[w12, 1]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.b, p0/m, za0v.b[w12, 3]
; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 5]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 7]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 9]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 11]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 13]
-; CHECK-NEXT: mov z0.b, p0/m, za0v.b[w12, 15]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.b, p0/m, za0v.b[w12, 7]
+; CHECK-NEXT: mov z4.b, p0/m, za0v.b[w12, 9]
+; CHECK-NEXT: mov z5.b, p0/m, za0v.b[w12, 11]
+; CHECK-NEXT: mov z6.b, p0/m, za0v.b[w12, 13]
+; CHECK-NEXT: mov z7.b, p0/m, za0v.b[w12, 15]
+; CHECK-NEXT: b use
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
@@ -77,22 +79,24 @@ define <vscale x 16 x i8> @extract_col_b(<vscale x 16 x i8> %zd, <vscale x 16 x
%z6 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.13)
%tileslice.15 = add i32 %tileslice, 15
%z7 = call <vscale x 16 x i8> @llvm.aarch64.sme.read.vert.nxv16i8(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 0, i32 %tileslice.15)
- ret <vscale x 16 x i8> %z0
+
+ tail call void @use(<vscale x 16 x i8> %z0, <vscale x 16 x i8> %z1, <vscale x 16 x i8> %z2, <vscale x 16 x i8> %z3,
+ <vscale x 16 x i8> %z4, <vscale x 16 x i8> %z5, <vscale x 16 x i8> %z6, <vscale x 16 x i8> %z7)
+ ret void
}
-define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_h:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 2]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 2]
; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 6]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za0h.h[w12, 6]
+; CHECK-NEXT: b use
%z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.2)
@@ -100,22 +104,23 @@ define <vscale x 8 x i16> @extract_row_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
%z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.4)
%tileslice.6 = add i32 %tileslice, 6
%z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.horiz.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
- ret <vscale x 8 x i16> %z0
+
+ tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
+ ret void
}
-define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_col_h:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 1]
-; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 3]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za1v.h[w12, 1]
+; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 3]
; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 5]
-; CHECK-NEXT: mov z0.h, p0/m, za1v.h[w12, 7]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za1v.h[w12, 7]
+; CHECK-NEXT: b use
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
@@ -124,30 +129,31 @@ define <vscale x 8 x i16> @extract_col_h(<vscale x 8 x i16> %zd, <vscale x 8 x i
%z2 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.5)
%tileslice.7 = add i32 %tileslice, 7
%z3 = call <vscale x 8 x i16> @llvm.aarch64.sme.read.vert.nxv8i16(<vscale x 8 x i16> %zd, <vscale x 8 x i1> %pg, i32 1, i32 %tileslice.7)
- ret <vscale x 8 x i16> %z0
+
+ tail call void @use(<vscale x 8 x i16> %z0, <vscale x 8 x i16> %z1, <vscale x 8 x i16> %z2, <vscale x 8 x i16> %z3)
+ ret void
}
-define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
+define void @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_f16:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT: mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT: mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT: mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT: mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT: b use
%z0 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 8 x half> @llvm.aarch64.sme.read.horiz.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -163,30 +169,32 @@ define <vscale x 8 x half> @extract_f16(<vscale x 8 x half> %zd, <vscale x 8 x i
%z6 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
%tileslice.7 = add i32 %tileslice, 7
%z7 = call <vscale x 8 x half> @llvm.aarch64.sme.read.vert.nxv8f16(<vscale x 8 x half> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
- ret <vscale x 8 x half> %z0
+
+ tail call void @use(<vscale x 8 x half> %z0, <vscale x 8 x half> %z1, <vscale x 8 x half> %z2, <vscale x 8 x half> %z3,
+ <vscale x 8 x half> %z4, <vscale x 8 x half> %z5, <vscale x 8 x half> %z6, <vscale x 8 x half> %z7)
+ ret void
}
-define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
+define void @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 %tileslice, ptr %ptr) {
; CHECK-LABEL: extract_bf16:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z7.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0]
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z3.d, z0.d
+; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 0]
+; CHECK-NEXT: mov z4.d, z7.d
+; CHECK-NEXT: mov z5.d, z7.d
+; CHECK-NEXT: mov z6.d, z7.d
+; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 1]
; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 2]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 3]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 5]
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z2.h, p0/m, za0v.h[w12, 6]
-; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.h, p0/m, za0v.h[w12, 3]
+; CHECK-NEXT: mov z4.h, p0/m, za0h.h[w12, 4]
+; CHECK-NEXT: mov z5.h, p0/m, za0h.h[w12, 5]
+; CHECK-NEXT: mov z6.h, p0/m, za0v.h[w12, 6]
+; CHECK-NEXT: mov z7.h, p0/m, za0v.h[w12, 7]
+; CHECK-NEXT: b use
%z0 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.horiz.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -202,53 +210,57 @@ define <vscale x 8 x bfloat> @extract_bf16(<vscale x 8 x bfloat> %zd, <vscale x
%z6 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.6)
%tileslice.7 = add i32 %tileslice, 7
%z7 = call <vscale x 8 x bfloat> @llvm.aarch64.sme.read.vert.nxv8bf16(<vscale x 8 x bfloat> %zd, <vscale x 8 x i1> %pg, i32 0, i32 %tileslice.7)
- ret <vscale x 8 x bfloat> %z0
+
+ tail call void @use(<vscale x 8 x bfloat> %z0, <vscale x 8 x bfloat> %z1, <vscale x 8 x bfloat> %z2, <vscale x 8 x bfloat> %z3,
+ <vscale x 8 x bfloat> %z4, <vscale x 8 x bfloat> %z5, <vscale x 8 x bfloat> %z6, <vscale x 8 x bfloat> %z7)
+ ret void
}
-define <vscale x 4 x i32> @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_row_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 2]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 2]
+; CHECK-NEXT: b use
%z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.2 = add i32 %tileslice, 2
%z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.horiz.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
- ret <vscale x 4 x i32> %z0
+
+ tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+ ret void
}
-define <vscale x 4 x i32> @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_col_s(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_col_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 1]
-; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 3]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 1]
+; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 3]
+; CHECK-NEXT: b use
%tileslice.1 = add i32 %tileslice, 1
%z0 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.1)
%tileslice.3 = add i32 %tileslice, 3
%z1 = call <vscale x 4 x i32> @llvm.aarch64.sme.read.vert.nxv4i32(<vscale x 4 x i32> %zd, <vscale x 4 x i1> %pg, i32 3, i32 %tileslice.3)
- ret <vscale x 4 x i32> %z0
+
+ tail call void @use(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1)
+ ret void
}
-define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
+define void @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_f32:
; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0]
-; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1]
; CHECK-NEXT: mov z2.d, z0.d
+; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0]
+; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 1]
; CHECK-NEXT: mov z2.s, p0/m, za0v.s[w12, 2]
-; CHECK-NEXT: mov z0.s, p0/m, za0v.s[w12, 3]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z3.s, p0/m, za0v.s[w12, 3]
+; CHECK-NEXT: b use
%z0 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 4 x float> @llvm.aarch64.sme.read.horiz.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.1)
@@ -256,7 +268,9 @@ define <vscale x 4 x float> @extract_f32(<vscale x 4 x float> %zd, <vscale x 4 x
%z2 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.2)
%tileslice.3 = add i32 %tileslice, 3
%z3 = call <vscale x 4 x float> @llvm.aarch64.sme.read.vert.nxv4f32(<vscale x 4 x float> %zd, <vscale x 4 x i1> %pg, i32 0, i32 %tileslice.3)
- ret <vscale x 4 x float> %z0
+
+ tail call void @use(<vscale x 4 x float> %z0, <vscale x 4 x float> %z1, <vscale x 4 x float> %z2, <vscale x 4 x float> %z3)
+ ret void
}
define <vscale x 2 x i64> @extract_row_d(<vscale x 2 x i64> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
@@ -280,19 +294,20 @@ define <vscale x 2 x i64> @extract_col_d(<vscale x 2 x i64> %zd, <vscale x 2 x i
ret <vscale x 2 x i64> %z0
}
-define <vscale x 2 x double> @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
+define void @extract_f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.d, z0.d
; CHECK-NEXT: mov w12, w0
-; CHECK-NEXT: mov z1.d, p0/m, za0h.d[w12, 0]
-; CHECK-NEXT: mov z0.d, p0/m, za0v.d[w12, 1]
-; CHECK-NEXT: mov z0.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-NEXT: mov z0.d, p0/m, za0h.d[w12, 0]
+; CHECK-NEXT: mov z1.d, p0/m, za0v.d[w12, 1]
+; CHECK-NEXT: b use
%z0 = call <vscale x 2 x double> @llvm.aarch64.sme.read.horiz.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice)
%tileslice.1 = add i32 %tileslice, 1
%z1 = call <vscale x 2 x double> @llvm.aarch64.sme.read.vert.nxv2f64(<vscale x 2 x double> %zd, <vscale x 2 x i1> %pg, i32 0, i32 %tileslice.1)
- ret <vscale x 2 x double> %z0
+
+ tail call void @use(<vscale x 2 x double> %z0, <vscale x 2 x double> %z1)
+ ret void
}
define <vscale x 16 x i8> @extract_row_q_v16i18(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg) {
@@ -506,3 +521,9 @@ declare <vscale x 4 x i32> @llvm.aarch64.sme.readq.vert.nxv4i32(<vscale x 4 x i3
declare <vscale x 4 x float> @llvm.aarch64.sme.readq.vert.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, i32, i32)
declare <vscale x 2 x i64> @llvm.aarch64.sme.readq.vert.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32, i32)
declare <vscale x 2 x double> @llvm.aarch64.sme.readq.vert.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, i32, i32)
+
+; ------------------------------------------------------------------------------
+; Dummy external function to force code retention.
+; ------------------------------------------------------------------------------
+
+declare void @use(...)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
index ca5399a0503e9..c01c96cc56975 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-b16b16 -verify-machineinstrs -force-streaming < %s | FileCheck %s
;
; Move Multi-Vector From Tile (Read) x2
@@ -7,82 +7,106 @@
; Horizontal
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b, z1.b }, za0h.b[w12, 0:1]
-; CHECK-NEXT: mov { z0.b, z1.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT: mov { z2.b, z3.b }, za0h.b[w12, 14:15]
+; CHECK-NEXT: add z0.b, z0.b, z2.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
%slice.14 = add i32 %slice, 14
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1h.h[w12, 6:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_horiz_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_horiz_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0h.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3h.s[w12, 2:3]
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
@@ -107,82 +131,106 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i
; Vertical
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b, z1.b }, za0v.b[w12, 0:1]
-; CHECK-NEXT: mov { z0.b, z1.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT: mov { z2.b, z3.b }, za0v.b[w12, 14:15]
+; CHECK-NEXT: add z0.b, z0.b, z2.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
%slice.14 = add i32 %slice, 14
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1]
-; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: mov { z2.h, z3.h }, za1v.h[w12, 6:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
%slice.6 = add i32 %slice, 6
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vert_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vert_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.s, z1.s }, za0v.s[w12, 0:1]
-; CHECK-NEXT: mov { z0.s, z1.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: mov { z2.s, z3.s }, za3v.s[w12, 2:3]
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
%slice.2 = add i32 %slice, 2
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
@@ -211,56 +259,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i3
; Horizontal
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_horiz_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b - z3.b }, za0h.b[w12, 0:3]
-; CHECK-NEXT: mov { z0.b - z3.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT: mov { z4.b - z7.b }, za0h.b[w12, 12:15]
+; CHECK-NEXT: add z0.b, z0.b, z4.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
%slice.12 = add i32 %slice, 12
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_horiz_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: add z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_horiz_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_horiz_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1h.h[w12, 4:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
@@ -305,56 +369,72 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
; Vertical
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vert_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.b - z3.b }, za0v.b[w12, 0:3]
-; CHECK-NEXT: mov { z0.b - z3.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT: mov { z4.b - z7.b }, za0v.b[w12, 12:15]
+; CHECK-NEXT: add z0.b, z0.b, z4.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
%slice.12 = add i32 %slice, 12
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vert_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: add z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vert_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: fadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vert_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w12, w0
; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3]
-; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: mov { z4.h - z7.h }, za1v.h[w12, 4:7]
+; CHECK-NEXT: bfadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
%slice.4 = add i32 %slice, 4
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
@@ -399,214 +479,278 @@ define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <v
; Move Multi-Vector From ZA (Read) x2
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x2_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x2_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.b, z0.b, z2.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice.7)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x2_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x2_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice.7)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x2_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice.7)
- ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x2_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: bfadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice.7)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x2_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x2_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice.7)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x2_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice.7)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x2_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
- ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+ %val1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+ %sum = add <vscale x 2 x i64> %val1, %val2
+ ret <vscale x 2 x i64> %sum
}
-define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
-; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: mov { z2.d, z3.d }, za.d[w8, 7, vgx2]
+; CHECK-NEXT: fadd z0.d, z0.d, z2.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
- ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
+ %val1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+ %sum = fadd <vscale x 2 x double> %val1, %val2
+ ret <vscale x 2 x double> %sum
}
; Move Multi-Vector From ZA (Read) x4
-define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x4_b(i32 %slice) {
+define <vscale x 16 x i8> @za_read_vg1x4_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_b:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.b, z0.b, z4.b
; CHECK-NEXT: ret
%res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice.7)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
+ %val1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
+ %val2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2, 0
+ %sum = add <vscale x 16 x i8> %val1, %val2
+ ret <vscale x 16 x i8> %sum
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x4_h(i32 %slice) {
+define <vscale x 8 x i16> @za_read_vg1x4_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_h:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice.7)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
+ %val1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2, 0
+ %sum = add <vscale x 8 x i16> %val1, %val2
+ ret <vscale x 8 x i16> %sum
}
-define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x4_f16(i32 %slice) {
+define <vscale x 8 x half> @za_read_vg1x4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: fadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice.7)
- ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
+ %val1 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2, 0
+ %sum = fadd <vscale x 8 x half> %val1, %val2
+ ret <vscale x 8 x half> %sum
}
-define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x4_bf16(i32 %slice) {
+define <vscale x 8 x bfloat> @za_read_vg1x4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: bfadd z0.h, z0.h, z4.h
; CHECK-NEXT: ret
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice.7)
- ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
+ %val1 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res, 0
+ %val2 = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2, 0
+ %sum = fadd <vscale x 8 x bfloat> %val1, %val2
+ ret <vscale x 8 x bfloat> %sum
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x4_s(i32 %slice) {
+define <vscale x 4 x i32> @za_read_vg1x4_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_s:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.s, z0.s, z4.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice.7)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
+ %val1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2, 0
+ %sum = add <vscale x 4 x i32> %val1, %val2
+ ret <vscale x 4 x i32> %sum
}
-define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x4_f32(i32 %slice) {
+define <vscale x 4 x float> @za_read_vg1x4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: fadd z0.s, z0.s, z4.s
; CHECK-NEXT: ret
%res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice.7)
- ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2
+ %val1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res, 0
+ %val2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2, 0
+ %sum = fadd <vscale x 4 x float> %val1, %val2
+ ret <vscale x 4 x float> %sum
}
-define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
+define <vscale x 2 x i64> @za_read_vg1x4_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_d:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: add z0.d, z0.d, z4.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
- ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
+ %val1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2, 0
+ %sum = add <vscale x 2 x i64> %val1, %val2
+ ret <vscale x 2 x i64> %sum
}
-define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
+define <vscale x 2 x double> @za_read_vg1x4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
-; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: mov { z4.d - z7.d }, za.d[w8, 7, vgx4]
+; CHECK-NEXT: fadd z0.d, z0.d, z4.d
; CHECK-NEXT: ret
%res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
%slice.7 = add i32 %slice, 7
%res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
- ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
+ %val1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res, 0
+ %val2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2, 0
+ %sum = fadd <vscale x 2 x double> %val1, %val2
+ ret <vscale x 2 x double> %sum
}
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)