[llvm] 2c4e447 - [AArch64][SME] Add load/store intrinsics
Rosie Sumpter via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 14 03:19:02 PDT 2022
Author: Rosie Sumpter
Date: 2022-06-14T11:11:22+01:00
New Revision: 2c4e44752d1dd319e0dedaefac1957d70ecb3c9a
URL: https://github.com/llvm/llvm-project/commit/2c4e44752d1dd319e0dedaefac1957d70ecb3c9a
DIFF: https://github.com/llvm/llvm-project/commit/2c4e44752d1dd319e0dedaefac1957d70ecb3c9a.diff
LOG: [AArch64][SME] Add load/store intrinsics
This patch adds implementations for the load/store SME ACLE intrinsics:
- @llvm.aarch64.sme.ld1*
- @llvm.aarch64.sme.st1*
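Each intrinsic takes a governing predicate, a base pointer, an immediate
tile number and a 32-bit tile-slice index, in that order (see the
declarations added to IntrinsicsAArch64.td below). A minimal IR sketch of
a call, modelled on the new tests; the function name is illustrative:

  declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)

  define void @example(<vscale x 16 x i1> %pg, i32* %ptr, i32 %slice) {
    ; Load one horizontal slice of 32-bit tile ZA3, at slice index %slice.
    call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %slice)
    ret void
  }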
Differential Revision: https://reviews.llvm.org/D127210
Added:
llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
Modified:
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
llvm/lib/Target/AArch64/AArch64RegisterInfo.td
llvm/lib/Target/AArch64/SMEInstrFormats.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1b1cfd428cbc1..077613e686bff 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -245,6 +245,7 @@ def llvm_i8_ty : LLVMType<i8>;
def llvm_i16_ty : LLVMType<i16>;
def llvm_i32_ty : LLVMType<i32>;
def llvm_i64_ty : LLVMType<i64>;
+def llvm_i128_ty : LLVMType<i128>;
def llvm_half_ty : LLVMType<f16>;
def llvm_bfloat_ty : LLVMType<bf16>;
def llvm_float_ty : LLVMType<f32>;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3032254d13f89..6aa976e4e8dfd 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2583,3 +2583,46 @@ def int_aarch64_sve_whilewr_b : SVE2_CONFLICT_DETECT_Intrinsic;
def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic;
def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic;
def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;
+
+// Scalable Matrix Extension (SME) Intrinsics
+let TargetPrefix = "aarch64" in {
+ class SME_Load_Store_B_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_nxv16i1_ty, llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty], []>;
+ class SME_Load_Store_H_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_nxv16i1_ty, LLVMPointerType<llvm_i16_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+ class SME_Load_Store_S_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_nxv16i1_ty, LLVMPointerType<llvm_i32_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+ class SME_Load_Store_D_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_nxv16i1_ty, LLVMPointerType<llvm_i64_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+ class SME_Load_Store_Q_Intrinsic
+ : DefaultAttrsIntrinsic<[],
+ [llvm_nxv16i1_ty, LLVMPointerType<llvm_i128_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+
+ // Loads
+ def int_aarch64_sme_ld1b_horiz : SME_Load_Store_B_Intrinsic;
+ def int_aarch64_sme_ld1h_horiz : SME_Load_Store_H_Intrinsic;
+ def int_aarch64_sme_ld1w_horiz : SME_Load_Store_S_Intrinsic;
+ def int_aarch64_sme_ld1d_horiz : SME_Load_Store_D_Intrinsic;
+ def int_aarch64_sme_ld1q_horiz : SME_Load_Store_Q_Intrinsic;
+ def int_aarch64_sme_ld1b_vert : SME_Load_Store_B_Intrinsic;
+ def int_aarch64_sme_ld1h_vert : SME_Load_Store_H_Intrinsic;
+ def int_aarch64_sme_ld1w_vert : SME_Load_Store_S_Intrinsic;
+ def int_aarch64_sme_ld1d_vert : SME_Load_Store_D_Intrinsic;
+ def int_aarch64_sme_ld1q_vert : SME_Load_Store_Q_Intrinsic;
+
+ // Stores
+ def int_aarch64_sme_st1b_horiz : SME_Load_Store_B_Intrinsic;
+ def int_aarch64_sme_st1h_horiz : SME_Load_Store_H_Intrinsic;
+ def int_aarch64_sme_st1w_horiz : SME_Load_Store_S_Intrinsic;
+ def int_aarch64_sme_st1d_horiz : SME_Load_Store_D_Intrinsic;
+ def int_aarch64_sme_st1q_horiz : SME_Load_Store_Q_Intrinsic;
+ def int_aarch64_sme_st1b_vert : SME_Load_Store_B_Intrinsic;
+ def int_aarch64_sme_st1h_vert : SME_Load_Store_H_Intrinsic;
+ def int_aarch64_sme_st1w_vert : SME_Load_Store_S_Intrinsic;
+ def int_aarch64_sme_st1d_vert : SME_Load_Store_D_Intrinsic;
+ def int_aarch64_sme_st1q_vert : SME_Load_Store_Q_Intrinsic;
+}
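The new llvm_i128_ty in Intrinsics.td exists so that the .q variants can
take a pointer to 128-bit elements; the other element sizes reuse existing
types. For example, mirroring the declarations in the new tests:

  declare void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)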
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index d6d6a18cbdec9..ad97a2b265f52 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -278,6 +278,15 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
return false;
}
+ template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
+ if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t C = CI->getZExtValue();
+ Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
+ return true;
+ }
+ return false;
+ }
+
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
/// between 1 and 4 elements. If it contains a single element that is returned
@@ -321,6 +330,11 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
}
+ template <unsigned Scale>
+ bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
+ return SelectSMETileSlice(N, Scale, Vector, Offset);
+ }
+
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -389,6 +403,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
SDValue &Offset);
+ bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
+ SDValue &Offset);
bool SelectAllActivePredicate(SDValue N);
};
@@ -5224,3 +5240,27 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
return TLI->isAllActivePredicate(*CurDAG, N);
}
+
+bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
+ SDValue &Vector, SDValue &Offset) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Process an ADD node.
+ const SDValue LHS = N.getOperand(0);
+ const SDValue RHS = N.getOperand(1);
+
+ if (auto C = dyn_cast<ConstantSDNode>(RHS)) {
+ int64_t ImmOff = C->getSExtValue();
+ unsigned MaxSize = (1 << Scale) - 1;
+
+ if (ImmOff < 0 || ImmOff > MaxSize)
+ return false;
+
+ Vector = LHS;
+ Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64);
+ return true;
+ }
+
+ return false;
+}
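SelectSMETileSlice splits a slice index of the form (add %idx, imm) into
a base register plus an immediate, provided imm fits in (1 << Scale) - 1
(15 for byte tiles down to 0 for 128-bit tiles, matching the number of
slice offsets addressable beyond the base). A sketch derived from the
ld1b test added below; the function name is illustrative:

  declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)

  define void @fold_example(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
    ; (add %sliceidx, 15) is matched by SelectSMETileSlice<4>, so the 15
    ; becomes the slice offset immediate of the selected ld1b instruction.
    %tileslice = add i32 %sliceidx, 15
    call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
    ret void
  }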
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5d6b4e7e0b192..7e2fbd33de320 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2323,6 +2323,24 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+ MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+ MIB.add(MI.getOperand(1)); // slice index register
+ MIB.add(MI.getOperand(2)); // slice index offset
+ MIB.add(MI.getOperand(3)); // pg
+ MIB.add(MI.getOperand(4)); // base
+ MIB.add(MI.getOperand(5)); // offset
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -2353,6 +2371,26 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_B:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_H:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_S:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_D:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
+ case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_B:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_H:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_S:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_D:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
+ case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
+ return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
}
}
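The pseudos carry the tile as an immediate operand; EmitTileLoad rewrites
that immediate into the corresponding physical tile register by adding it
to the base register of the class (ZAB0, ZAH0, ZAS0, ZAD0 or ZAQ0). As a
sketch (function name illustrative), a tile operand of i64 1 on an ld1h
intrinsic resolves to ZAH1, so the emitted instruction targets za1h.h:

  declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)

  define void @tile_example(<vscale x 16 x i1> %pg, i16* %ptr) {
    ; Tile immediate 1 + base ZAH0 -> physical register ZAH1.
    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
    ret void
  }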
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 444cebbb3b24c..2b8fbbab5517c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -556,6 +556,10 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
+ MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index c036217078c61..b186df8948b21 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -338,6 +338,13 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
markSuperRegs(Reserved, AArch64::W16);
+ // SME tiles are not allocatable.
+ if (MF.getSubtarget<AArch64Subtarget>().hasSME()) {
+ for (MCSubRegIterator SubReg(AArch64::ZA, this, /*self=*/true);
+ SubReg.isValid(); ++SubReg)
+ Reserved.set(*SubReg);
+ }
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 401d2b7202786..e42feea959667 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1212,26 +1212,28 @@ let SubRegIndices = [zasubb] in {
// SME Register Classes
-// Accumulator array
-def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
- let Size = 2048;
-}
+let isAllocatable = 0 in {
+ // Accumulator array
+ def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
+ let Size = 2048;
+ }
-// Accumulator array as single tiles
-def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
- let Size = 2048;
-}
-def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
- let Size = 1024;
-}
-def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
- let Size = 512;
-}
-def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
- let Size = 256;
-}
-def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
- let Size = 128;
+ // Accumulator array as single tiles
+ def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
+ let Size = 2048;
+ }
+ def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
+ let Size = 1024;
+ }
+ def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
+ let Size = 512;
+ }
+ def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
+ let Size = 256;
+ }
+ def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
+ let Size = 128;
+ }
}
// SME Register Operands
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4c78c17dfd1c7..9a1dc17d4486d 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -10,6 +10,18 @@
//
//===----------------------------------------------------------------------===//
+def imm_to_tile8 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAB0>", []>;
+def imm_to_tile16 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAH0>", []>;
+def imm_to_tile32 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAS0>", []>;
+def imm_to_tile64 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAD0>", []>;
+def imm_to_tile128 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAQ0>", []>;
+
+def tileslice8 : ComplexPattern<i32 , 2, "SelectSMETileSlice<4>", []>;
+def tileslice16 : ComplexPattern<i32 , 2, "SelectSMETileSlice<3>", []>;
+def tileslice32 : ComplexPattern<i32 , 2, "SelectSMETileSlice<2>", []>;
+def tileslice64 : ComplexPattern<i32 , 2, "SelectSMETileSlice<1>", []>;
+def tileslice128 : ComplexPattern<i32 , 2, "SelectSMETileSlice<0>", []>; // nop
+
//===----------------------------------------------------------------------===//
// SME Outer Products
//===----------------------------------------------------------------------===//
@@ -233,6 +245,45 @@ multiclass sme_mem_ld_ss_aliases<string inst, bit is_col> {
defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">;
}
+multiclass sme_mem_ld_ss_patterns<Instruction Inst, SDPatternOperator Load,
+ Operand tile_ty, Operand offset_ty,
+ ComplexPattern addr,
+ ComplexPattern tileslice> {
+ // base
+ def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
+ MatrixIndexGPR32Op12_15:$idx),
+ (Inst tile_ty:$tile, $idx, 0, $pg, $base, XZR)>;
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+ tile_ty:$tile, MatrixIndexGPR32Op12_15:$idx),
+ (Inst tile_ty:$tile, $idx, 0, $pg, $base, $offset)>;
+ }
+
+ // base, tileslice
+ let AddedComplexity = 1 in {
+ def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+ (Inst tile_ty:$tile, $idx, $imm, $pg, $base, XZR)>;
+ }
+ // reg + reg, tileslice
+ let AddedComplexity = 2 in {
+ def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+ tile_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+ offset_ty:$imm))),
+ (Inst tile_ty:$tile, $idx, $imm, $pg, $base, $offset)>;
+ }
+}
+
+class sme_load_pseudo
+ : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx,
+ i64imm:$imm, PPR3bAny:$pg, GPR64sp:$base, GPR64:$offset), []>,
+ Sched<[]> {
+ // Translated to the actual instructions in AArch64ISelLowering.cpp
+ let usesCustomInserter = 1;
+ let mayLoad = 1;
+}
+
multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b",
!if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -272,6 +323,40 @@ multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
}
defm : sme_mem_ld_ss_aliases<NAME, is_col>;
+
+ // Pseudo instructions for lowering intrinsics, using immediates instead of
+ // tile registers.
+ def _PSEUDO_B : sme_load_pseudo;
+ def _PSEUDO_H : sme_load_pseudo;
+ def _PSEUDO_S : sme_load_pseudo;
+ def _PSEUDO_D : sme_load_pseudo;
+ def _PSEUDO_Q : sme_load_pseudo;
+
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_B),
+ !if(is_col, int_aarch64_sme_ld1b_vert,
+ int_aarch64_sme_ld1b_horiz),
+ sme_elm_idx0_0, imm0_15, am_sve_regreg_lsl0,
+ tileslice8>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+ !if(is_col, int_aarch64_sme_ld1h_vert,
+ int_aarch64_sme_ld1h_horiz),
+ imm0_1, imm0_7, am_sve_regreg_lsl1,
+ tileslice16>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+ !if(is_col, int_aarch64_sme_ld1w_vert,
+ int_aarch64_sme_ld1w_horiz),
+ imm0_3, imm0_3, am_sve_regreg_lsl2,
+ tileslice32>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+ !if(is_col, int_aarch64_sme_ld1d_vert,
+ int_aarch64_sme_ld1d_horiz),
+ imm0_7, imm0_1, am_sve_regreg_lsl3,
+ tileslice64>;
+ defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+ !if(is_col, int_aarch64_sme_ld1q_vert,
+ int_aarch64_sme_ld1q_horiz),
+ imm0_15, sme_elm_idx0_0, am_sve_regreg_lsl4,
+ tileslice128>;
}
multiclass sme_mem_ld_ss<string mnemonic> {
@@ -318,6 +403,36 @@ multiclass sme_mem_st_ss_aliases<string inst, bit is_col> {
defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>;
}
+multiclass sme_mem_st_ss_patterns<Instruction Inst, SDPatternOperator Store,
+ Operand offset_ty,
+ ComplexPattern imm2tile,
+ ComplexPattern addr,
+ ComplexPattern tileslice> {
+ // base
+ def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
+ MatrixIndexGPR32Op12_15:$idx),
+ (Inst $tile, $idx, 0, $pg, $base, XZR)>;
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+ (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx),
+ (Inst $tile, $idx, 0, $pg, $base, $offset)>;
+ }
+ // base, tileslice
+ let AddedComplexity = 1 in {
+ def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+ (Inst $tile, $idx, $imm, $pg, $base, XZR)>;
+ }
+ // reg + reg, tileslice
+ let AddedComplexity = 2 in {
+ def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+ (imm2tile untyped:$tile),
+ (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+ (Inst $tile, $idx, $imm, $pg, $base, $offset)>;
+ }
+}
+
multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b",
!if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -357,6 +472,32 @@ multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
}
defm : sme_mem_st_ss_aliases<NAME, is_col>;
+
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _B),
+ !if(is_col, int_aarch64_sme_st1b_vert,
+ int_aarch64_sme_st1b_horiz),
+ imm0_15, imm_to_tile8, am_sve_regreg_lsl0,
+ tileslice8>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _H),
+ !if(is_col, int_aarch64_sme_st1h_vert,
+ int_aarch64_sme_st1h_horiz),
+ imm0_7, imm_to_tile16, am_sve_regreg_lsl1,
+ tileslice16>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _S),
+ !if(is_col, int_aarch64_sme_st1w_vert,
+ int_aarch64_sme_st1w_horiz),
+ imm0_3, imm_to_tile32, am_sve_regreg_lsl2,
+ tileslice32>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _D),
+ !if(is_col, int_aarch64_sme_st1d_vert,
+ int_aarch64_sme_st1d_horiz),
+ imm0_1, imm_to_tile64, am_sve_regreg_lsl3,
+ tileslice64>;
+ defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _Q),
+ !if(is_col, int_aarch64_sme_st1q_vert,
+ int_aarch64_sme_st1q_horiz),
+ sme_elm_idx0_0, imm_to_tile128,
+ am_sve_regreg_lsl4, tileslice128>;
}
multiclass sme_mem_st_ss<string mnemonic> {
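Each pseudo gets four patterns (base only, reg+reg, and both again with a
folded tile-slice immediate), with AddedComplexity steering selection
toward the most folded form. The am_sve_regreg_lsl<N> addressing mode
scales the offset register by the element size, so a getelementptr on the
base pointer folds straight into the instruction. A sketch matching the
ld1h_with_addr_offset test below; the function name is illustrative:

  declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)

  define void @addr_example(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index) {
    ; The GEP becomes the reg+reg addressing form [x0, x1, lsl #1].
    %base = getelementptr i16, i16* %ptr, i64 %index
    call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 0)
    ret void
  }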
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 29d0a8e6282ea..2d6fa1ddcb77a 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -8489,6 +8489,7 @@ def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", [
def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>;
def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>;
def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>;
+def am_sve_regreg_lsl4 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<4>", []>;
// Predicated pseudo floating point two operand instructions.
multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
diff --git a/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
new file mode 100644
index 0000000000000..3418d7e8a819f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define void @ld1b(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: ld1b {za0h.b[w12, 15]}, p0/z, [x0]
+; CHECK-NEXT: ld1b {za0v.b[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 15
+ call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 0)
+ ret void;
+}
+
+define void @ld1b_with_addr_offset(<vscale x 16 x i1> %pg, i8* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1b_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: mov w13, w2
+; CHECK-NEXT: ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1]
+; CHECK-NEXT: ld1b {za0v.b[w13, 15]}, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base = getelementptr i8, i8* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 15
+ call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 %tileslice)
+ ret void;
+}
+
+define void @ld1h(<vscale x 16 x i1> %pg, i16* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: ld1h {za0h.h[w12, 7]}, p0/z, [x0]
+; CHECK-NEXT: ld1h {za1h.h[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1h {za0v.h[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1h {za1v.h[w12, 7]}, p0/z, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 7
+ call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 %tileslice)
+ ret void;
+}
+
+define void @ld1h_with_addr_offset(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1h_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w2
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: ld1h {za0h.h[w12, 7]}, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ld1h {za1v.h[w13, 0]}, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base = getelementptr i16, i16* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 7
+ call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %base, i64 1, i32 0)
+ ret void;
+}
+
+define void @ld1w(<vscale x 16 x i1> %pg, i32* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1w:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: mov w13, w1
+; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za1h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za2h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za3h.s[w13, 3]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za0v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za1v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za2v.s[w13, 3]}, p0/z, [x0]
+; CHECK-NEXT: ld1w {za3v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 3
+ call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 0)
+ ret void;
+}
+
+define void @ld1w_with_addr_offset(<vscale x 16 x i1> %pg, i32* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1w_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w2
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: ld1w {za0h.s[w13, 0]}, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ld1w {za3v.s[w12, 3]}, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base = getelementptr i32, i32* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 3
+ call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %base, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %base, i64 3, i32 %tileslice)
+ ret void;
+}
+
+define void @ld1d(<vscale x 16 x i1> %pg, i64* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: ld1d {za0h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za1h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za2h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za3h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za4h.d[w12, 1]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za5h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za6h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za7h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za0v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za1v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za2v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za3v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za4v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za5v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za6v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1d {za7v.d[w12, 1]}, p0/z, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 1
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 %tileslice)
+ ret void;
+}
+
+define void @ld1d_with_addr_offset(<vscale x 16 x i1> %pg, i64* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1d_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w2
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: ld1d {za0h.d[w12, 1]}, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ld1d {za7v.d[w13, 0]}, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %base = getelementptr i64, i64* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 1
+ call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %base, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %base, i64 7, i32 0)
+ ret void;
+}
+
+define void @ld1q(<vscale x 16 x i1> %pg, i128* %ptr) {
+; CHECK-LABEL: ld1q:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: ld1q {za0h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za1h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za2h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za3h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za4h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za5h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za6h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za7h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za8h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za9h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za10h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za11h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za12h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za13h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za14h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za15h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za0v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za1v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za2v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za3v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za4v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za5v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za6v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za7v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za8v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za9v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za10v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za11v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za12v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za13v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za14v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+ ret void;
+}
+
+define void @ld1q_with_addr_offset(<vscale x 16 x i1> %pg, i128* %ptr, i64 %index) {
+; CHECK-LABEL: ld1q_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ld1q {za15v.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %base = getelementptr i128, i128* %ptr, i64 %index
+ call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %base, i64 0, i32 0)
+ call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %base, i64 15, i32 0)
+ ret void;
+}
+
+declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)
+declare void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1>, i128*, i64, i32)
diff --git a/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
new file mode 100644
index 0000000000000..5b8acd21520c0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define void @st1b(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1b:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: st1b {za0h.b[w12, 15]}, p0, [x0]
+; CHECK-NEXT: st1b {za0v.b[w13, 0]}, p0, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 15
+ call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 0)
+ ret void;
+}
+
+define void @st1b_with_addr_offset(<vscale x 16 x i1> %pg, i8* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1b_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: mov w13, w2
+; CHECK-NEXT: st1b {za0h.b[w12, 0]}, p0, [x0, x1]
+; CHECK-NEXT: st1b {za0v.b[w13, 15]}, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base = getelementptr i8, i8* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 15
+ call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 %tileslice)
+ ret void;
+}
+
+define void @st1h(<vscale x 16 x i1> %pg, i16* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0]
+; CHECK-NEXT: st1h {za1h.h[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1h {za0v.h[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1h {za1v.h[w12, 7]}, p0, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 7
+ call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 %tileslice)
+ ret void;
+}
+
+define void @st1h_with_addr_offset(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1h_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w2
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: st1h {za0h.h[w12, 7]}, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h {za1v.h[w13, 0]}, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base = getelementptr i16, i16* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 7
+ call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %base, i64 1, i32 0)
+ ret void;
+}
+
+define void @st1w(<vscale x 16 x i1> %pg, i32* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1w:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: st1w {za0h.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1w {za1h.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1w {za2h.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1w {za3h.s[w12, 3]}, p0, [x0]
+; CHECK-NEXT: st1w {za0v.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1w {za1v.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1w {za2v.s[w12, 3]}, p0, [x0]
+; CHECK-NEXT: st1w {za3v.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 3
+ call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 0)
+ ret void;
+}
+
+define void @st1w_with_addr_offset(<vscale x 16 x i1> %pg, i32* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1w_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: mov w13, w2
+; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: st1w {za3v.s[w13, 3]}, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base = getelementptr i32, i32* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 3
+ call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %base, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %base, i64 3, i32 %tileslice)
+ ret void;
+}
+
+define void @st1d(<vscale x 16 x i1> %pg, i64* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: mov w12, w1
+; CHECK-NEXT: st1d {za0h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za1h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za2h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za3h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za4h.d[w12, 1]}, p0, [x0]
+; CHECK-NEXT: st1d {za5h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za6h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za7h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za0v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za1v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za2v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za3v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za4v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za5v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za6v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT: st1d {za7v.d[w12, 1]}, p0, [x0]
+; CHECK-NEXT: ret
+ %tileslice = add i32 %sliceidx, 1
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 %tileslice)
+ ret void;
+}
+
+define void @st1d_with_addr_offset(<vscale x 16 x i1> %pg, i64* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1d_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, w2
+; CHECK-NEXT: mov w13, wzr
+; CHECK-NEXT: st1d {za0h.d[w12, 1]}, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: st1d {za7v.d[w13, 0]}, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %base = getelementptr i64, i64* %ptr, i64 %index
+ %tileslice = add i32 %sliceidx, 1
+ call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %base, i64 0, i32 %tileslice)
+ call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %base, i64 7, i32 0)
+ ret void;
+}
+
+define void @st1q(<vscale x 16 x i1> %pg, i128* %ptr) {
+; CHECK-LABEL: st1q:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za1h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za2h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za3h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za4h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za5h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za6h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za7h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za8h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za9h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za10h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za11h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za12h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za13h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za14h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za15h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za0v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za1v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za2v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za3v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za4v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za5v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za6v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za7v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za8v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za9v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za10v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za11v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za12v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za13v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za14v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+ ret void;
+}
+
+define void @st1q_with_addr_offset(<vscale x 16 x i1> %pg, i128* %ptr, i64 %index) {
+; CHECK-LABEL: st1q_with_addr_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: st1q {za0h.q[w12, 0]}, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: st1q {za15v.q[w12, 0]}, p0, [x0, x1, lsl #4]
+; CHECK-NEXT: ret
+ %base = getelementptr i128, i128* %ptr, i64 %index
+ call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %base, i64 0, i32 0)
+ call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %base, i64 15, i32 0)
+ ret void;
+}
+
+declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)
+declare void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1>, i128*, i64, i32)