[llvm] e2ed1d1 - [llvm][aarch64] SVE addressing modes.
Francesco Petrogalli via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 21 12:03:24 PST 2020
Author: Francesco Petrogalli
Date: 2020-02-21T20:02:34Z
New Revision: e2ed1d14d6c2d11d1a5df23bd679bcb7e6cbf433
URL: https://github.com/llvm/llvm-project/commit/e2ed1d14d6c2d11d1a5df23bd679bcb7e6cbf433
DIFF: https://github.com/llvm/llvm-project/commit/e2ed1d14d6c2d11d1a5df23bd679bcb7e6cbf433.diff
LOG: [llvm][aarch64] SVE addressing modes.
Summary:
Added register + immediate and register + register addressing modes for the following intrinsics (a short sketch follows this list):
1. Masked loads and stores:
* Sign- and zero-extended loads, and truncating stores.
* Loads and stores with no extension or truncation.
2. Masked non-temporal loads and stores.
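As a quick illustration, here is a minimal sketch (not part of the commit; the function name is made up) of the kind of IR the new reg+imm pattern folds: the constant, vector-length-scaled offset becomes the instruction's "#imm, mul vl" operand instead of a separate address computation. The reg-imm and reg-reg tests added below exercise this exhaustively.

define <vscale x 2 x i64> @regimm_sketch(<vscale x 2 x i64>* %base, <vscale x 2 x i1> %mask) {
  ; Address is %base plus one vector register's worth of i64 elements.
  %addr = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 1
  ; Expected codegen (sketch): ld1d { z0.d }, p0/z, [x0, #1, mul vl]
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %data
}

declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)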
Reviewers: andwar, efriedma
Subscribers: cameron.mcinally, sdesmalen, tschuett, kristof.beyls, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D74254
Added:
llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll
llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
Modified:
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index d906cc6689d5..ee52e2f3f8b8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -221,6 +221,15 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+ /// SVE Reg+Imm addressing mode.
+ template <int64_t Min, int64_t Max>
+ bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ /// SVE Reg+Reg addressing mode.
+ template <unsigned Scale>
+ bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
+ return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
+ }
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -282,6 +291,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+ bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
+ SDValue &Offset);
};
} // end anonymous namespace
@@ -4427,3 +4438,72 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new AArch64DAGToDAGISel(TM, OptLevel);
}
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
+/// where Root is the memory access using N for its address.
+template <int64_t Min, int64_t Max>
+bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
+ SDValue &Base,
+ SDValue &OffImm) {
+ assert(isa<MemSDNode>(Root) && "Invalid node.");
+
+ EVT MemVT = cast<MemSDNode>(Root)->getMemoryVT();
+
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ SDValue VScale = N.getOperand(1);
+ if (VScale.getOpcode() != ISD::VSCALE)
+ return false;
+
+ TypeSize TS = MemVT.getSizeInBits();
+ int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
+ int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
+
+ if ((MulImm % MemWidthBytes) != 0)
+ return false;
+
+ int64_t Offset = MulImm / MemWidthBytes;
+ if (Offset < Min || Offset > Max)
+ return false;
+
+ Base = N.getOperand(0);
+ OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
+ return true;
+}
+
+/// Select register plus register addressing mode for SVE, with scaled
+/// offset.
+bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
+ SDValue &Base,
+ SDValue &Offset) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Process an ADD node.
+ const SDValue LHS = N.getOperand(0);
+ const SDValue RHS = N.getOperand(1);
+
+ // 8-bit data is not scaled, so there is no SHL node to match; it is
+ // treated separately.
+ if (Scale == 0) {
+ Base = LHS;
+ Offset = RHS;
+ return true;
+ }
+
+ // Check if the RHS is a shift node with a constant.
+ if (RHS.getOpcode() != ISD::SHL)
+ return false;
+
+ const SDValue ShiftRHS = RHS.getOperand(1);
+ if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
+ if (C->getZExtValue() == Scale) {
+ Base = LHS;
+ Offset = RHS.getOperand(0);
+ return true;
+ }
+
+ return false;
+}
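To make the selection logic above concrete: for Scale > 0, SelectSVERegRegAddrMode only fires when the offset operand is a SHL by exactly the element-size scale, which is the DAG shape produced by an element-typed GEP. A minimal IR sketch (assumed function name; it mirrors the reg-reg tests added later in this patch):

define <vscale x 2 x i64> @regreg_sketch(i64* %base, <vscale x 2 x i1> %mask, i64 %offset) {
  ; getelementptr on i64 lowers to add(%base, shl(%offset, 3)); the shift
  ; amount 3 matches Scale for 8-byte elements, so the whole address folds
  ; into the [x0, x1, lsl #3] operand of ld1d (sketch, not verified here).
  %gep = getelementptr i64, i64* %base, i64 %offset
  %addr = bitcast i64* %gep to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ret <vscale x 2 x i64> %data
}

declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)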
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index f11234787905..d9c29f136959 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1270,81 +1270,100 @@ let Predicates = [HasSVE] in {
// Add more complex addressing modes here as required
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
- Instruction RegImmInst> {
-
+ Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
+ }
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+ }
def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
(RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous loads
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>;
- defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>;
- defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>;
- defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>;
- defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>;
- defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
- defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>;
- defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>;
- defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>;
- defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>;
+ defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
// 16-element contiguous loads
- defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>;
+ defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
- Instruction RegImmInst> {
+ Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
+ (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
+ }
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+ }
def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous stores
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
- defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>;
- defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>;
- defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous stores
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>;
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>;
- defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
- defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>;
- defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
- defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>;
- defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
- defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
+ defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
// 16-element contiguous stores
- defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
+ defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
- defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>;
- defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>;
- defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>;
- defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>;
+ defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;
- defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>;
- defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>;
- defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>;
- defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>;
+ defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
multiclass unpred_store<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8c02b3a95dfe..b3a455cf9b82 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -7059,3 +7059,11 @@ class sve2_crypto_unary_op<bit opc, string asm>
let Constraints = "$Zdn = $_Zdn";
}
+
+/// Addressing modes
+def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;
+
+def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
+def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
+def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
+def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
new file mode 100644
index 000000000000..4b1323038be2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
@@ -0,0 +1,622 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; Range checks: for all the instructions tested in this file, the
+; immediate must be within the range [-8, 7] (4-bit immediate). Out-of-range
+; values are tested only in the first case below; valid values are tested
+; throughout the rest of the file.
+
+define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: imm_out_of_range:
+; CHECK-NEXT: rdvl x8, #8
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}]
+; CHECK-NEXT: rdvl x8, #-9
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
+ %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i64> undef)
+ %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
+ call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i64>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+; 2-lane contiguous load/stores
+
+define void @test_masked_ldst_sv2i8(<vscale x 2 x i8> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i8:
+; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -8
+ %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i8> undef)
+ %base_store = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8> * %base, i64 -7
+ call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
+ <vscale x 2 x i8>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2i16(<vscale x 2 x i16> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i16:
+; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 -8
+ %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i16> undef)
+ %base_store = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16> * %base, i64 -7
+ call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
+ <vscale x 2 x i16>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+
+define void @test_masked_ldst_sv2i32(<vscale x 2 x i32> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i32:
+; CHECK-NEXT: ld1sw { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -8
+ %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i32> undef)
+ %base_store = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32> * %base, i64 -7
+ call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
+ <vscale x 2 x i32>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i64:
+; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
+ %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i64> undef)
+ %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
+ call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i64>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f16(<vscale x 2 x half> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x half>, <vscale x 2 x half>* %base, i64 -8
+ %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x half> undef)
+ %base_store = getelementptr <vscale x 2 x half>, <vscale x 2 x half> * %base, i64 -7
+ call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
+ <vscale x 2 x half>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+
+define void @test_masked_ldst_sv2f32(<vscale x 2 x float> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f32:
+; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x float>, <vscale x 2 x float>* %base, i64 -8
+ %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x float> undef)
+ %base_store = getelementptr <vscale x 2 x float>, <vscale x 2 x float> * %base, i64 -7
+ call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
+ <vscale x 2 x float>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f64:
+; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl]
+; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, #-5, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
+ %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x double> undef)
+ %base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
+ call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x double>* %base_store,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+; 2-lane zero/sign extended contiguous loads.
+
+define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #-4, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -4
+ %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i8> undef)
+ %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(<vscale x 2 x i8>* %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, #-3, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 -3
+ %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i8> undef)
+ %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 1
+ %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i16> undef)
+ %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(<vscale x 2 x i16>* %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 2
+ %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i16> undef)
+ %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, #-2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -2
+ %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i32> undef)
+ %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(<vscale x 2 x i32>* %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 -1
+ %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i32> undef)
+ %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+; 2-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, <vscale x 2 x i8> *%base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
+; CHECK-NEXT: st1b { z0.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base, i64 3
+ %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
+ call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
+ <vscale x 2 x i8> *%base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+
+define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, <vscale x 2 x i16> *%base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, #4, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base, i64 4
+ %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
+ call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
+ <vscale x 2 x i16> *%base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, <vscale x 2 x i32> *%base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, #5, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base, i64 5
+ %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
+ call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
+ <vscale x 2 x i32> *%base_load,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+; 4-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv4i8(<vscale x 4 x i8> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i8:
+; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -1
+ %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i8> undef)
+ %base_store = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8> * %base, i64 2
+ call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
+ <vscale x 4 x i8>* %base_store,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4i16(<vscale x 4 x i16> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i16:
+; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 -1
+ %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i16> undef)
+ %base_store = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16> * %base, i64 2
+ call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
+ <vscale x 4 x i16>* %base_store,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i32:
+; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
+ %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i32> undef)
+ %base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
+ call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i32>* %base_store,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4f16(<vscale x 4 x half> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4f16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x half>, <vscale x 4 x half>* %base, i64 -1
+ %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x half> undef)
+ %base_store = getelementptr <vscale x 4 x half>, <vscale x 4 x half> * %base, i64 2
+ call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
+ <vscale x 4 x half>* %base_store,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4f32:
+; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
+ %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x float> undef)
+ %base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
+ call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x float>* %base_store,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; 4-lane zero/sign extended contiguous loads.
+
+define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #-4, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -4
+ %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i8> undef)
+ %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(<vscale x 4 x i8>* %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, #-3, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 -3
+ %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i8> undef)
+ %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 1
+ %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i16> undef)
+ %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(<vscale x 4 x i16>* %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 2
+ %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i16> undef)
+ %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+; 4-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, <vscale x 4 x i8> *%base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
+; CHECK-NEXT: st1b { z0.s }, p0, [x0, #3, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base, i64 3
+ %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
+ call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
+ <vscale x 4 x i8> *%base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+
+define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, <vscale x 4 x i16> *%base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
+; CHECK-NEXT: st1h { z0.s }, p0, [x0, #4, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base, i64 4
+ %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
+ call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
+ <vscale x 4 x i16> *%base_load,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; 8-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv8i8(<vscale x 8 x i8> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8i8:
+; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 6
+ %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i8> undef)
+ %base_store = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8> * %base, i64 7
+ call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
+ <vscale x 8 x i8>* %base_store,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8i16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
+ %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_load,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i16> undef)
+ %base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
+ call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
+ <vscale x 8 x i16>* %base_store,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8f16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
+ %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_load,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x half> undef)
+ %base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
+ call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
+ <vscale x 8 x half>* %base_store,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+; 8-lane zero/sign extended contiguous loads.
+
+define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, #-4, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -4
+ %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i8> undef)
+ %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %ext
+}
+
+define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, #-3, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 -3
+ %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_load,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i8> undef)
+ %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %ext
+}
+
+; 8-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, <vscale x 8 x i8> *%base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
+; CHECK-NEXT: st1b { z0.h }, p0, [x0, #3, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base, i64 3
+ %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
+ call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
+ <vscale x 8 x i8> *%base_load,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+; 16-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
+ %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_load,
+ i32 1,
+ <vscale x 16 x i1> %mask,
+ <vscale x 16 x i8> undef)
+ %base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
+ call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
+ <vscale x 16 x i8>* %base_store,
+ i32 1,
+ <vscale x 16 x i1> %mask)
+ ret void
+}
+
+; 2-element contiguous loads.
+declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
+declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+; 4-element contiguous loads.
+declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
+declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
+
+; 8-element contiguous loads.
+declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
+declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+
+; 16-element contiguous loads.
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+
+; 2-element contiguous stores.
+declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)
+
+; 4-element contiguous stores.
+declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
+
+; 8-element contiguous stores.
+declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
+
+; 16-element contiguous stores.
+declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll
new file mode 100644
index 000000000000..63cf1726b0a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll
@@ -0,0 +1,610 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; 2-lane contiguous load/stores
+
+define void @test_masked_ldst_sv2i8(i8 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i8:
+; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
+; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
+ %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i8> undef)
+ call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
+ <vscale x 2 x i8>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2i16(i16 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i16:
+; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
+ %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i16> undef)
+ call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
+ <vscale x 2 x i16>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2i32(i32 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i32:
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_i32 = getelementptr i32, i32* %base, i64 %offset
+ %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
+ %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i32> undef)
+ call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
+ <vscale x 2 x i32>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2i64(i64 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i64:
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %base_i64 = getelementptr i64, i64* %base, i64 %offset
+ %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
+ %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i64> undef)
+ call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i64>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f16(half * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_half = getelementptr half, half* %base, i64 %offset
+ %base_addr = bitcast half* %base_half to <vscale x 2 x half>*
+ %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x half> undef)
+ call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
+ <vscale x 2 x half>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f32(float * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f32:
+; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_float = getelementptr float, float* %base, i64 %offset
+ %base_addr = bitcast float* %base_float to <vscale x 2 x float>*
+ %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x float> undef)
+ call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
+ <vscale x 2 x float>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f64(double * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f64:
+; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %base_double = getelementptr double, double* %base, i64 %offset
+ %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
+ %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x double> undef)
+ call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x double>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+; 2-lane zero/sign extended contiguous loads.
+
+define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
+ %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i8> undef)
+ %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
+ %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i8> undef)
+ %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
+ %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i16> undef)
+ %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
+ %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i16> undef)
+ %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+
+define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_i32 = getelementptr i32, i32* %base, i64 %offset
+ %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
+ %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i32> undef)
+ %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_i32 = getelementptr i32, i32* %base, i64 %offset
+ %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
+ %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i32> undef)
+ %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %ext
+}
+
+; 2-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, i8 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
+; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
+ %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
+ call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
+ <vscale x 2 x i8> *%base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, i16 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
+ %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
+ call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
+ <vscale x 2 x i16> *%base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, i32 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_i32 = getelementptr i32, i32* %base, i64 %offset
+ %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
+ %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
+ call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
+ <vscale x 2 x i32> *%base_addr,
+ i32 1,
+ <vscale x 2 x i1> %mask)
+ ret void
+}
+
+; 4-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv4i8(i8 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i8:
+; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
+; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
+ %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i8> undef)
+ call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
+ <vscale x 4 x i8>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4i16(i16 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i16:
+; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
+ %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i16> undef)
+ call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
+ <vscale x 4 x i16>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4i32(i32 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i32:
+; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_i32 = getelementptr i32, i32* %base, i64 %offset
+ %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
+ %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i32> undef)
+ call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i32>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4f16(half * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4f16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_f16 = getelementptr half, half* %base, i64 %offset
+ %base_addr = bitcast half* %base_f16 to <vscale x 4 x half>*
+ %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x half> undef)
+ call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
+ <vscale x 4 x half>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv4f32(float * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4f32:
+; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_f32 = getelementptr float, float* %base, i64 %offset
+ %base_addr = bitcast float* %base_f32 to <vscale x 4 x float>*
+ %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x float> undef)
+ call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x float>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; 4-lane zero/sign extended contiguous loads.
+
+define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
+ %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i8> undef)
+ %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
+ %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i8> undef)
+ %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
+ %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i16> undef)
+ %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
+ %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i16> undef)
+ %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %ext
+}
+
+; 4-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, i8 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
+; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
+ %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
+ call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
+ <vscale x 4 x i8> *%base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, i16 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
+; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
+ %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
+ call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
+ <vscale x 4 x i16> *%base_addr,
+ i32 1,
+ <vscale x 4 x i1> %mask)
+ ret void
+}
+
+; 8-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv8i8(i8 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8i8:
+; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
+; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
+ %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i8> undef)
+ call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
+ <vscale x 8 x i8>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv8i16(i16 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8i16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
+ %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i16> undef)
+ call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
+ <vscale x 8 x i16>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8f16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_f16 = getelementptr half, half* %base, i64 %offset
+ %base_addr = bitcast half* %base_f16 to <vscale x 8 x half>*
+ %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x half> undef)
+ call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
+ <vscale x 8 x half>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+; 8-lane zero/sign extended contiguous loads.
+
+define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
+ %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i8> undef)
+ %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %ext
+}
+
+define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
+ %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i8> undef)
+ %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %ext
+}
+
+; 8-lane truncating contiguous stores.
+
+define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, i8 *%base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
+; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
+ %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
+ call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
+ <vscale x 8 x i8> *%base_addr,
+ i32 1,
+ <vscale x 8 x i1> %mask)
+ ret void
+}
+
+; 16-lane contiguous load/stores.
+
+define void @test_masked_ldst_sv16i8(i8 * %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
+; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
+ %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_addr,
+ i32 1,
+ <vscale x 16 x i1> %mask,
+ <vscale x 16 x i8> undef)
+ call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
+ <vscale x 16 x i8>* %base_addr,
+ i32 1,
+ <vscale x 16 x i1> %mask)
+ ret void
+}
+
+; 2-element contiguous loads.
+declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
+declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+; 4-element contiguous loads.
+declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
+declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
+
+; 8-element contiguous loads.
+declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
+declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+
+; 16-element contiguous loads.
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+
+; 2-element contiguous stores.
+declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)
+
+; 4-element contiguous stores.
+declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
+
+; 8-element contiguous stores.
+declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
+
+; 16-element contiguous stores.
+declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
new file mode 100644
index 000000000000..fc71c79be69c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
@@ -0,0 +1,171 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; Range checks: for all the instructions tested in this file, the
+; immediate must be within the range [-8, 7] (4-bit immediate). Out-of-range
+; values are tested in only one case (the function that follows). Valid
+; values are tested throughout the rest of the file.
+
+define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: imm_out_of_range:
+; CHECK-NEXT: rdvl x8, #8
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}]
+; CHECK-NEXT: rdvl x8, #-9
+; CHECK-NEXT: add x8, x0, x8
+; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x{{[0-9]+}}]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
+ %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
+ <vscale x 2 x i64>* %base_load)
+ %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
+ call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i64>* %base_store)
+ ret void
+}
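; An illustrative sketch (not taken from the patch; the function name is hypothetical)
; of the two in-range boundary offsets, reusing the ldnt1/stnt1 declarations at the end
; of this file. Both GEP indices lie inside [-8, 7], so they would be expected to fold
; into the reg+imm form ([x0, #7, mul vl] / [x0, #-8, mul vl]) rather than needing an
; rdvl/add sequence as in the out-of-range case above.

define void @imm_at_range_boundaries(<vscale x 2 x i64>* %base, <vscale x 2 x i1> %mask) nounwind {
  ; Load at the upper edge of the immediate range (+7 vector-length multiples).
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 7
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
                                                                  <vscale x 2 x i64>* %base_load)
  ; Store at the lower edge of the immediate range (-8 vector-length multiples).
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
                                            <vscale x 2 x i1> %mask,
                                            <vscale x 2 x i64>* %base_store)
  ret void
}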
+
+; 2-lane non-temporal load/stores
+
+
+define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i64:
+; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
+; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
+ %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
+ <vscale x 2 x i64>* %base_load)
+ %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
+ call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i64>* %base_store)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f64:
+; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl]
+; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-5, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
+ %data = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %mask,
+ <vscale x 2 x double>* %base_load)
+ %base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
+ call void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x double>* %base_store)
+ ret void
+}
+
+; 4-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i32:
+; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
+ %data = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %mask,
+ <vscale x 4 x i32>* %base_load)
+ %base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
+ call void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i32>* %base_store)
+ ret void
+}
+
+define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4f32:
+; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
+ %data = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %mask,
+ <vscale x 4 x float>* %base_load)
+ %base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
+ call void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x float>* %base_store)
+ ret void
+}
+
+
+; 8-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8i16:
+; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
+ %data = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %mask,
+ <vscale x 8 x i16>* %base_load)
+ %base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
+ call void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16> %data,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i16>* %base_store)
+ ret void
+}
+
+define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8f16:
+; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
+ %data = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %mask,
+ <vscale x 8 x half>* %base_load)
+ %base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
+ call void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half> %data,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x half>* %base_store)
+ ret void
+}
+
+; 16-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
+; CHECK-NEXT: ret
+ %base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
+ %data = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %mask,
+ <vscale x 16 x i8>* %base_load)
+ %base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
+ call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %data,
+ <vscale x 16 x i1> %mask,
+ <vscale x 16 x i8>* %base_store)
+ ret void
+}
+
+; 2-element non-temporal loads.
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+; 4-element non-temporal loads.
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+
+; 8-element non-temporal loads.
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+
+; 16-element non-temporal loads.
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+
+; 2-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
+
+; 4-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
+
+; 8-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
+
+; 16-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)
+
diff --git a/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
new file mode 100644
index 000000000000..663f5659ec00
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
@@ -0,0 +1,145 @@
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
+
+; 2-lane non-temporal load/stores
+
+define void @test_masked_ldst_sv2i64(i64* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2i64:
+; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %base_i64 = getelementptr i64, i64* %base, i64 %offset
+ %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
+ %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
+ <vscale x 2 x i64>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x i64>* %base_addr)
+ ret void
+}
+
+define void @test_masked_ldst_sv2f64(double* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv2f64:
+; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %base_double = getelementptr double, double* %base, i64 %offset
+ %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
+ %data = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %mask,
+ <vscale x 2 x double>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %mask,
+ <vscale x 2 x double>* %base_addr)
+ ret void
+}
+
+; 4-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv4i32(i32* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4i32:
+; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_i32 = getelementptr i32, i32* %base, i64 %offset
+ %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
+ %data = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %mask,
+ <vscale x 4 x i32>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x i32>* %base_addr)
+ ret void
+}
+
+define void @test_masked_ldst_sv4f32(float* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv4f32:
+; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
+; CHECK-NEXT: ret
+ %base_float = getelementptr float, float* %base, i64 %offset
+ %base_addr = bitcast float* %base_float to <vscale x 4 x float>*
+ %data = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %mask,
+ <vscale x 4 x float>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %mask,
+ <vscale x 4 x float>* %base_addr)
+ ret void
+}
+
+
+; 8-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv8i16(i16* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8i16:
+; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_i16 = getelementptr i16, i16* %base, i64 %offset
+ %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
+ %data = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %mask,
+ <vscale x 8 x i16>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16> %data,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x i16>* %base_addr)
+ ret void
+}
+
+define void @test_masked_ldst_sv8f16(half* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8f16:
+; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+ %base_half = getelementptr half, half* %base, i64 %offset
+ %base_addr = bitcast half* %base_half to <vscale x 8 x half>*
+ %data = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %mask,
+ <vscale x 8 x half>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half> %data,
+ <vscale x 8 x i1> %mask,
+ <vscale x 8 x half>* %base_addr)
+ ret void
+}
+
+; 16-lane non-temporal load/stores.
+
+define void @test_masked_ldst_sv16i8(i8* %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv16i8:
+; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
+; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %base_i8 = getelementptr i8, i8* %base, i64 %offset
+ %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
+ %data = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %mask,
+ <vscale x 16 x i8>* %base_addr)
+ call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %data,
+ <vscale x 16 x i1> %mask,
+ <vscale x 16 x i8>* %base_addr)
+ ret void
+}
+
+; 2-element non-temporal loads.
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>*)
+
+; 4-element non-temporal loads.
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>*)
+
+; 8-element non-temporal loads.
+declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>*)
+
+; 16-element non-temporal loads.
+declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>*)
+
+; 2-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>*)
+declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x double>*)
+
+; 4-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>*)
+declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x float>*)
+
+; 8-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>*)
+declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, <vscale x 8 x half>*)
+
+; 16-element non-temporal stores.
+declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>*)