[llvm] 7e20c3a - [Aarch64][SVE] Add intrinsics for scatter stores
Andrzej Warzynski via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 16 03:53:18 PST 2019
Author: Andrzej Warzynski
Date: 2019-12-16T11:52:53Z
New Revision: 7e20c3a71d5fc3763f6d0b85312c866837a92a06
URL: https://github.com/llvm/llvm-project/commit/7e20c3a71d5fc3763f6d0b85312c866837a92a06
DIFF: https://github.com/llvm/llvm-project/commit/7e20c3a71d5fc3763f6d0b85312c866837a92a06.diff
LOG: [Aarch64][SVE] Add intrinsics for scatter stores
Summary:
This patch adds the following SVE intrinsics for scatter stores (a brief
usage sketch follows the list):
* 64-bit offsets:
  * @llvm.aarch64.sve.st1.scatter (unscaled)
  * @llvm.aarch64.sve.st1.scatter.index (scaled)
* 32-bit unscaled offsets:
  * @llvm.aarch64.sve.st1.scatter.uxtw (zero-extended offset)
  * @llvm.aarch64.sve.st1.scatter.sxtw (sign-extended offset)
* 32-bit scaled offsets:
  * @llvm.aarch64.sve.st1.scatter.uxtw.index (zero-extended offset)
  * @llvm.aarch64.sve.st1.scatter.sxtw.index (sign-extended offset)
* vector base + immediate:
  * @llvm.aarch64.sve.st1.scatter.imm
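
As a usage sketch (not part of the committed patch), the 64-bit unscaled form
can be called directly from LLVM IR. The function name below is illustrative;
the intrinsic declaration matches the tests added by this patch, and with
-mattr=+sve it is expected to lower to an "st1w { z0.d }, p0, [x0, z1.d]"-style
scatter, as exercised by the new tests:

  declare void @llvm.aarch64.sve.st1.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)

  ; Store the active 32-bit lanes of %data to %base plus per-lane byte offsets.
  define void @example_scatter_store(<vscale x 2 x i32> %data, <vscale x 2 x i1> %pg,
                                     i32* %base, <vscale x 2 x i64> %offsets) {
    call void @llvm.aarch64.sve.st1.scatter.nxv2i32(<vscale x 2 x i32> %data,
                                                    <vscale x 2 x i1> %pg,
                                                    i32* %base,
                                                    <vscale x 2 x i64> %offsets)
    ret void
  }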
Reviewers: rengolin, efriedma, sdesmalen
Reviewed By: efriedma, sdesmalen
Subscribers: kmclaughlin, eli.friedman, tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71074
Added:
llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAArch64.td
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrFormats.td
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
llvm/lib/Target/AArch64/SVEInstrFormats.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 0d54197fac3e..65a8daaffa7c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1064,6 +1064,35 @@ class AdvSIMD_1VectorArg_Imm_Intrinsic
llvm_i32_ty],
[IntrNoMem, ImmArg<1>]>;
+class AdvSIMD_ScatterStore_64bitOffset_Intrinsic
+ : Intrinsic<[],
+ [
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMPointerToElt<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i64_ty>
+ ],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
+class AdvSIMD_ScatterStore_32bitOffset_Intrinsic
+ : Intrinsic<[],
+ [
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ LLVMPointerToElt<0>,
+ LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>
+ ],
+ [IntrWriteMem, IntrArgMemOnly]>;
+
+class AdvSIMD_ScatterStore_VectorBase_Intrinsic
+ : Intrinsic<[],
+ [
+ llvm_anyvector_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+ llvm_anyvector_ty, llvm_i64_ty
+ ],
+ [IntrWriteMem, IntrArgMemOnly, ImmArg<3>]>;
+
//
// Loads
//
@@ -1406,6 +1435,36 @@ def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_32bitOffset_Intri
// vector base + immediate index
def int_aarch64_sve_ld1_gather_imm : AdvSIMD_GatherLoad_VecTorBase_Intrinsic;
+//
+// Scatter stores:
+//
+
+// scalar + vector, 64 bit unscaled offsets
+def int_aarch64_sve_st1_scatter : AdvSIMD_ScatterStore_64bitOffset_Intrinsic;
+
+// scalar + vector, 64 bit scaled offsets
+def int_aarch64_sve_st1_scatter_index
+ : AdvSIMD_ScatterStore_64bitOffset_Intrinsic;
+
+// scalar + vector, 32 bit unscaled offsets, sign (sxtw) or zero (uxtw)
+// extended to 64 bits
+def int_aarch64_sve_st1_scatter_sxtw
+ : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;
+
+def int_aarch64_sve_st1_scatter_uxtw
+ : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;
+
+// scalar + vector, 32 bit scaled offsets, sign (sxtw) or zero (uxtw) extended
+// to 64 bits
+def int_aarch64_sve_st1_scatter_sxtw_index
+ : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;
+
+def int_aarch64_sve_st1_scatter_uxtw_index
+ : AdvSIMD_ScatterStore_32bitOffset_Intrinsic;
+
+// vector base + immediate index
+def int_aarch64_sve_st1_scatter_imm : AdvSIMD_ScatterStore_VectorBase_Intrinsic;
+
//
// SVE2 - Non-widening pairwise arithmetic
//
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8350677f2cf9..6ea1e603f9ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1357,6 +1357,13 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM";
+ case AArch64ISD::SST1: return "AArch64ISD::SST1";
+ case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
+ case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
+ case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
+ case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
+ case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
+ case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
}
return nullptr;
}
@@ -12080,6 +12087,75 @@ static MVT getSVEContainerType(EVT ContentTy) {
}
}
+static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
+ const SDValue Src = N->getOperand(2);
+ const EVT SrcVT = Src->getValueType(0);
+ assert(SrcVT.isScalableVector() &&
+ "Scatter stores are only possible for SVE vectors");
+
+ SDLoc DL(N);
+ MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
+
+ // Make sure that source data will fit into an SVE register
+ if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // For FPs, ACLE only supports _packed_ single and double precision types.
+ if (SrcElVT.isFloatingPoint())
+ if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
+ return SDValue();
+
+ // Depending on the addressing mode, this is either a pointer or a vector of
+ // pointers (that fits into one register)
+ const SDValue Base = N->getOperand(4);
+ // Depending on the addressing mode, this is either a single offset or a
+ // vector of offsets (that fits into one register)
+ SDValue Offset = N->getOperand(5);
+
+ auto &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(Base.getValueType()))
+ return SDValue();
+
+ // Some scatter store variants allow unpacked offsets, but only as nxv2i32
+ // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
+ // nxv2i64. Legalize accordingly.
+ if (!OnlyPackedOffsets &&
+ Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
+
+ if (!TLI.isTypeLegal(Offset.getValueType()))
+ return SDValue();
+
+ // Source value type that is representable in hardware
+ EVT HwSrcVt = getSVEContainerType(SrcVT);
+
+ // Keep the original type of the input data to store - this is needed to
+ // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the
+ // integer equivalent, so just use HwSrcVt.
+ SDValue InputVT = DAG.getValueType(SrcVT);
+ if (SrcVT.isFloatingPoint())
+ InputVT = DAG.getValueType(HwSrcVt);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue SrcNew;
+
+ if (Src.getValueType().isFloatingPoint())
+ SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
+ else
+ SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
+
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(3), // Pg
+ Base,
+ Offset,
+ InputVT};
+
+ return DAG.getNode(Opcode, DL, VTs, Ops);
+}
+
static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode) {
EVT RetVT = N->getValueType(0);
@@ -12300,6 +12376,24 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED);
case Intrinsic::aarch64_sve_ld1_gather_imm:
return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
+ case Intrinsic::aarch64_sve_st1_scatter:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1);
+ case Intrinsic::aarch64_sve_st1_scatter_index:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED);
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_imm:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM);
default:
break;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 83473fd47978..449c0d376b79 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -223,6 +223,14 @@ enum NodeType : unsigned {
GLD1S_UXTW_SCALED,
GLD1S_SXTW_SCALED,
GLD1S_IMM,
+ // Scatter store
+ SST1,
+ SST1_SCALED,
+ SST1_UXTW,
+ SST1_SXTW,
+ SST1_UXTW_SCALED,
+ SST1_SXTW_SCALED,
+ SST1_IMM,
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index ed1ac25bddd7..222365fd7872 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -393,6 +393,27 @@ def uimm5s8 : Operand<i64>, ImmLeaf<i64,
let PrintMethod = "printImmScale<8>";
}
+// tuimm5sN predicate - similar to uimm5sN, but use TImmLeaf (TargetConstant)
+// instead of ImmLeaf (Constant)
+def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+ UImmS2XForm> {
+ let ParserMatchClass = UImm5s2Operand;
+ let PrintMethod = "printImmScale<2>";
+}
+def tuimm5s4 : Operand<i64>, TImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+ UImmS4XForm> {
+ let ParserMatchClass = UImm5s4Operand;
+ let PrintMethod = "printImmScale<4>";
+}
+def tuimm5s8 : Operand<i64>, TImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+ UImmS8XForm> {
+ let ParserMatchClass = UImm5s8Operand;
+ let PrintMethod = "printImmScale<8>";
+}
+
// uimm6sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 64 * N].
def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>;
@@ -750,6 +771,14 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_31Operand;
}
+// timm0_31 predicate - same as imm0_31, but use TargetConstant (TImmLeaf)
+// instead of Constant (ImmLeaf)
+def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 32;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
// True if the 32-bit immediate is in the range [0,31]
def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 32;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 8dfea2f451d1..98d8761c2eb4 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -20,6 +20,24 @@ def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
+def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+
def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -584,51 +602,55 @@ let Predicates = [HasSVE] in {
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
- // Scatters using unscaled 32-bit offsets, e.g.
- // st1h z0.s, p0, [x0, z0.s, uxtw]
- // and unpacked:
+ // Scatters using unpacked, unscaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw]
- defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
-
- // Scatters using scaled 32-bit offsets, e.g.
+ defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+
+ // Scatters using packed, unscaled 32-bit offsets, e.g.
+ // st1h z0.s, p0, [x0, z0.s, uxtw]
+ defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+
+ // Scatters using packed, scaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw #1]
- // and unpacked:
+ defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+
+ // Scatters using unpacked, scaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw #1]
- defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
- defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
+ defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>;
+ defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>;
+
+ // Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.d, p0, [z0.d, #16]
- defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
- defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
- defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
- defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
- defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
- defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
- defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;
+ defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>;
+ defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
- defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
- defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
- defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
- defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;
+ defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>;
+ defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>;
+ defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
+ defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;
// Scatters using scaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, lsl #1]
- defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
- defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
- defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;
+ defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
// ST(2|3|4) structured stores (register + immediate)
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 27e0d0f611d8..bdc63fc6b4e3 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4564,32 +4564,84 @@ class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
let mayStore = 1;
}
-multiclass sve_mem_sst_sv_32_scaled<bits<3> opc, string asm,
- RegisterOperand listty,
- ZPRRegOp zprty,
+multiclass sve_mem_32b_sst_sv_32_scaled<bits<3> opc, string asm,
+ SDPatternOperator sxtw_op,
+ SDPatternOperator uxtw_op,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd > {
- def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, listty, uxtw_opnd>;
- def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, listty, sxtw_opnd>;
+ RegisterOperand uxtw_opnd,
+ ValueType vt > {
+ def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, Z_s, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, Z_s, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+
+ def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
-multiclass sve_mem_sst_sv_32_unscaled<bits<3> opc, string asm,
- RegisterOperand listty,
- ZPRRegOp zprty,
- RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
- def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, listty, uxtw_opnd>;
- def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, listty, sxtw_opnd>;
+multiclass sve_mem_64b_sst_sv_32_scaled<bits<3> opc, string asm,
+ SDPatternOperator sxtw_op,
+ SDPatternOperator uxtw_op,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd,
+ ValueType vt > {
+ def _UXTW_SCALED : sve_mem_sst_sv<opc, 0, 1, asm, Z_d, uxtw_opnd>;
+ def _SXTW_SCALED : sve_mem_sst_sv<opc, 1, 1, asm, Z_d, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+
+ def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+}
+
+multiclass sve_mem_64b_sst_sv_32_unscaled<bits<3> opc, string asm,
+ SDPatternOperator sxtw_op,
+ SDPatternOperator uxtw_op,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd,
+ ValueType vt> {
+ def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, Z_d, uxtw_opnd>;
+ def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, Z_d, sxtw_opnd>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ (!cast<Instruction>(NAME # _UXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ (!cast<Instruction>(NAME # _SXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+
+ def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+}
+
+multiclass sve_mem_32b_sst_sv_32_unscaled<bits<3> opc, string asm,
+ SDPatternOperator sxtw_op,
+ SDPatternOperator uxtw_op,
+ RegisterOperand sxtw_opnd,
+ RegisterOperand uxtw_opnd,
+ ValueType vt> {
+ def _UXTW : sve_mem_sst_sv<opc, 0, 0, asm, Z_s, uxtw_opnd>;
+ def _SXTW : sve_mem_sst_sv<opc, 1, 0, asm, Z_s, sxtw_opnd>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _UXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
+ (!cast<Instruction>(NAME # _SXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+
+ def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
@@ -4616,19 +4668,28 @@ class sve_mem_sst_sv2<bits<2> msz, bit scaled, string asm,
}
multiclass sve_mem_sst_sv_64_scaled<bits<2> msz, string asm,
- RegisterOperand zprext> {
- def "" : sve_mem_sst_sv2<msz, 1, asm, zprext>;
+ SDPatternOperator op,
+ RegisterOperand zprext,
+ ValueType vt> {
+ def _SCALED_REAL : sve_mem_sst_sv2<msz, 1, asm, zprext>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+ (!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+ def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt),
+ (!cast<Instruction>(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
-multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm> {
- def "" : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
+multiclass sve_mem_sst_sv_64_unscaled<bits<2> msz, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve_mem_sst_sv2<msz, 0, asm, ZPR64ExtLSL8>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Rn, $Zm]",
- (!cast<Instruction>(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+
+ def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
@@ -4654,16 +4715,38 @@ class sve_mem_sst_vi<bits<3> opc, string asm, ZPRRegOp zprty,
let mayStore = 1;
}
-multiclass sve_mem_sst_vi_ptrs<bits<3> opc, string asm, RegisterOperand listty,
- ZPRRegOp zprty, Operand imm_ty> {
- def _IMM : sve_mem_sst_vi<opc, asm, zprty, listty, imm_ty>;
+multiclass sve_mem_32b_sst_vi_ptrs<bits<3> opc, string asm,
+ Operand imm_ty,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _IMM : sve_mem_sst_vi<opc, asm, ZPR32, Z_s, imm_ty>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>;
+ (!cast<Instruction>(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
- (!cast<Instruction>(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>;
+ (!cast<Instruction>(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>;
+ (!cast<Instruction>(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+
+ def : Pat<(op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt),
+ (!cast<Instruction>(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+}
+
+multiclass sve_mem_64b_sst_vi_ptrs<bits<3> opc, string asm,
+ Operand imm_ty,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _IMM : sve_mem_sst_vi<opc, asm, ZPR64, Z_d, imm_ty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $imm5]",
+ (!cast<Instruction>(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+
+ def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt),
+ (!cast<Instruction>(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
class sve_mem_z_spill<string asm>
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll
new file mode 100644
index 000000000000..380a158b767f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll
@@ -0,0 +1,193 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; ST1H, ST1W, ST1D: base + 32-bit scaled offset, sign (sxtw) or zero
+; (uxtw) extended to 64 bits.
+; e.g. st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
+;
+
+; ST1H
+define void @sst1h_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %indices) {
+; CHECK-LABEL: sst1h_s_uxtw:
+; CHECK: st1h { z0.s }, p0, [x0, z1.s, uxtw #1]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @sst1h_s_sxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %indices) {
+; CHECK-LABEL: sst1h_s_sxtw:
+; CHECK: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @sst1h_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1h_d_uxtw:
+; CHECK: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+define void @sst1h_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1h_d_sxtw:
+; CHECK: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+; ST1W
+define void @sst1w_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %indices) {
+; CHECK-LABEL: sst1w_s_uxtw:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @sst1w_s_sxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %indices) {
+; CHECK-LABEL: sst1w_s_sxtw:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @sst1w_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1w_d_uxtw:
+; CHECK: st1w { z0.d }, p0, [x0, z1.d, uxtw #2]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+define void @sst1w_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1w_d_sxtw:
+; CHECK: st1w { z0.d }, p0, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+define void @sst1w_s_uxtw_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %indices) {
+; CHECK-LABEL: sst1w_s_uxtw_float:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+define void @sst1w_s_sxtw_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %indices) {
+; CHECK-LABEL: sst1w_s_sxtw_float:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %indices)
+ ret void
+}
+
+; ST1D
+define void @sst1d_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1d_d_uxtw:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+define void @sst1d_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1d_d_sxtw:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+define void @sst1d_d_uxtw_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1d_d_uxtw_double:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+define void @sst1d_d_sxtw_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %indices) {
+; CHECK-LABEL: sst1d_d_sxtw_double:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %indices)
+ ret void
+}
+
+
+; ST1H
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+
+; ST1W
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+
+; ST1D
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll
new file mode 100644
index 000000000000..52f988e8abc1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll
@@ -0,0 +1,248 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; ST1B, ST1W, ST1H, ST1D: base + 32-bit unscaled offset, sign (sxtw) or zero
+; (uxtw) extended to 64 bits.
+; e.g. st1h { z0.d }, p0, [x0, z1.d, uxtw]
+;
+
+; ST1B
+define void @sst1b_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1b_s_uxtw:
+; CHECK: st1b { z0.s }, p0, [x0, z1.s, uxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1b_s_sxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i8* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1b_s_sxtw:
+; CHECK: st1b { z0.s }, p0, [x0, z1.s, sxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i8* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1b_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1b_d_uxtw:
+; CHECK: st1b { z0.d }, p0, [x0, z1.d, uxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i8(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1b_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1b_d_sxtw:
+; CHECK: st1b { z0.d }, p0, [x0, z1.d, sxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i8(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+; ST1H
+define void @sst1h_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1h_s_uxtw:
+; CHECK: st1h { z0.s }, p0, [x0, z1.s, uxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1h_s_sxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1h_s_sxtw:
+; CHECK: st1h { z0.s }, p0, [x0, z1.s, sxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i16(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ i16* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1h_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1h_d_uxtw:
+; CHECK: st1h { z0.d }, p0, [x0, z1.d, uxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1h_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1h_d_sxtw:
+; CHECK: st1h { z0.d }, p0, [x0, z1.d, sxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+; ST1W
+define void @sst1w_s_uxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1w_s_uxtw:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1w_s_sxtw(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i32* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1w_s_sxtw:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ i32* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1w_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1w_d_uxtw:
+; CHECK: st1w { z0.d }, p0, [x0, z1.d, uxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1w_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1w_d_sxtw:
+; CHECK: st1w { z0.d }, p0, [x0, z1.d, sxtw]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1w_s_uxtw_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1w_s_uxtw_float:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+define void @sst1w_s_sxtw_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, float* %base, <vscale x 4 x i32> %offsets) {
+; CHECK-LABEL: sst1w_s_sxtw_float:
+; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4f32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ float* %base,
+ <vscale x 4 x i32> %offsets)
+ ret void
+}
+
+; ST1D
+define void @sst1d_d_uxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1d_d_uxtw:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1d_d_sxtw(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1d_d_sxtw:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1d_d_uxtw_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1d_d_uxtw_double:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+define void @sst1d_d_sxtw_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %offsets) {
+; CHECK-LABEL: sst1d_d_sxtw_double:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i32> %offsets)
+ ret void
+}
+
+
+; ST1B
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
+
+; ST1H
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+
+; ST1W
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+
+; ST1D
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+
+declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll
new file mode 100644
index 000000000000..ca81fe14e13a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll
@@ -0,0 +1,58 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; ST1H, ST1W, ST1D: base + 64-bit scaled offset
+; e.g. st1h { z0.d }, p0, [x0, z0.d, lsl #1]
+;
+
+define void @sst1h_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sst1h_index
+; CHECK: st1h { z0.d }, p0, [x0, z1.d, lsl #1]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @sst1w_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sst1w_index
+; CHECK: st1w { z0.d }, p0, [x0, z1.d, lsl #2]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @sst1d_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sst1d_index
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+define void @sst1d_index_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sst1d_index_double
+; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %offsets)
+ ret void
+}
+
+
+declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll
new file mode 100644
index 000000000000..00e72a5d470e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; ST1B, ST1W, ST1H, ST1D: base + 64-bit unscaled offset
+; e.g. st1h { z0.d }, p0, [x0, z1.d]
+;
+
+define void @sst1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sst1b_d:
+; CHECK: st1b { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i8(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sst1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sst1h_d:
+; CHECK: st1h { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sst1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sst1w_d:
+; CHECK: st1w { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i32(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sst1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sst1d_d:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+define void @sst1d_d_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sst1d_d_double:
+; CHECK: st1d { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.nxv2f64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %b)
+ ret void
+}
+
+declare void @llvm.aarch64.sve.st1.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.st1.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll
new file mode 100644
index 000000000000..66a0489cbeb1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll
@@ -0,0 +1,133 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;
+; ST1B, ST1W, ST1H, ST1D: vector + immediate (index)
+; e.g. st1h { z0.s }, p0, [z1.s, #16]
+;
+
+; ST1B
+define void @sst1b_s_imm(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: sst1b_s_imm:
+; CHECK: st1b { z0.s }, p0, [z1.s, #16]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @sst1b_d_imm(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: sst1b_d_imm:
+; CHECK: st1b { z0.d }, p0, [z1.d, #16]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i8.nxv2i64(<vscale x 2 x i8> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret void
+}
+
+; ST1H
+define void @sst1h_s_imm(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: sst1h_s_imm:
+; CHECK: st1h { z0.s }, p0, [z1.s, #16]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i16.nxv4i32(<vscale x 4 x i16> %data_trunc,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @sst1h_d_imm(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: sst1h_d_imm:
+; CHECK: st1h { z0.d }, p0, [z1.d, #16]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i16.nxv2i64(<vscale x 2 x i16> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret void
+}
+
+; ST1W
+define void @sst1w_s_imm(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: sst1w_s_imm:
+; CHECK: st1w { z0.s }, p0, [z1.s, #16]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i32.nxv4i32(<vscale x 4 x i32> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+define void @sst1w_d_imm(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: sst1w_d_imm:
+; CHECK: st1w { z0.d }, p0, [z1.d, #16]
+; CHECK-NEXT: ret
+ %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i32.nxv2i64(<vscale x 2 x i32> %data_trunc,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret void
+}
+
+define void @sst1w_s_imm_float(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base) {
+; CHECK-LABEL: sst1w_s_imm_float:
+; CHECK: st1w { z0.s }, p0, [z1.s, #16]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv4f32.nxv4i32(<vscale x 4 x float> %data,
+ <vscale x 4 x i1> %pg,
+ <vscale x 4 x i32> %base,
+ i64 16)
+ ret void
+}
+
+; ST1D
+define void @sst1d_d_imm(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: sst1d_d_imm:
+; CHECK: st1d { z0.d }, p0, [z1.d, #16]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64(<vscale x 2 x i64> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret void
+}
+
+define void @sst1d_d_imm_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base) {
+; CHECK-LABEL: sst1d_d_imm_double:
+; CHECK: st1d { z0.d }, p0, [z1.d, #16]
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sve.st1.scatter.imm.nxv2f64.nxv2i64(<vscale x 2 x double> %data,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %base,
+ i64 16)
+ ret void
+}
+
+; ST1B
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i8.nxv4i32(<vscale x 4 x i8>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i8.nxv2i64(<vscale x 2 x i8>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; ST1H
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i16.nxv4i32(<vscale x 4 x i16>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i16.nxv2i64(<vscale x 2 x i16>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; ST1W
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i32.nxv2i64(<vscale x 2 x i32>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4f32.nxv4i32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; ST1D
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)