[llvm] 170947a - [SVE][CodeGen] Lower scalable masked scatters
Kerry McLaughlin via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 11 03:56:32 PST 2020
Author: Kerry McLaughlin
Date: 2020-11-11T11:50:22Z
New Revision: 170947a5def3a316ac0a11c334af7b3d25f929e1
URL: https://github.com/llvm/llvm-project/commit/170947a5def3a316ac0a11c334af7b3d25f929e1
DIFF: https://github.com/llvm/llvm-project/commit/170947a5def3a316ac0a11c334af7b3d25f929e1.diff
LOG: [SVE][CodeGen] Lower scalable masked scatters
Lowers the llvm.masked.scatter intrinsics (scalar plus vector addressing mode only).
Changes included in this patch:
- Custom lowering for MSCATTER, which chooses the appropriate scatter store opcode to use.
Floating-point scatters are cast to integer, with patterns added to match FP reinterpret_casts.
- Added the getCanonicalIndexType function to convert redundant addressing modes
(e.g. scaling is redundant when accessing bytes).
- Tests with 32 & 64-bit scaled & unscaled offsets.
(An illustrative IR sketch of the scalar-plus-vector form appears below, after the review details.)
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D90941
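
For illustration, a minimal scatter in the scalar-plus-vector form handled by this patch might look like the sketch below; the function and value names are hypothetical, and the complete autogenerated tests are in the diff that follows. With 64-bit indices and 32-bit data, a scatter like this would now select an SVE scatter store such as st1w { z0.d }, p0, [x0, z1.d, lsl #2]:

define void @example_scatter(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask) {
  ; Pointers are formed as a scalar base plus a vector of indices
  ; (the scalar-plus-vector addressing mode this patch lowers).
  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %indices
  call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)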
Added:
llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll
llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll
Modified:
llvm/include/llvm/CodeGen/SelectionDAGNodes.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 34f824366206..eaad25c508ab 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -2391,6 +2391,9 @@ class MaskedGatherScatterSDNode : public MemSDNode {
ISD::MemIndexType getIndexType() const {
return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
}
+ void setIndexType(ISD::MemIndexType IndexType) {
+ LSBaseSDNodeBits.AddressingMode = IndexType;
+ }
bool isIndexScaled() const {
return (getIndexType() == ISD::SIGNED_SCALED) ||
(getIndexType() == ISD::UNSIGNED_SCALED);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index af7984b426ee..1734c36bda6b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4510,6 +4510,10 @@ class TargetLowering : public TargetLoweringBase {
// combiner can fold the new nodes.
SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;
+ /// Give targets the chance to reduce the number of distinct addressing modes.
+ ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType,
+ EVT MemVT, SDValue Offsets) const;
+
private:
SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
const SDLoc &DL, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b4f6d21e44ea..846b84b381c9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1865,6 +1865,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
else
NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
+ N->setIndexType(TLI.getCanonicalIndexType(N->getIndexType(),
+ N->getMemoryVT(), NewOps[OpNo]));
} else {
NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
TruncateStore = true;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2b7da1f60bd0..e8f9a5fddb1b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7356,15 +7356,21 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
return SDValue(E, 0);
}
+ IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]);
auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VTs, VT, MMO, IndexType, IsTrunc);
createOperands(N, Ops);
- assert(N->getMask().getValueType().getVectorNumElements() ==
- N->getValue().getValueType().getVectorNumElements() &&
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValue().getValueType().getVectorElementCount() &&
"Vector width mismatch between mask and data");
- assert(N->getIndex().getValueType().getVectorNumElements() >=
- N->getValue().getValueType().getVectorNumElements() &&
+ assert(
+ N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValue().getValueType().getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValue().getValueType().getVectorElementCount()) &&
"Vector width mismatch between index and data");
assert(isa<ConstantSDNode>(N->getScale()) &&
cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 95dddb64f31d..278d7b988545 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4297,7 +4297,7 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
- IndexType = ISD::SIGNED_SCALED;
+ IndexType = ISD::SIGNED_UNSCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale };
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9d62faf7d956..875a429253ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7439,6 +7439,25 @@ SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
return SDValue();
}
+// Convert redundant addressing modes (e.g. scaling is redundant
+// when accessing bytes).
+ISD::MemIndexType
+TargetLowering::getCanonicalIndexType(ISD::MemIndexType IndexType, EVT MemVT,
+ SDValue Offsets) const {
+ bool IsScaledIndex =
+ (IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::UNSIGNED_SCALED);
+ bool IsSignedIndex =
+ (IndexType == ISD::SIGNED_SCALED) || (IndexType == ISD::SIGNED_UNSCALED);
+
+ // Scaling is unimportant for bytes; canonicalize to unscaled.
+ if (IsScaledIndex && MemVT.getScalarType() == MVT::i8) {
+ IsScaledIndex = false;
+ IndexType = IsSignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
+ }
+
+ return IndexType;
+}
+
SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
unsigned Opcode = Node->getOpcode();
SDValue LHS = Node->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c7cdab26d1b7..887716319f77 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1001,6 +1001,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1052,6 +1053,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);
@@ -1073,6 +1075,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, VT, Custom);
}
+ for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16})
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+
setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
@@ -3705,6 +3710,100 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
}
+unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+ std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::SST1_PRED},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::SST1_UXTW_PRED},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::SST1_PRED},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::SST1_SXTW_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::SST1_SCALED_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::SST1_UXTW_SCALED_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::SST1_SCALED_PRED},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::SST1_SXTW_SCALED_PRED},
+ };
+ auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+ return AddrModes.find(Key)->second;
+}
+
+bool getScatterIndexIsExtended(SDValue Index) {
+ unsigned Opcode = Index.getOpcode();
+ if (Opcode == ISD::SIGN_EXTEND_INREG)
+ return true;
+
+ if (Opcode == ISD::AND) {
+ SDValue Splat = Index.getOperand(1);
+ if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+ return false;
+ ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
+ if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
+ assert(MSC && "Can only custom lower scatter store nodes");
+
+ SDValue Index = MSC->getIndex();
+ SDValue Chain = MSC->getChain();
+ SDValue StoreVal = MSC->getValue();
+ SDValue Mask = MSC->getMask();
+ SDValue BasePtr = MSC->getBasePtr();
+
+ ISD::MemIndexType IndexType = MSC->getIndexType();
+ bool IsScaled =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+ bool IsSigned =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+ bool NeedsExtend =
+ getScatterIndexIsExtended(Index) ||
+ Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+
+ EVT VT = StoreVal.getSimpleValueType();
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ EVT MemVT = MSC->getMemoryVT();
+ SDValue InputVT = DAG.getValueType(MemVT);
+
+ if (VT.getVectorElementType() == MVT::bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ // Handle FP data
+ if (VT.isFloatingPoint()) {
+ VT = VT.changeVectorElementTypeToInteger();
+ ElementCount EC = VT.getVectorElementCount();
+ auto ScalarIntVT =
+ MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
+ StoreVal = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
+ MVT::getVectorVT(ScalarIntVT, EC), StoreVal);
+
+ InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+ }
+
+ if (getScatterIndexIsExtended(Index)) {
+ if (Index.getOpcode() == ISD::AND)
+ IsSigned = false;
+ Index = Index.getOperand(0);
+ }
+
+ SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
+ return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,
+ VTs, Ops);
+}
+
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
@@ -3982,6 +4081,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
+ case ISD::MSCATTER:
+ return LowerMSCATTER(Op, DAG);
case ISD::VECREDUCE_SEQ_FADD:
return LowerVECREDUCE_SEQ_FADD(Op, DAG);
case ISD::VECREDUCE_ADD:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0b3365b7f25d..bfc83a9a34b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -807,6 +807,8 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bd43f92b0d07..bdf5d1d771c7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1191,6 +1191,13 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))),
(UUNPKHI_ZZ_D ZPR:$Zs)>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))),
+ (UUNPKLO_ZZ_D ZPR:$Zs)>;
+ def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))),
+ (UUNPKHI_ZZ_D ZPR:$Zs)>;
+ }
+
def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))),
(UUNPKLO_ZZ_S ZPR:$Zs)>;
def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))),
@@ -1769,6 +1776,16 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i64 (reinterpret_cast (nxv2f64 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv2i64 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv2i64 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv4i32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv4i32 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv2i64 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ def : Pat<(nxv4i32 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+ }
+
def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
(AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll
new file mode 100644
index 000000000000..771f3fe407e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; scaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define void @masked_scatter_nxv2i16_sext(<vscale x 2 x i16> %data, i16* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i32_sext(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i64_sext(<vscale x 2 x i64> %data, i64* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i64_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f16_sext(<vscale x 2 x half> %data, half* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2bf16_sext(<vscale x 2 x bfloat> %data, bfloat* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr bfloat, bfloat* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f32_sext(<vscale x 2 x float> %data, float* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f64_sext(<vscale x 2 x double> %data, double* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f64_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i16_zext(<vscale x 2 x i16> %data, i16* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i32_zext(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i64_zext(<vscale x 2 x i64> %data, i64* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i64_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f16_zext(<vscale x 2 x half> %data, half* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2bf16_zext(<vscale x 2 x bfloat> %data, bfloat* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr bfloat, bfloat* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f32_zext(<vscale x 2 x float> %data, float* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f64_zext(<vscale x 2 x double> %data, double* %base, <vscale x 2 x i32> %indexes, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f64_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 2 x i32> %indexes to <vscale x 2 x i64>
+ %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %ext
+ call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; scaled packed 32-bit offset
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define void @masked_scatter_nxv4i16_sext(<vscale x 4 x i16> %data, i16* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i32_sext(<vscale x 4 x i32> %data, i32* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f16_sext(<vscale x 4 x half> %data, half* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4f16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4bf16_sext(<vscale x 4 x bfloat> %data, bfloat* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4bf16_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr bfloat, bfloat* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f32_sext(<vscale x 4 x float> %data, float* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4f32_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2]
+; CHECK-NEXT: ret
+ %ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i16_zext(<vscale x 4 x i16> %data, i16* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i32_zext(<vscale x 4 x i32> %data, i32* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2]
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f16_zext(<vscale x 4 x half> %data, half* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4f16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4bf16_zext(<vscale x 4 x bfloat> %data, bfloat* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4bf16_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr bfloat, bfloat* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f32_zext(<vscale x 4 x float> %data, float* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4f32_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpkhi z2.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2]
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2]
+; CHECK-NEXT: ret
+ %ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
+ %ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %ext
+ call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
+attributes #0 = { "target-features"="+sve,+bf16" }
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll
new file mode 100644
index 000000000000..d648e699be82
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll
@@ -0,0 +1,577 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define void @masked_scatter_nxv2i8_sext_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i8_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
+ call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i16_sext_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i16_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i32_sext_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i32_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i64_sext_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i64_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+ call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f16_sext_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f16_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+ call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2bf16_sext_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
+ call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f32_sext_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f32_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+ call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f64_sext_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f64_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+ call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i8_zext_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i8_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
+ call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i16_zext_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i16_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i32_zext_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i32_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i64_zext_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i64_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+ call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f16_zext_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f16_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+ call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2bf16_zext_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
+ call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f32_zext_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f32_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+ call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f64_zext_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f64_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+ call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+define void @masked_scatter_nxv4i8_sext_offsets(<vscale x 4 x i8> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i8_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: sunpklo z3.d, z1.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i8*>
+ call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i16_sext_offsets(<vscale x 4 x i16> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i16_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: sunpklo z3.d, z1.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+ call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i32_sext_offsets(<vscale x 4 x i32> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i32_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: sunpklo z3.d, z1.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+ call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f16_sext_offsets(<vscale x 4 x half> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4f16_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: sunpklo z3.d, z1.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+ call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4bf16_sext_offsets(<vscale x 4 x bfloat> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4bf16_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: sunpklo z3.d, z1.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x bfloat*>
+ call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f32_sext_offsets(<vscale x 4 x float> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4f32_sext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: sunpklo z3.d, z1.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+ call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i8_zext_offsets(<vscale x 4 x i8> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i8_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i8*>
+ call void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i16_zext_offsets(<vscale x 4 x i16> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i16_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+ call void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4i32_zext_offsets(<vscale x 4 x i32> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4i32_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+ call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f16_zext_offsets(<vscale x 4 x half> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv4f16_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+ call void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4bf16_zext_offsets(<vscale x 4 x bfloat> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4bf16_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x bfloat*>
+ call void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat> %data, <vscale x 4 x bfloat*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv4f32_zext_offsets(<vscale x 4 x float> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv4f32_zext_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: add z2.d, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+ call void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float*> %ptrs, i32 0, <vscale x 4 x i1> %masks)
+ ret void
+}
+
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
+attributes #0 = { "target-features"="+sve,+bf16" }
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll
new file mode 100644
index 000000000000..67aebaa2060c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; scaled 64-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define void @masked_scatter_nxv2i16(<vscale x 2 x i16> %data, i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, lsl #1]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+ call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_scatter_nxv2i32(<vscale x 2 x i32> %data, i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, lsl #2]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+ call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_scatter_nxv2i64(<vscale x 2 x i64> %data, i64* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, lsl #3]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets
+ call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_scatter_nxv2f16(<vscale x 2 x half> %data, half* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, lsl #1]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets
+ call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_scatter_nxv2f32(<vscale x 2 x float> %data, float* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, lsl #2]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets
+ call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, double* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, lsl #3]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets
+ call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll
new file mode 100644
index 000000000000..bb655a9ea90a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled 64-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define void @masked_scatter_nxv2i8_unscaled_64bit_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i8_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
+ call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i16_unscaled_64bit_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i16_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i32_unscaled_64bit_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i32_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2i64_unscaled_64bit_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i64_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+ call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f16_unscaled_64bit_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f16_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+ call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2bf16_unscaled_64bit_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
+ call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f32_unscaled_64bit_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2f32_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+ call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+define void @masked_scatter_nxv2f64_unscaled_64bit_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2f64_unscaled_64bit_offsets:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z2.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+ call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+ ret void
+}
+
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32*>, i32, <vscale x 4 x i1>)
+declare void @llvm.masked.scatter.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8*>, i32, <vscale x 4 x i1>)
+attributes #0 = { "target-features"="+sve,+bf16" }