[llvm-branch-commits] [llvm] f6dd32f - [SVE][CodeGen] Lower scalable masked gathers
Kerry McLaughlin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 7 04:21:35 PST 2020
Author: Kerry McLaughlin
Date: 2020-12-07T12:20:41Z
New Revision: f6dd32fd3584380730a09b042cfbac852f36eb00
URL: https://github.com/llvm/llvm-project/commit/f6dd32fd3584380730a09b042cfbac852f36eb00
DIFF: https://github.com/llvm/llvm-project/commit/f6dd32fd3584380730a09b042cfbac852f36eb00.diff
LOG: [SVE][CodeGen] Lower scalable masked gathers
Lowers the llvm.masked.gather intrinsics (scalar plus vector addressing mode only).
Changes in this patch:
- Add custom lowering for MGATHER, using getGatherVecOpcode() to choose the
  appropriate gather load opcode (see the sketch after this list).
- Improve codegen with refineIndexType/refineUniformBase, added in D90942.
- Add tests for gather loads with 32- and 64-bit scaled and unscaled offsets.
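As a rough illustration of the lowering described above: the (IsScaled, IsSigned,
NeedsExtend) triple derived from the gather's index type selects one of the SVE
GLD1 nodes. The sketch below mirrors the AddrModes table in getGatherVecOpcode()
further down in this patch; it assumes the AArch64ISD node names from
AArch64ISelLowering.h and is illustrative only, not part of the change itself.

  // Simplified, illustrative equivalent of getGatherVecOpcode().
  static unsigned selectGatherOpcode(bool IsScaled, bool IsSigned,
                                     bool NeedsExtend) {
    if (!NeedsExtend)
      // 64-bit offsets: scaled or unscaled; the sign of the offset is irrelevant.
      return IsScaled ? AArch64ISD::GLD1_SCALED_MERGE_ZERO
                      : AArch64ISD::GLD1_MERGE_ZERO;
    if (IsSigned)
      // 32-bit offsets, sign-extended during address calculation (sxtw forms).
      return IsScaled ? AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO
                      : AArch64ISD::GLD1_SXTW_MERGE_ZERO;
    // 32-bit offsets, zero-extended during address calculation (uxtw forms).
    return IsScaled ? AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO
                    : AArch64ISD::GLD1_UXTW_MERGE_ZERO;
  }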
Reviewed By: sdesmalen
Differential Revision: https://reviews.llvm.org/D91092
Added:
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
Modified:
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 552545b854d8..9a0925061105 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1746,6 +1746,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue PassThru = MGT->getPassThru();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
+ EVT MemoryVT = MGT->getMemoryVT();
Align Alignment = MGT->getOriginalAlign();
// Split Mask operand
@@ -1759,6 +1760,10 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}
+ EVT LoMemVT, HiMemVT;
+ // Split MemoryVT
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
@@ -1777,11 +1782,11 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
MGT->getRanges());
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
- Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
+ Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
- Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
+ Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
@@ -2421,11 +2426,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
MGT->getRanges());
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
- SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
+ SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
OpsLo, MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
- SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
+ SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
OpsHi, MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f6e131838a16..dd837d4d495f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7310,17 +7310,22 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
return SDValue(E, 0);
}
+ IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]);
auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VTs, VT, MMO, IndexType);
createOperands(N, Ops);
assert(N->getPassThru().getValueType() == N->getValueType(0) &&
"Incompatible type of the PassThru value in MaskedGatherSDNode");
- assert(N->getMask().getValueType().getVectorNumElements() ==
- N->getValueType(0).getVectorNumElements() &&
+ assert(N->getMask().getValueType().getVectorElementCount() ==
+ N->getValueType(0).getVectorElementCount() &&
"Vector width mismatch between mask and data");
- assert(N->getIndex().getValueType().getVectorNumElements() >=
- N->getValueType(0).getVectorNumElements() &&
+ assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+ N->getValueType(0).getVectorElementCount().isScalable() &&
+ "Scalable flags of index and data do not match");
+ assert(ElementCount::isKnownGE(
+ N->getIndex().getValueType().getVectorElementCount(),
+ N->getValueType(0).getVectorElementCount()) &&
"Vector width mismatch between index and data");
assert(isa<ConstantSDNode>(N->getScale()) &&
cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
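The reworked asserts above compare vector widths via ElementCount, so they stay
valid for scalable vectors. A minimal standalone sketch of the comparison
semantics, assuming only llvm/Support/TypeSize.h (illustrative, not part of the
patch):

  #include "llvm/Support/TypeSize.h"
  #include <cassert>
  using namespace llvm;

  int main() {
    // <vscale x 4 x iN> index vs. <vscale x 2 x iM> data: 4 >= 2 holds for
    // every value of vscale, so isKnownGE returns true.
    ElementCount IdxEC = ElementCount::getScalable(4);
    ElementCount DataEC = ElementCount::getScalable(2);
    assert(ElementCount::isKnownGE(IdxEC, DataEC));

    // A fixed count and a scalable count are not comparable in general, which
    // is why the new assert also requires the scalable flags to match.
    ElementCount FixedEC = ElementCount::getFixed(4);
    assert(IdxEC.isScalable() != FixedEC.isScalable());
    return 0;
  }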
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 397a69654933..f3bce354624b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4416,7 +4416,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
- IndexType = ISD::SIGNED_SCALED;
+ IndexType = ISD::SIGNED_UNSCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 353991691968..d729252c92d9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -113,6 +113,16 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
"optimization"),
cl::init(true));
+// Temporary option added for the purpose of testing functionality added
+// to DAGCombiner.cpp in D92230. It is expected that this can be removed
+// in future when both implementations will be based off MGATHER rather
+// than the GLD1 nodes added for the SVE gather load intrinsics.
+static cl::opt<bool>
+EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
+ cl::desc("Combine extends of AArch64 masked "
+ "gather intrinsics"),
+ cl::init(true));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
@@ -1059,6 +1069,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
@@ -1111,6 +1122,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -3775,6 +3787,29 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();
}
+unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+ std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::GLD1_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::GLD1_UXTW_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::GLD1_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::GLD1_SXTW_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+ AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+ {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+ AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
+ };
+ auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+ return AddrModes.find(Key)->second;
+}
+
unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
@@ -3798,7 +3833,7 @@ unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
return AddrModes.find(Key)->second;
}
-bool getScatterIndexIsExtended(SDValue Index) {
+bool getGatherScatterIndexIsExtended(SDValue Index) {
unsigned Opcode = Index.getOpcode();
if (Opcode == ISD::SIGN_EXTEND_INREG)
return true;
@@ -3816,6 +3851,54 @@ bool getScatterIndexIsExtended(SDValue Index) {
return false;
}
+SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
+ assert(MGT && "Can only custom lower gather load nodes");
+
+ SDValue Index = MGT->getIndex();
+ SDValue Chain = MGT->getChain();
+ SDValue PassThru = MGT->getPassThru();
+ SDValue Mask = MGT->getMask();
+ SDValue BasePtr = MGT->getBasePtr();
+
+ ISD::MemIndexType IndexType = MGT->getIndexType();
+ bool IsScaled =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+ bool IsSigned =
+ IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+ bool IdxNeedsExtend =
+ getGatherScatterIndexIsExtended(Index) ||
+ Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+
+ EVT VT = PassThru.getSimpleValueType();
+ EVT MemVT = MGT->getMemoryVT();
+ SDValue InputVT = DAG.getValueType(MemVT);
+
+ if (VT.getVectorElementType() == MVT::bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ // Handle FP data
+ if (VT.isFloatingPoint()) {
+ VT = VT.changeVectorElementTypeToInteger();
+ ElementCount EC = VT.getVectorElementCount();
+ auto ScalarIntVT =
+ MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
+ PassThru = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
+ MVT::getVectorVT(ScalarIntVT, EC), PassThru);
+
+ InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+ }
+
+ SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
+
+ SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
+ return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
+ VTs, Ops);
+}
+
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
@@ -3834,7 +3917,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
bool IsSigned =
IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
bool NeedsExtend =
- getScatterIndexIsExtended(Index) ||
+ getGatherScatterIndexIsExtended(Index) ||
Index.getSimpleValueType().getVectorElementType() == MVT::i32;
EVT VT = StoreVal.getSimpleValueType();
@@ -3858,7 +3941,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
}
- if (getScatterIndexIsExtended(Index))
+ if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);
SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
@@ -4159,6 +4242,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:
return LowerSTORE(Op, DAG);
+ case ISD::MGATHER:
+ return LowerMGATHER(Op, DAG);
case ISD::MSCATTER:
return LowerMSCATTER(Op, DAG);
case ISD::VECREDUCE_SEQ_FADD:
@@ -12019,6 +12104,9 @@ static SDValue performSVEAndCombine(SDNode *N,
return DAG.getNode(Opc, DL, N->getValueType(0), And);
}
+ if (!EnableCombineMGatherIntrinsics)
+ return SDValue();
+
SDValue Mask = N->getOperand(1);
if (!Src.hasOneUse())
@@ -14982,6 +15070,9 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
}
+ if (!EnableCombineMGatherIntrinsics)
+ return SDValue();
+
// SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 5c5b9c885809..334517ad992b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -805,6 +805,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
new file mode 100644
index 000000000000..747468ae3cf4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+ ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr half, half* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+ ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr float, float* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
new file mode 100644
index 000000000000..b214fcf15911
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+ %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+ %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+ ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+ %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+ %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: sxtb z0.d, p1/m, z0.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: sunpklo z2.d, z0.s
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z2.d, z1.d, z2.d
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: sunpklo z2.d, z0.s
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z2.d, z1.d, z2.d
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+ %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: sunpklo z2.d, z0.s
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z2.d, z1.d, z2.d
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+ %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+ ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: sunpklo z2.d, z0.s
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z2.d, z1.d, z2.d
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+ %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.sext
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: sunpklo z2.d, z0.s
+; CHECK-NEXT: sunpkhi z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z2.d, z1.d, z2.d
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
new file mode 100644
index 000000000000..d938567beb04
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -0,0 +1,223 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+ ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+ ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
new file mode 100644
index 000000000000..7a47311484f8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+ %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+ %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+ ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+ %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+ %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, z2.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, z2.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+ %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+ ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, z2.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+ %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+ ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, z2.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+ %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+ ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: uunpklo z1.d, z0.s
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.sext
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z1.d, z1.d, z2.d
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: sxth z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+ %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+ %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+ %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
new file mode 100644
index 000000000000..197ed69ee52f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+ ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: ret
+ %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets
+ %vals.sext = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
new file mode 100644
index 000000000000..be8909201a83
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xffff
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+ %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+ ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+ %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+ ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+ %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+ ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+ %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+ ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+ %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+ %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+ %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z1.d, x0
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT: ret
+ %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+ %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+ %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
new file mode 100644
index 000000000000..962ba079ca9e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
+
+; Code generate load of an illegal datatype via promotion.
+define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK-DAG: mov x8, xzr
+; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK: ret
+ %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+ ret <vscale x 2 x i32> %data
+}
+
+; Code generate the worst case scenario when all vector types are illegal.
+define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i32> %indices, <vscale x 32 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv32i32:
+; CHECK-NOT: unpkhi
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z0.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z1.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z2.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z3.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z4.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z5.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z6.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z7.s, sxtw #2]
+; CHECK: ret
+ %ptrs = getelementptr i32, i32* %base, <vscale x 32 x i32> %indices
+ %data = call <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*> %ptrs, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i32> undef)
+ ret <vscale x 32 x i32> %data
+}
+
+; TODO: Currently, the sign extend gets applied to the values after a 'uzp1' of two
+; registers, so it doesn't get folded away. Same for any other vector-of-pointers
+; style gathers which don't fit in an <vscale x 2 x type*> single register. Better folding
+; is required before we can check those off.
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK: pfalse p1.b
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ld1b { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT: ret
+ %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+ %svals = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %svals
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+
+declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)