[llvm-branch-commits] [llvm] f6dd32f - [SVE][CodeGen] Lower scalable masked gathers

Kerry McLaughlin via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Dec 7 04:21:35 PST 2020


Author: Kerry McLaughlin
Date: 2020-12-07T12:20:41Z
New Revision: f6dd32fd3584380730a09b042cfbac852f36eb00

URL: https://github.com/llvm/llvm-project/commit/f6dd32fd3584380730a09b042cfbac852f36eb00
DIFF: https://github.com/llvm/llvm-project/commit/f6dd32fd3584380730a09b042cfbac852f36eb00.diff

LOG: [SVE][CodeGen] Lower scalable masked gathers

Lowers the llvm.masked.gather intrinsics (scalar plus vector addressing mode only)

Changes in this patch:
- Add custom lowering for MGATHER, using getGatherVecOpcode() to choose the
  appropriate gather load opcode.
- Improve codegen with refineIndexType/refineUniformBase, added in D90942.
- Tests added for gather loads with 32 & 64-bit scaled & unscaled offsets
  (a representative case is sketched below).
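
As a concrete illustration of the scalar-plus-vector form being handled, the
following is essentially the masked_gather_nxv4i32 case from the added
sve-masked-gather-32b-signed-scaled.ll test; per that test's autogenerated
checks, the gather lowers to a single ld1w with a sign-extended, scaled
32-bit offset:

; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_gather_nxv4i32:
; CHECK:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
  ; Scalar base in x0 plus a vector of signed 32-bit offsets, scaled by the
  ; i32 element size.
  %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i32> %offsets
  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  ret <vscale x 4 x i32> %vals
}

declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)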

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D91092

Added: 
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 552545b854d8..9a0925061105 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1746,6 +1746,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
   SDValue PassThru = MGT->getPassThru();
   SDValue Index = MGT->getIndex();
   SDValue Scale = MGT->getScale();
+  EVT MemoryVT = MGT->getMemoryVT();
   Align Alignment = MGT->getOriginalAlign();
 
   // Split Mask operand
@@ -1759,6 +1760,10 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
       std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
   }
 
+  EVT LoMemVT, HiMemVT;
+  // Split MemoryVT
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
   SDValue PassThruLo, PassThruHi;
   if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
     GetSplitVector(PassThru, PassThruLo, PassThruHi);
@@ -1777,11 +1782,11 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
       MGT->getRanges());
 
   SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
-  Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
+  Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
                            MMO, MGT->getIndexType());
 
   SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
-  Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
+  Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
                            MMO, MGT->getIndexType());
 
   // Build a factor node to remember that this load is independent of the
@@ -2421,11 +2426,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
       MGT->getRanges());
 
   SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
-  SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
+  SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
                                    OpsLo, MMO, MGT->getIndexType());
 
   SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
-  SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
+  SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
                                    OpsHi, MMO, MGT->getIndexType());
 
   // Build a factor node to remember that this load is independent of the

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f6e131838a16..dd837d4d495f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7310,17 +7310,22 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
     return SDValue(E, 0);
   }
 
+  IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]);
   auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
                                           VTs, VT, MMO, IndexType);
   createOperands(N, Ops);
 
   assert(N->getPassThru().getValueType() == N->getValueType(0) &&
          "Incompatible type of the PassThru value in MaskedGatherSDNode");
-  assert(N->getMask().getValueType().getVectorNumElements() ==
-             N->getValueType(0).getVectorNumElements() &&
+  assert(N->getMask().getValueType().getVectorElementCount() ==
+             N->getValueType(0).getVectorElementCount() &&
          "Vector width mismatch between mask and data");
-  assert(N->getIndex().getValueType().getVectorNumElements() >=
-             N->getValueType(0).getVectorNumElements() &&
+  assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+             N->getValueType(0).getVectorElementCount().isScalable() &&
+         "Scalable flags of index and data do not match");
+  assert(ElementCount::isKnownGE(
+             N->getIndex().getValueType().getVectorElementCount(),
+             N->getValueType(0).getVectorElementCount()) &&
          "Vector width mismatch between index and data");
   assert(isa<ConstantSDNode>(N->getScale()) &&
          cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 397a69654933..f3bce354624b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4416,7 +4416,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
     Index = getValue(Ptr);
-    IndexType = ISD::SIGNED_SCALED;
+    IndexType = ISD::SIGNED_UNSCALED;
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
   SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 353991691968..d729252c92d9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -113,6 +113,16 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                                   "optimization"),
                          cl::init(true));
 
+// Temporary option added for the purpose of testing functionality added
+// to DAGCombiner.cpp in D92230. It is expected that this can be removed
+// in future when both implementations will be based off MGATHER rather
+// than the GLD1 nodes added for the SVE gather load intrinsics.
+static cl::opt<bool>
+EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
+                                cl::desc("Combine extends of AArch64 masked "
+                                         "gather intrinsics"),
+                                cl::init(true));
+
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
@@ -1059,6 +1069,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
@@ -1111,6 +1122,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                     MVT::nxv4f32, MVT::nxv2f64}) {
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+      setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
@@ -3775,6 +3787,29 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   return ExtVal.getValueType().isScalableVector();
 }
 
+unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+       AArch64ISD::GLD1_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+       AArch64ISD::GLD1_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
+  };
+  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+  return AddrModes.find(Key)->second;
+}
+
 unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
@@ -3798,7 +3833,7 @@ unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
   return AddrModes.find(Key)->second;
 }
 
-bool getScatterIndexIsExtended(SDValue Index) {
+bool getGatherScatterIndexIsExtended(SDValue Index) {
   unsigned Opcode = Index.getOpcode();
   if (Opcode == ISD::SIGN_EXTEND_INREG)
     return true;
@@ -3816,6 +3851,54 @@ bool getScatterIndexIsExtended(SDValue Index) {
   return false;
 }
 
+SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
+  assert(MGT && "Can only custom lower gather load nodes");
+
+  SDValue Index = MGT->getIndex();
+  SDValue Chain = MGT->getChain();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue Mask = MGT->getMask();
+  SDValue BasePtr = MGT->getBasePtr();
+
+  ISD::MemIndexType IndexType = MGT->getIndexType();
+  bool IsScaled =
+      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+  bool IsSigned =
+      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+  bool IdxNeedsExtend =
+      getGatherScatterIndexIsExtended(Index) ||
+      Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+
+  EVT VT = PassThru.getSimpleValueType();
+  EVT MemVT = MGT->getMemoryVT();
+  SDValue InputVT = DAG.getValueType(MemVT);
+
+  if (VT.getVectorElementType() == MVT::bf16 &&
+      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+    return SDValue();
+
+  // Handle FP data
+  if (VT.isFloatingPoint()) {
+    VT = VT.changeVectorElementTypeToInteger();
+    ElementCount EC = VT.getVectorElementCount();
+    auto ScalarIntVT =
+        MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
+    PassThru = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
+                           MVT::getVectorVT(ScalarIntVT, EC), PassThru);
+
+    InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+  }
+
+  SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
+
+  SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
+  return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
+                     VTs, Ops);
+}
+
 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc DL(Op);
@@ -3834,7 +3917,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
   bool IsSigned =
       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
   bool NeedsExtend =
-      getScatterIndexIsExtended(Index) ||
+      getGatherScatterIndexIsExtended(Index) ||
       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
 
   EVT VT = StoreVal.getSimpleValueType();
@@ -3858,7 +3941,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
   }
 
-  if (getScatterIndexIsExtended(Index))
+  if (getGatherScatterIndexIsExtended(Index))
     Index = Index.getOperand(0);
 
   SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
@@ -4159,6 +4242,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
+  case ISD::MGATHER:
+    return LowerMGATHER(Op, DAG);
   case ISD::MSCATTER:
     return LowerMSCATTER(Op, DAG);
   case ISD::VECREDUCE_SEQ_FADD:
@@ -12019,6 +12104,9 @@ static SDValue performSVEAndCombine(SDNode *N,
     return DAG.getNode(Opc, DL, N->getValueType(0), And);
   }
 
+  if (!EnableCombineMGatherIntrinsics)
+    return SDValue();
+
   SDValue Mask = N->getOperand(1);
 
   if (!Src.hasOneUse())
@@ -14982,6 +15070,9 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
   }
 
+  if (!EnableCombineMGatherIntrinsics)
+    return SDValue();
+
   // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
   // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
   unsigned NewOpc;

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 5c5b9c885809..334517ad992b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -805,6 +805,7 @@ class AArch64TargetLowering : public TargetLowering {
 
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
new file mode 100644
index 000000000000..747468ae3cf4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -0,0 +1,181 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    sxth z0.d, p1/m, z0.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, half* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
new file mode 100644
index 000000000000..b214fcf15911
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    and z0.d, z0.d, #0xff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    sxtb z0.d, p1/m, z0.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    sxth z0.d, p1/m, z0.d
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    and z0.s, z0.s, #0xff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    sunpkhi z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z2.d, z1.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    sunpkhi z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z2.d, z1.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1w { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    sunpkhi z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z2.d, z1.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    sunpkhi z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z2.d, z1.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1w { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    sunpklo z2.d, z0.s
+; CHECK-NEXT:    sunpkhi z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z2.d, z1.d, z2.d
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p2/z, [x8, z0.d]
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
new file mode 100644
index 000000000000..d938567beb04
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -0,0 +1,223 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
new file mode 100644
index 000000000000..7a47311484f8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    and z0.d, z0.d, #0xff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    and z0.s, z0.s, #0xff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1w { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1w { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    uunpkhi z2.d, z0.s
+; CHECK-NEXT:    uunpklo z0.d, z0.s
+; CHECK-NEXT:    pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    add z1.d, z1.d, z2.d
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1h { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

diff  --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
new file mode 100644
index 000000000000..197ed69ee52f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

diff  --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
new file mode 100644
index 000000000000..be8909201a83
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xff
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.d, x0
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

diff  --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
new file mode 100644
index 000000000000..962ba079ca9e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
+
+; Code-generate a load of an illegal data type via promotion.
+define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK-DAG: mov x8, xzr
+; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK:     ret
+  %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  ret <vscale x 2 x i32> %data
+}
+
+; Code-generate the worst-case scenario, where all vector types are illegal.
+define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i32> %indices, <vscale x 32 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv32i32:
+; CHECK-NOT: unpkhi
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z0.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z1.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z2.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z3.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z4.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z5.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z6.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z7.s, sxtw #2]
+; CHECK: ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 32 x i32> %indices
+  %data = call <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*> %ptrs, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i32> undef)
+  ret <vscale x 32 x i32> %data
+}
+
+; TODO: Currently the sign extend gets applied to the values after a 'uzp1' of two
+; registers, so it doesn't get folded away. The same applies to any other
+; vector-of-pointers style gather that doesn't fit in a single <vscale x 2 x type*>
+; register. Better folding is required before we can check those off.
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK:         pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1b { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %svals = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %svals
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+
+declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)

More information about the llvm-branch-commits mailing list