[llvm] 05edfc5 - [SVE][CodeGen] Add DAG combines for s/zext_masked_gather

Kerry McLaughlin via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 9 03:54:38 PST 2020


Author: Kerry McLaughlin
Date: 2020-12-09T11:53:19Z
New Revision: 05edfc54750bd539f5caa30b0cd4344f68677b00

URL: https://github.com/llvm/llvm-project/commit/05edfc54750bd539f5caa30b0cd4344f68677b00
DIFF: https://github.com/llvm/llvm-project/commit/05edfc54750bd539f5caa30b0cd4344f68677b00.diff

LOG: [SVE][CodeGen] Add DAG combines for s/zext_masked_gather

This patch adds the following DAG combines, which apply when isVectorLoadExtDesirable() returns true:
 - fold (and (masked_gather x)) -> (zext_masked_gather x)
 - fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
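
As a rough scalar analogue of why these two folds are sound (a standalone C++ illustration written for this note, not code from the patch): masking a wide lane with the all-ones constant of the narrow memory type produces the same value as a zero-extending load, and sign-extending in-register produces the same value as a sign-extending load.

  // Standalone illustration; plain scalars stand in for the vector lanes.
  #include <cassert>
  #include <cstdint>

  // (and (masked_gather x), 0xffff) form: gather into a 64-bit lane, then mask.
  uint64_t and_after_gather(uint16_t loaded) {
    uint64_t lane = loaded;
    return lane & 0xFFFFULL;
  }

  // (zext_masked_gather x) form: zero-extending load straight into the lane.
  uint64_t zext_gather(uint16_t loaded) {
    return static_cast<uint64_t>(loaded);
  }

  // (sext_inreg (masked_gather x), i16) form: sign-extend the low 16 bits of
  // the lane in-register, using the usual shift idiom.
  int64_t sext_inreg_after_gather(uint16_t loaded) {
    uint64_t lane = loaded;
    return static_cast<int64_t>(lane << 48) >> 48;
  }

  // (sext_masked_gather x) form: sign-extending load straight into the lane.
  int64_t sext_gather(uint16_t loaded) {
    return static_cast<int64_t>(static_cast<int16_t>(loaded));
  }

  int main() {
    for (uint32_t v = 0; v <= 0xFFFF; ++v) {
      assert(and_after_gather(v) == zext_gather(v));
      assert(sext_inreg_after_gather(v) == sext_gather(v));
    }
    return 0;
  }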

LowerMGATHER has also been updated to fetch the LoadExtType associated with the
gather and to use this value to select the correct masked gather opcode.
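
Concretely, the lowering change boils down to reading the gather's extension type and, when the result needs sign extension, swapping the chosen GLD1* opcode for its GLD1S* counterpart (as the test updates show, ld1b/ld1h/ld1w become ld1sb/ld1sh/ld1sw). Below is a toy, self-contained model of that selection; the enums and helpers are hypothetical stand-ins for the real ISD/AArch64ISD values, kept minimal for illustration.

  // Toy stand-ins for the real enums; only what the sketch needs.
  #include <cassert>

  enum LoadExtType { NON_EXTLOAD, EXTLOAD, SEXTLOAD, ZEXTLOAD };
  enum GatherOpcode { GLD1_MERGE_ZERO, GLD1S_MERGE_ZERO };

  // Analogue of getSignExtendedGatherOpcode: map a zero-extending gather
  // opcode to its sign-extending counterpart.
  GatherOpcode getSignExtendedOpcode(GatherOpcode Op) {
    switch (Op) {
    case GLD1_MERGE_ZERO:
      return GLD1S_MERGE_ZERO;
    default:
      assert(false && "unimplemented opcode");
      return Op;
    }
  }

  // Analogue of the new logic in LowerMGATHER: GLD1* zero-extends by default,
  // so only a sign-extending (or any-extending) gather needs the GLD1S* form.
  GatherOpcode selectGatherOpcode(LoadExtType ExtTy) {
    GatherOpcode Op = GLD1_MERGE_ZERO;
    bool ResNeedsSignExtend = ExtTy == EXTLOAD || ExtTy == SEXTLOAD;
    if (ResNeedsSignExtend)
      Op = getSignExtendedOpcode(Op);
    return Op;
  }

  int main() {
    assert(selectGatherOpcode(ZEXTLOAD) == GLD1_MERGE_ZERO);
    assert(selectGatherOpcode(SEXTLOAD) == GLD1S_MERGE_ZERO);
    return 0;
  }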

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D92230

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
    llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ce4ee89103ce..212e0a2ea988 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -932,6 +932,33 @@ bool DAGCombiner::isOneUseSetCC(SDValue N) const {
   return false;
 }
 
+static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
+  if (!ScalarTy.isSimple())
+    return false;
+
+  uint64_t MaskForTy = 0ULL;
+  switch (ScalarTy.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    MaskForTy = 0xFFULL;
+    break;
+  case MVT::i16:
+    MaskForTy = 0xFFFFULL;
+    break;
+  case MVT::i32:
+    MaskForTy = 0xFFFFFFFFULL;
+    break;
+  default:
+    return false;
+    break;
+  }
+
+  APInt Val;
+  if (ISD::isConstantSplatVector(N, Val))
+    return Val.getLimitedValue() == MaskForTy;
+
+  return false;
+}
+
 // Returns the SDNode if it is a constant float BuildVector
 // or constant float.
 static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
@@ -5622,6 +5649,28 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     }
   }
 
+  // fold (and (masked_gather x)) -> (zext_masked_gather x)
+  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+    EVT MemVT = GN0->getMemoryVT();
+    EVT ScalarVT = MemVT.getScalarType();
+
+    if (SDValue(GN0, 0).hasOneUse() &&
+        isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
+        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+      SDValue Ops[] = {GN0->getChain(),   GN0->getPassThru(), GN0->getMask(),
+                       GN0->getBasePtr(), GN0->getIndex(),    GN0->getScale()};
+
+      SDValue ZExtLoad = DAG.getMaskedGather(
+          DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
+          GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+
+      CombineTo(N, ZExtLoad);
+      AddToWorklist(ZExtLoad.getNode());
+      // Avoid recheck of N.
+      return SDValue(N, 0);
+    }
+  }
+
   // fold (and (load x), 255) -> (zextload x, i8)
   // fold (and (extload x, i16), 255) -> (zextload x, i8)
   // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
@@ -11597,6 +11646,25 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     }
   }
 
+  // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
+  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+    if (SDValue(GN0, 0).hasOneUse() &&
+        ExtVT == GN0->getMemoryVT() &&
+        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+      SDValue Ops[] = {GN0->getChain(),   GN0->getPassThru(), GN0->getMask(),
+                       GN0->getBasePtr(), GN0->getIndex(),    GN0->getScale()};
+
+      SDValue ExtLoad = DAG.getMaskedGather(
+          DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
+          GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
+
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      AddToWorklist(ExtLoad.getNode());
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
+    }
+  }
+
   // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
   if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
     if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 20f5ded99350..5d9c66e170ea 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3836,6 +3836,26 @@ unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
   return AddrModes.find(Key)->second;
 }
 
+unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("unimplemented opcode");
+    return Opcode;
+  case AArch64ISD::GLD1_MERGE_ZERO:
+    return AArch64ISD::GLD1S_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
+  }
+}
+
 bool getGatherScatterIndexIsExtended(SDValue Index) {
   unsigned Opcode = Index.getOpcode();
   if (Opcode == ISD::SIGN_EXTEND_INREG)
@@ -3865,6 +3885,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
   SDValue PassThru = MGT->getPassThru();
   SDValue Mask = MGT->getMask();
   SDValue BasePtr = MGT->getBasePtr();
+  ISD::LoadExtType ExtTy = MGT->getExtensionType();
 
   ISD::MemIndexType IndexType = MGT->getIndexType();
   bool IsScaled =
@@ -3874,6 +3895,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
   bool IdxNeedsExtend =
       getGatherScatterIndexIsExtended(Index) ||
       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+  bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
 
   EVT VT = PassThru.getSimpleValueType();
   EVT MemVT = MGT->getMemoryVT();
@@ -3900,9 +3922,12 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
   if (getGatherScatterIndexIsExtended(Index))
     Index = Index.getOperand(0);
 
+  unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
+  if (ResNeedsSignExtend)
+    Opcode = getSignExtendedGatherOpcode(Opcode);
+
   SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
-  return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
-                     VTs, Ops);
+  return DAG.getNode(Opcode, DL, VTs, Ops);
 }
 
 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
index 32dca0d26cdc..e6b89b0070d6 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
   %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -21,7 +21,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
   %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@@ -72,9 +71,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
   %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -85,9 +82,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
   %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@@ -103,7 +98,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
   %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
@@ -144,9 +138,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
 define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
   %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
index 1fc048a3adf7..2d4ce50e8464 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
   %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -21,7 +21,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -34,7 +33,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -90,9 +88,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
 define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
   %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -103,9 +99,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -117,9 +111,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -136,7 +128,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
 ; CHECK-LABEL: masked_gather_nxv4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
   %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
@@ -148,7 +139,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
   %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
@@ -193,9 +183,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
 define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
   %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
@@ -206,9 +194,7 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
 define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
   %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
index ada49b7fecbc..41f1eb4e94d4 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32>
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
@@ -22,7 +22,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32>
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
@@ -78,9 +77,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
@@ -92,9 +89,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32>
 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
@@ -111,7 +106,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32>
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
   %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
@@ -156,9 +150,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i
 define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
   %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
index 61b8e3e53e23..51ab73c14ac9 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %o
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -22,7 +22,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -36,7 +35,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -97,9 +95,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32
 define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -111,9 +107,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %
 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -126,9 +120,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32>
 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
@@ -146,7 +138,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %o
 ; CHECK-LABEL: masked_gather_nxv4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
   %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -159,7 +150,6 @@ define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -208,9 +198,7 @@ define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32>
 define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
   %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
@@ -222,9 +210,7 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %
 define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
index 197ed69ee52f..15dfcc61316e 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
   %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -17,7 +17,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64>
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
   %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
@@ -68,9 +67,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x
 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
   %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
@@ -81,9 +78,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64>
 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
   %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
index 3f4f54c5d839..3320b88691ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
   %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -17,7 +17,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -30,7 +29,6 @@ define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -86,9 +84,7 @@ define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64
 define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
   %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
@@ -99,9 +95,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %
 define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -113,9 +107,7 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64>
 define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
   %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>

diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
index 962ba079ca9e..076edc1fd86d 100644
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -1,5 +1,46 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
+
+; Test for multiple uses of the mgather where the s/zext should not be combined
+
+define <vscale x 2 x i64> @masked_sgather_sext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
+; CHECK-LABEL: masked_sgather_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb z2.d, p0/m, z0.d
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %data.sext = sext <vscale x 2 x i8> %data to <vscale x 2 x i64>
+  %add = add <vscale x 2 x i8> %data, %vals
+  %add.sext = sext <vscale x 2 x i8> %add to <vscale x 2 x i64>
+  %mul = mul <vscale x 2 x i64> %data.sext, %add.sext
+  ret <vscale x 2 x i64> %mul
+}
+
+define <vscale x 2 x i64> @masked_sgather_zext(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask, <vscale x 2 x i8> %vals) {
+; CHECK-LABEL: masked_sgather_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add z1.d, z0.d, z1.d
+; CHECK-NEXT: and z0.d, z0.d, #0xff
+; CHECK-NEXT: and z1.d, z1.d, #0xff
+; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %data.zext = zext <vscale x 2 x i8> %data to <vscale x 2 x i64>
+  %add = add <vscale x 2 x i8> %data, %vals
+  %add.zext = zext <vscale x 2 x i8> %add to <vscale x 2 x i64>
+  %mul = mul <vscale x 2 x i64> %data.zext, %add.zext
+  ret <vscale x 2 x i64> %mul
+}
 
 ; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
 
@@ -7,7 +48,7 @@
 define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK-DAG: mov x8, xzr
-; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d]
 ; CHECK:     ret
   %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
   ret <vscale x 2 x i32> %data
@@ -41,8 +82,8 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vsca
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
 ; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT:    ld1b { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ld1sb { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x8, z0.d]
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s

