[llvm] 404da13 - [AArch64][SVE] Gather loads: pass 32 bit unpacked offsets as nxv2i32

Thu Jan 2 05:01:37 PST 2020

Author: Andrzej Warzynski
Date: 2020-01-02T13:01:28Z
New Revision: 404da13e1e94ac092b2010566f95dbd4b126a500

URL: https://github.com/llvm/llvm-project/commit/404da13e1e94ac092b2010566f95dbd4b126a500
DIFF: https://github.com/llvm/llvm-project/commit/404da13e1e94ac092b2010566f95dbd4b126a500.diff

LOG: [AArch64][SVE]  Gather loads: pass 32 bit unpacked offsets as nxv2i32

Summary:
Currently 32 bit unpacked offsets are passed as nxv2i64. However, as
pointed out in https://reviews.llvm.org/D71074, using nxv2i32 instead
would improve consistency with:
  * how other arguments are treated
  * how scatter stores are implemented
This patch makes sure that 32 bit unpacked offsets are passes as nxv2i32
instead of nxv2i64.

Reviewers: sdesmalen, efriedma

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71724

Added: 
    

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
    llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 25689614524a..623f8f3d2055 100644

--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1114,7 +1114,8 @@ class AdvSIMD_GatherLoad_32bitOffset_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [
                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
-                  LLVMPointerToElt<0>, llvm_anyvector_ty
+                  LLVMPointerToElt<0>,
+                  LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>
                 ],
                 [IntrReadMem, IntrArgMemOnly]>;
 

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c4eb388ccff1..5b181622e2c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12231,18 +12231,14 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
 }
 
 static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
-                                       unsigned Opcode) {
+                                       unsigned Opcode,
+                                       bool OnlyPackedOffsets = true) {
   EVT RetVT = N->getValueType(0);
   assert(RetVT.isScalableVector() &&
          "Gather loads are only possible for SVE vectors");
-
   SDLoc DL(N);
-  MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();
-  unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits();
 
-  EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements);
-  if (RetVT.getSizeInBits().getKnownMinSize() >
-      MaxVT.getSizeInBits().getKnownMinSize())
+  if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
     return SDValue();
 
   // Depending on the addressing mode, this is either a pointer or a vector of
@@ -12250,12 +12246,19 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
   const SDValue Base = N->getOperand(3);
   // Depending on the addressing mode, this is either a single offset or a
   // vector of offsets  (that fits into one register)
-  const SDValue Offset = N->getOperand(4);
+  SDValue Offset = N->getOperand(4);
 
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) ||
-      !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType()))
+  auto &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(Base.getValueType()))
     return SDValue();
 
+  // Some gather load variants allow unpacked offsets, but only as nxv2i32
+  // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
+  // nxv2i64. Legalize accordingly.
+  if (!OnlyPackedOffsets &&
+      Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
+    Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
+
   // Return value type that is representable in hardware
   EVT HwRetVt = getSVEContainerType(RetVT);
 
@@ -12439,13 +12442,17 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     case Intrinsic::aarch64_sve_ld1_gather_index:
       return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
     case Intrinsic::aarch64_sve_ld1_gather_sxtw:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
+                                      /*OnlyPackedOffsets=*/false);
     case Intrinsic::aarch64_sve_ld1_gather_uxtw:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
+                                      /*OnlyPackedOffsets=*/false);
     case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
+                                      /*OnlyPackedOffsets=*/false);
     case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
-      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED);
+      return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
+                                      /*OnlyPackedOffsets=*/false);
     case Intrinsic::aarch64_sve_ld1_gather_imm:
       return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
     case Intrinsic::aarch64_sve_st1_scatter:

diff  --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
index c76874d66eb2..db593413f7af 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll
@@ -11,9 +11,9 @@ define <vscale x 4 x i32> @gld1h_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base,
 ; CHECK-LABEL: gld1h_s_uxtw_index:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
   %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
@@ -22,31 +22,31 @@ define <vscale x 4 x i32> @gld1h_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base,
 ; CHECK-LABEL: gld1h_s_sxtw_index:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
   %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_uxtw_index:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_sxtw_index:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
@@ -56,9 +56,9 @@ define <vscale x 4 x i32> @gld1w_s_uxtw_index(<vscale x 4 x i1> %pg, i32* %base,
 ; CHECK-LABEL: gld1w_s_uxtw_index:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %load
 }
 
@@ -66,30 +66,30 @@ define <vscale x 4 x i32> @gld1w_s_sxtw_index(<vscale x 4 x i1> %pg, i32* %base,
 ; CHECK-LABEL: gld1w_s_sxtw_index:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %load
 }
 
-define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_uxtw_index:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_sxtw_index:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
@@ -98,9 +98,9 @@ define <vscale x 4 x float> @gld1w_s_uxtw_index_float(<vscale x 4 x i1> %pg, flo
 ; CHECK-LABEL: gld1w_s_uxtw_index_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                            float* %base,
-                                                                                            <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                                    float* %base,
+                                                                                    <vscale x 4 x i32> %b)
   ret <vscale x 4 x float> %load
 }
 
@@ -108,50 +108,50 @@ define <vscale x 4 x float> @gld1w_s_sxtw_index_float(<vscale x 4 x i1> %pg, flo
 ; CHECK-LABEL: gld1w_s_sxtw_index_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                            float* %base,
-                                                                                            <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                                    float* %base,
+                                                                                    <vscale x 4 x i32> %b)
   ret <vscale x 4 x float> %load
 }
 
 ; LD1D
-define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_s_uxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_s_uxtw_index:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i64* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                  i64* %base,
+                                                                                  <vscale x 2 x i32> %b)
   ret <vscale x 2 x i64> %load
 }
 
-define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_sxtw_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_sxtw_index:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i64* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                  i64* %base,
+                                                                                  <vscale x 2 x i32> %b)
   ret <vscale x 2 x i64> %load
 }
 
-define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_uxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_uxtw_index_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                             double* %base,
-                                                                                             <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                     double* %base,
+                                                                                     <vscale x 2 x i32> %b)
   ret <vscale x 2 x double> %load
 }
 
-define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_sxtw_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_sxtw_index_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                             double* %base,
-                                                                                             <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                     double* %base,
+                                                                                     <vscale x 2 x i32> %b)
   ret <vscale x 2 x double> %load
 }
 
@@ -166,9 +166,9 @@ define <vscale x 4 x i32> @gld1sh_s_uxtw_index(<vscale x 4 x i1> %pg, i16* %base
 ; CHECK-LABEL: gld1sh_s_uxtw_index:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
   %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
@@ -177,79 +177,79 @@ define <vscale x 4 x i32> @gld1sh_s_sxtw_index(<vscale x 4 x i1> %pg, i16* %base
 ; CHECK-LABEL: gld1sh_s_sxtw_index:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 4 x i32> %b)
   %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_uxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_uxtw_index:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_sxtw_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_sxtw_index:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i16* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                                  i16* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
 ; LD1SW
-define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_uxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_uxtw_index:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_sxtw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_sxtw_index:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                          i32* %base,
-                                                                                          <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                                  i32* %base,
+                                                                                  <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
 
 ; LD1H/LD1SH
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
 
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
 
 ; LD1W/LD1SW
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
 
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
 
-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
 
 ; LD1D
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
 
-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)

diff  --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
index 24bfa3dcd43e..ba8806986d69 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll
@@ -11,9 +11,9 @@ define <vscale x 4 x i32> @gld1b_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vscal
 ; CHECK-LABEL: gld1b_s_uxtw:
 ; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
   %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
@@ -22,31 +22,31 @@ define <vscale x 4 x i32> @gld1b_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vscal
 ; CHECK-LABEL: gld1b_s_sxtw:
 ; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
   %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1b_d_uxtw:
 ; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1b_d_sxtw:
 ; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
@@ -56,9 +56,9 @@ define <vscale x 4 x i32> @gld1h_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vsca
 ; CHECK-LABEL: gld1h_s_uxtw:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
   %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
@@ -67,31 +67,31 @@ define <vscale x 4 x i32> @gld1h_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vsca
 ; CHECK-LABEL: gld1h_s_sxtw:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
   %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_uxtw:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1h_d_sxtw:
 ; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
@@ -101,9 +101,9 @@ define <vscale x 4 x i32> @gld1w_s_uxtw(<vscale x 4 x i1> %pg, i32* %base, <vsca
 ; CHECK-LABEL: gld1w_s_uxtw:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %load
 }
 
@@ -111,30 +111,30 @@ define <vscale x 4 x i32> @gld1w_s_sxtw(<vscale x 4 x i1> %pg, i32* %base, <vsca
 ; CHECK-LABEL: gld1w_s_sxtw:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 4 x i32> %b)
   ret <vscale x 4 x i32> %load
 }
 
-define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_uxtw:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1w_d_sxtw:
 ; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
@@ -143,9 +143,9 @@ define <vscale x 4 x float> @gld1w_s_uxtw_float(<vscale x 4 x i1> %pg, float* %b
 ; CHECK-LABEL: gld1w_s_uxtw_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                      float* %base,
-                                                                                      <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                              float* %base,
+                                                                              <vscale x 4 x i32> %b)
   ret <vscale x 4 x float> %load
 }
 
@@ -153,50 +153,50 @@ define <vscale x 4 x float> @gld1w_s_sxtw_float(<vscale x 4 x i1> %pg, float* %b
 ; CHECK-LABEL: gld1w_s_sxtw_float:
 ; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                      float* %base,
-                                                                                      <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32(<vscale x 4 x i1> %pg,
+                                                                              float* %base,
+                                                                              <vscale x 4 x i32> %b)
   ret <vscale x 4 x float> %load
 }
 
 ; LD1D
-define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_uxtw:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i64* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                            i64* %base,
+                                                                            <vscale x 2 x i32> %b)
   ret <vscale x 2 x i64> %load
 }
 
-define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_sxtw:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i64* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                            i64* %base,
+                                                                            <vscale x 2 x i32> %b)
   ret <vscale x 2 x i64> %load
 }
 
-define <vscale x 2 x double> @gld1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_d_uxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_uxtw_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                       double* %base,
-                                                                                       <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                               double* %base,
+                                                                               <vscale x 2 x i32> %b)
   ret <vscale x 2 x double> %load
 }
 
-define <vscale x 2 x double> @gld1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x double> @gld1d_d_sxtw_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1d_d_sxtw_double:
 ; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                       double* %base,
-                                                                                       <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                               double* %base,
+                                                                               <vscale x 2 x i32> %b)
   ret <vscale x 2 x double> %load
 }
 
@@ -211,9 +211,9 @@ define <vscale x 4 x i32> @gld1sb_s_uxtw(<vscale x 4 x i1> %pg, i8* %base, <vsca
 ; CHECK-LABEL: gld1sb_s_uxtw:
 ; CHECK: ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
   %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
@@ -222,31 +222,31 @@ define <vscale x 4 x i32> @gld1sb_s_sxtw(<vscale x 4 x i1> %pg, i8* %base, <vsca
 ; CHECK-LABEL: gld1sb_s_sxtw:
 ; CHECK: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 4 x i32> %b)
   %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sb_d_uxtw:
 ; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sb_d_sxtw:
 ; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                  i8* %base,
-                                                                                  <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1> %pg,
+                                                                          i8* %base,
+                                                                          <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
@@ -256,9 +256,9 @@ define <vscale x 4 x i32> @gld1sh_s_uxtw(<vscale x 4 x i1> %pg, i16* %base, <vsc
 ; CHECK-LABEL: gld1sh_s_uxtw:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:	ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
   %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
@@ -267,82 +267,82 @@ define <vscale x 4 x i32> @gld1sh_s_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vsc
 ; CHECK-LABEL: gld1sh_s_sxtw:
 ; CHECK: ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 4 x i32> %b)
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 4 x i32> %b)
   %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
   ret <vscale x 4 x i32> %res
 }
 
-define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_uxtw:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sh_d_sxtw:
 ; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i16* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                            i16* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
 ; LD1SW
-define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_uxtw:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
-define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i32> %b) {
 ; CHECK-LABEL: gld1sw_d_sxtw:
 ; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
-  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
-                                                                                    i32* %base,
-                                                                                    <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                            i32* %base,
+                                                                            <vscale x 2 x i32> %b)
   %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
   ret <vscale x 2 x i64> %res
 }
 
 ; LD1B/LD1SB
-declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
-declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
-declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
-declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8(<vscale x 4 x i1>, i8*, <vscale x 4 x i32>)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i32>)
 
 ; LD1H/LD1SH
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
-declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
-declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i32>)
 
 ; LD1W/LD1SW
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
-declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
-declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32(<vscale x 4 x i1>, i32*, <vscale x 4 x i32>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i32>)
 
-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32.nxv4i32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
+declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32(<vscale x 4 x i1>, float*, <vscale x 4 x i32>)
 
 ; LD1D
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
-declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i32>)
 
-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
-declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64.nxv2i64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i32>)