[llvm] ff2dd8a - [AArch64][SVE] Fold vector ZExt/SExt into gather loads where possible
Joe Ellis via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 16 08:10:22 PDT 2021
Author: Joe Ellis
Date: 2021-03-16T15:09:46Z
New Revision: ff2dd8a21251ba0e6d284c9823ff1118a23b59ae
URL: https://github.com/llvm/llvm-project/commit/ff2dd8a21251ba0e6d284c9823ff1118a23b59ae
DIFF: https://github.com/llvm/llvm-project/commit/ff2dd8a21251ba0e6d284c9823ff1118a23b59ae.diff
LOG: [AArch64][SVE] Fold vector ZExt/SExt into gather loads where possible
This commit adds a DAGCombine optimization that folds sxtw'd or uxtw'd
vector offsets into gather loads where possible.
As an example, the following code:
  #include <arm_sve.h>

  svuint64_t func(svbool_t pred, const int32_t *base, svint64_t offsets) {
    return svld1sw_gather_s64offset_u64(
        pred, base, svextw_s64_x(pred, offsets)
    );
  }
would previously lower to the following assembly:
sxtw z0.d, p0/m, z0.d
ld1sw { z0.d }, p0/z, [x0, z0.d]
ret
but now lowers to:
ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
ret
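
At the LLVM IR level, the pattern the new combine targets is an offset
produced by the llvm.aarch64.sve.sxtw (or .uxtw) intrinsic feeding one of
the SVE gather-load intrinsics. As a rough sketch (the function name @func
is illustrative; the shape mirrors the gld1sw_d_sxtw test added below), the
C example above corresponds to IR along these lines:

  ; Sign-extend the 64-bit offsets from their low 32 bits, then gather.
  ; The combine folds the extension into the load, selecting the sxtw
  ; addressing mode instead of emitting a separate SXTW instruction.
  define <vscale x 2 x i64> @func(<vscale x 2 x i1> %pred, i32* %base, <vscale x 2 x i64> %offsets) {
    %ext = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
                                                                  <vscale x 2 x i1> %pred,
                                                                  <vscale x 2 x i64> %offsets)
    %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pred,
                                                                         i32* %base,
                                                                         <vscale x 2 x i64> %ext)
    %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
    ret <vscale x 2 x i64> %res
  }

  declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
  declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)

The fold only applies when the extension's predicate matches the load's
predicate and the extension is from 32 bits, as checked in
performGLD1Combine below.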
Differential Revision: https://reviews.llvm.org/D97858
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 613895dd3625..e61a6edac34c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14400,6 +14400,63 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+
+ assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
+ Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
+ (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
+ Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
+ "Invalid opcode.");
+
+ const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+ const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
+ Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
+
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Pg = N->getOperand(1);
+ SDValue Base = N->getOperand(2);
+ SDValue Offset = N->getOperand(3);
+ SDValue Ty = N->getOperand(4);
+
+ EVT ResVT = N->getValueType(0);
+
+ const auto OffsetOpc = Offset.getOpcode();
+ const bool OffsetIsZExt =
+ OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
+ const bool OffsetIsSExt =
+ OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
+
+ // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
+ if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
+ SDValue ExtPg = Offset.getOperand(0);
+ VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
+ EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
+
+ // If the predicate for the sign- or zero-extended offset is the
+ // same as the predicate used for this load and the sign-/zero-extension
+ // was from a 32-bit value, the extension can be folded into the load.
+ if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
+ SDValue UnextendedOffset = Offset.getOperand(1);
+
+ unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
+ if (Signed)
+ NewOpc = getSignExtendedGatherOpcode(NewOpc);
+
+ return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
+ {Chain, Pg, Base, UnextendedOffset, Ty});
+ }
+ }
+
+ return SDValue();
+}
+
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
@@ -15777,6 +15834,21 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case AArch64ISD::UZP1:
return performUzpCombine(N, DAG);
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLD1S_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
+ return performGLD1Combine(N, DAG);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
index 64cb89edd679..57778847b545 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
@@ -78,7 +78,194 @@ define <vscale x 2 x i64> @gld1sw_index(<vscale x 2 x i1> %pg, i32* %base, <vsca
ret <vscale x 2 x i64> %res
}
+;
+; LD1H, LD1W, LD1D: base + 64-bit sxtw'd scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_sxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_sxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit sxtw'd scaled offset
+; e.g. ld1sh z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_sxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_sxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1H, LD1W, LD1D: base + 64-bit uxtw'd scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_uxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_uxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit uxtw'd scaled offset
+; e.g. ld1sh z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_uxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_uxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
index 7cf641a26427..21c08d152ef3 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
@@ -100,8 +100,251 @@ define <vscale x 2 x i64> @gld1sw_d(<vscale x 2 x i1> %pg, i32* %base, <vscale x
ret <vscale x 2 x i64> %res
}
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit sxtw'd unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_sxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_sxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_sxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %sxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SW, LD1SH: base + 64-bit sxtw'd unscaled offset
+; e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_sxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_sxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_sxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+ %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %sxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit uxtw'd unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_uxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_uxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_uxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg,
+ i64* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg,
+ double* %base,
+ <vscale x 2 x i64> %uxtw)
+ ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SW, LD1SH: base + 64-bit uxtw'd unscaled offset
+; e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_uxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg,
+ i8* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_uxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %b)
+ %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg,
+ i16* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_uxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+ %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef,
+ <vscale x 2 x i1> %pg,
+ <vscale x 2 x i64> %offsets)
+ %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg,
+ i32* %base,
+ <vscale x 2 x i64> %uxtw)
+ %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %res
+}
+
declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)