[llvm] 46b9f14 - [AArch64][SVE] Add intrinsics for non-temporal scatters/gathers

Thu Mar 12 06:56:07 PDT 2020

Author: Andrzej Warzynski
Date: 2020-03-12T13:55:56Z
New Revision: 46b9f14d712d47fa0bebd72edd8cfe58cae11f53

URL: https://github.com/llvm/llvm-project/commit/46b9f14d712d47fa0bebd72edd8cfe58cae11f53
DIFF: https://github.com/llvm/llvm-project/commit/46b9f14d712d47fa0bebd72edd8cfe58cae11f53.diff

LOG: [AArch64][SVE] Add intrinsics for non-temporal scatters/gathers

Summary:
This patch adds the following intrinsics for non-temporal gather loads
and scatter stores:
  * aarch64_sve_ldnt1_gather_index
  * aarch64_sve_stnt1_scatter_index
These intrinsics implement the "scalar + vector of indices" addressing
mode.

As opposed to regular and first-faulting gathers/scatters, there's no
instruction that would take indices and then scale them. Instead, the
indices for non-temporal gathers/scatters are scaled before the
intrinsics are lowered to `ldnt1` instructions.

The new ISD nodes, GLDNT1_INDEX and SSTNT1_INDEX, are only used as
placeholders so that we can easily identify the cases implemented in
this patch in performGatherLoadCombine and performScatterStoreCombined.
Once encountered, they are replaced with:
  * GLDNT1_INDEX -> SPLAT_VECTOR + SHL + GLDNT1
  * SSTNT1_INDEX -> SPLAT_VECTOR + SHL + SSTNT1

The patterns for lowering ISD::SHL for scalable vectors (required by
this patch) were missing, so these are added too.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D75601

Added: 
    llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll
    llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
    llvm/lib/Target/AArch64/SVEInstrFormats.td

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index bd5a05d2e022..033e63255972 100644

--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1782,6 +1782,9 @@ def int_aarch64_sve_ldff1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic
 // 64 bit unscaled offsets
 def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
 
+// 64 bit indices
+def int_aarch64_sve_ldnt1_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
 // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
 def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
 
@@ -1829,6 +1832,10 @@ def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsi
 // 64 bit unscaled offsets
 def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
 
+// 64 bit indices
+def int_aarch64_sve_stnt1_scatter_index
+    : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
+
 // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
 def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
 

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 53357a44c4d7..e6d431af5954 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5262,7 +5262,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // amounts.  This catches things like trying to shift an i1024 value by an
     // i8, which is easy to fall into in generic code that uses
     // TLI.getShiftAmount().
-    assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
+    assert(N2.getValueType().getScalarSizeInBits().getFixedSize() >=
+               Log2_32_Ceil(VT.getScalarSizeInBits().getFixedSize()) &&
            "Invalid use of small shift amount with oversized value!");
 
     // Always fold shifts of i1 values so the code generator doesn't need to

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 376688825c06..05675503c96f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -190,6 +190,11 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
     return SelectSVELogicalImm(N, VT, Imm);
   }
 
+  template <unsigned Low, unsigned High>
+  bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
+    return SelectSVEShiftImm64(N, Low, High, Imm);
+  }
+
   // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
   template<signed Min, signed Max, signed Scale, bool Shift>
   bool SelectCntImm(SDValue N, SDValue &Imm) {
@@ -307,6 +312,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
 
   bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
+  bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
+                           SDValue &Imm);
 
   bool SelectSVEArithImm(SDValue N, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
@@ -3072,6 +3079,24 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
   return false;
 }
 
+// This method is only needed to "cast" i64s into i32s when the value
+// is a valid shift which has been splatted into a vector with i64 elements.
+// Every other type is fine in tablegen.
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
+                                              uint64_t High, SDValue &Imm) {
+  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+    uint64_t ImmVal = CN->getZExtValue();
+    SDLoc DL(N);
+
+    if (ImmVal >= Low && ImmVal <= High) {
+      Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
   // tagp(FrameIndex, IRGstack, tag_offset):
   // since the offset between FrameIndex and IRGstack is a compile-time

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 208e16da8028..e2c56c39c06d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1440,6 +1440,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::GLDFF1S_IMM:       return "AArch64ISD::GLDFF1S_IMM";
 
   case AArch64ISD::GLDNT1:            return "AArch64ISD::GLDNT1";
+  case AArch64ISD::GLDNT1_INDEX:      return "AArch64ISD::GLDNT1_INDEX";
   case AArch64ISD::GLDNT1S:           return "AArch64ISD::GLDNT1S";
 
   case AArch64ISD::SST1:              return "AArch64ISD::SST1";
@@ -1451,6 +1452,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::SST1_IMM:          return "AArch64ISD::SST1_IMM";
 
   case AArch64ISD::SSTNT1:            return "AArch64ISD::SSTNT1";
+  case AArch64ISD::SSTNT1_INDEX:      return "AArch64ISD::SSTNT1_INDEX";
 
   case AArch64ISD::LDP:               return "AArch64ISD::LDP";
   case AArch64ISD::STP:               return "AArch64ISD::STP";
@@ -12628,6 +12630,19 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                      DAG.getConstant(MinOffset, DL, MVT::i64));
 }
 
+// Turns the vector of indices into a vector of byte offstes by scaling Offset
+// by (BitWidth / 8).
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
+                                          SDLoc DL, unsigned BitWidth) {
+  assert(Offset.getValueType().isScalableVector() &&
+         "This method is only for scalable vectors of offsets");
+
+  SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
+  SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
+
+  return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
+}
+
 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                           unsigned Opcode,
                                           bool OnlyPackedOffsets = true) {
@@ -12655,6 +12670,15 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
   // vector of offsets  (that fits into one register)
   SDValue Offset = N->getOperand(5);
 
+  // For "scalar + vector of indices", just scale the indices. This only
+  // applies to non-temporal scatters because there's no instruction that takes
+  // indicies.
+  if (Opcode == AArch64ISD::SSTNT1_INDEX) {
+    Offset =
+        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+    Opcode = AArch64ISD::SSTNT1;
+  }
+
   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   //    * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
@@ -12749,6 +12773,15 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
   // vector of offsets  (that fits into one register)
   SDValue Offset = N->getOperand(4);
 
+  // For "scalar + vector of indices", just scale the indices. This only
+  // applies to non-temporal gathers because there's no instruction that takes
+  // indicies.
+  if (Opcode == AArch64ISD::GLDNT1_INDEX) {
+    Offset =
+        getScaledOffsetForBitWidth(DAG, Offset, DL, RetElVT.getSizeInBits());
+    Opcode = AArch64ISD::GLDNT1;
+  }
+
   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
@@ -13006,6 +13039,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
     case Intrinsic::aarch64_sve_ldnt1_gather:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
+    case Intrinsic::aarch64_sve_ldnt1_gather_index:
+      return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_INDEX);
     case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
     case Intrinsic::aarch64_sve_ldnf1:
@@ -13020,6 +13055,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
     case Intrinsic::aarch64_sve_stnt1_scatter:
       return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
+    case Intrinsic::aarch64_sve_stnt1_scatter_index:
+      return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX);
     case Intrinsic::aarch64_sve_ld1_gather:
       return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1);
     case Intrinsic::aarch64_sve_ld1_gather_index:

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2f267447e0bf..8394c90a11be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -263,6 +263,7 @@ enum NodeType : unsigned {
 
   // Non-temporal gather loads
   GLDNT1,
+  GLDNT1_INDEX,
   GLDNT1S,
 
   // Scatter store
@@ -276,6 +277,7 @@ enum NodeType : unsigned {
 
   // Non-temporal scatter store
   SSTNT1,
+  SSTNT1_INDEX,
 
   // Strict (exception-raising) floating point comparison
   STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,

diff  --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index fb17e9df53e7..833aee041aa5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
+def SVELShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
 
 // Non-faulting loads - node definitions
 //
@@ -139,7 +140,6 @@ def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
 def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
 
 let Predicates = [HasSVE] in {
-
   defm RDFFR_PPz  : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
   def  RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
   defm RDFFR_P    : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
@@ -1108,9 +1108,23 @@ multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instructio
   defm INDEX_II : sve_int_index_ii<"index", index_vector>;
 
   // Unpredicated shifts
-  defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
-  defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
-  defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
+  defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", sra>;
+  defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", srl>;
+  defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", shl>;
+
+  // Patterns for unpredicated left shift by immediate
+  def : Pat<(nxv16i8 (shl (nxv16i8 ZPR:$Zs1),
+                          (nxv16i8 (AArch64dup (vecshiftL8:$imm))))),
+            (LSL_ZZI_B ZPR:$Zs1, vecshiftL8:$imm)>;
+  def : Pat<(nxv8i16 (shl (nxv8i16 ZPR:$Zs1),
+                          (nxv8i16 (AArch64dup (vecshiftL16:$imm))))),
+            (LSL_ZZI_H ZPR:$Zs1, vecshiftL16:$imm)>;
+  def : Pat<(nxv4i32 (shl (nxv4i32 ZPR:$Zs1),
+                          (nxv4i32 (AArch64dup (vecshiftL32:$imm))))),
+            (LSL_ZZI_S ZPR:$Zs1, vecshiftL32:$imm)>;
+  def : Pat<(nxv2i64 (shl (nxv2i64 ZPR:$Zs1),
+                          (nxv2i64 (AArch64dup (i64 (SVELShiftImm64 i32:$imm)))))),
+            (LSL_ZZI_D ZPR:$Zs1, vecshiftL64:$imm)>;
 
   defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
   defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;

diff  --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 907820ebbe7f..6a9d3acff8fb 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4828,10 +4828,12 @@ multiclass sve_int_bin_cons_shift_wide<bits<2> opc, string asm> {
 }
 
 class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
-                               ZPRRegOp zprty, Operand immtype>
+                               ZPRRegOp zprty, Operand immtype, ValueType vt,
+                               SDPatternOperator op>
 : I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
   asm, "\t$Zd, $Zn, $imm",
-  "", []>, Sched<[]> {
+  "",
+  [(set (vt zprty:$Zd), (op (vt zprty:$Zn), immtype:$imm))]>, Sched<[]> {
   bits<5> Zd;
   bits<5> Zn;
   bits<6> imm;
@@ -4846,29 +4848,31 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
   let Inst{4-0}   = Zd;
 }
 
-multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
-  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
-  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
+                                         SDPatternOperator op> {
+  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, nxv16i8, op>;
+  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, nxv8i16, op> {
     let Inst{19} = imm{3};
   }
-  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, nxv4i32, op> {
     let Inst{20-19} = imm{4-3};
   }
-  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, nxv2i64, op> {
     let Inst{22}    = imm{5};
     let Inst{20-19} = imm{4-3};
   }
 }
 
-multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
-  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
-  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
+multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
+                                          SDPatternOperator op> {
+  def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, nxv16i8, op>;
+  def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, nxv8i16, op> {
     let Inst{19} = imm{3};
   }
-  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
+  def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, nxv4i32, op> {
     let Inst{20-19} = imm{4-3};
   }
-  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
+  def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, nxv2i64, op> {
     let Inst{22}    = imm{5};
     let Inst{20-19} = imm{4-3};
   }

diff  --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll
new file mode 100644
index 000000000000..34b24284b1b8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll
@@ -0,0 +1,90 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1H, LDNT1W, LDNT1D: base + 64-bit index
+;   e.g.
+;     lsl z0.d, z0.d, #1
+;     ldnt1h z0.d, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1h_index
+; CHECK:        lsl z0.d, z0.d, #1
+; CHECK-NEXT:   ldnt1h  { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:   ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                               i16* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1w_index
+; CHECK:        lsl z0.d, z0.d, #2
+; CHECK-NEXT:   ldnt1w  { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:   ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                               i32* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_index
+; CHECK:        lsl z0.d, z0.d, #3
+; CHECK-NEXT:   ldnt1d  { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:   ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                               i64* %base,
+                                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldnt1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_index_double
+; CHECK:        lsl z0.d, z0.d, #3
+; CHECK-NEXT:   ldnt1d  { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:   ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                  double* %base,
+                                                                                  <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SH, LDNT1SW: base + 64-bit index
+;   e.g.
+;     lsl z0.d, z0.d, #1
+;     ldnt1sh z0.d, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1sh_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sh_index
+; CHECK:        lsl z0.d, z0.d, #1
+; CHECK-NEXT:   ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:   ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                               i16* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sw_index
+; CHECK:        lsl z0.d, z0.d, #2
+; CHECK-NEXT:   ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:   ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                               i32* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)

diff  --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll
new file mode 100644
index 000000000000..fc14d20822b9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1H, STNT1W, STNT1D: base + 64-bit index
+;   e.g.
+;     lsl z1.d, z1.d, #1
+;     stnt1h { z0.d }, p0, [z0.d, x0]
+;
+
+define void @sstnt1h_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1h_index
+; CHECK:        lsl z1.d, z1.d, #1
+; CHECK-NEXT:   stnt1h  { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:   ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(<vscale x 2 x i16> %data_trunc,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i16* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1w_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1w_index
+; CHECK:        lsl z1.d, z1.d, #2
+; CHECK-NEXT:   stnt1w  { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:   ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i32* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void  @sstnt1d_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1d_index
+; CHECK:        lsl z1.d, z1.d, #3
+; CHECK-NEXT:   stnt1d  { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:   ret
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(<vscale x 2 x i64> %data,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i64* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void  @sstnt1d_index_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1d_index_double
+; CHECK:        lsl z1.d, z1.d, #3
+; CHECK-NEXT:   stnt1d  { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:   ret
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(<vscale x 2 x double> %data,
+                                                          <vscale x 2 x i1> %pg,
+                                                          double* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)