[llvm] [AArch64][GlobalISel] Add support for post-indexed loads/stores. (PR #69532)

Amara Emerson via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 19 11:43:22 PDT 2023


https://github.com/aemerson updated https://github.com/llvm/llvm-project/pull/69532

>From 54a92838644980ff6429c7e1d031d21747e29072 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Mon, 25 Sep 2023 08:55:08 -0700
Subject: [PATCH 1/3] [AArch64][GlobalISel] Add support for post-indexed
 loads/stores.

Gives small code size improvements across the board at -Os CTMark.

Much of the work is porting the existing heuristics in the DAGCombiner.
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   6 +-
 .../CodeGen/GlobalISel/GenericMachineInstrs.h |   2 +-
 .../include/llvm/Target/GlobalISel/Combine.td |   2 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 209 ++++--
 llvm/lib/Target/AArch64/AArch64Combine.td     |   1 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  19 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   2 +
 .../GISel/AArch64InstructionSelector.cpp      |  85 +++
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  48 ++
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  74 +-
 .../AArch64/GISel/AArch64RegisterBankInfo.h   |   5 +
 .../combiner-load-store-indexing.ll           | 217 +++---
 .../legalize-indexed-load-stores.mir          |  89 +++
 .../GlobalISel/legalizer-info-validation.mir  |  17 +-
 .../CodeGen/AArch64/arm64-indexed-memory.ll   | 284 ++------
 .../AArch64/arm64-indexed-vector-ldst.ll      | 652 ++++++------------
 16 files changed, 887 insertions(+), 825 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index d64b414f2747621..65299e852574bd1 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -58,6 +58,8 @@ struct IndexedLoadStoreMatchInfo {
   Register Addr;
   Register Base;
   Register Offset;
+  bool RematOffset = false; // True if Offset is a constant that needs to be
+                            // rematerialized before the new load/store.
   bool IsPre;
 };
 
@@ -814,12 +816,14 @@ class CombinerHelper {
   void applyCommuteBinOpOperands(MachineInstr &MI);
 
 private:
+  /// Checks for legality of an indexed variant of \p LdSt.
+  bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
   /// Given a non-indexed load or store instruction \p MI, find an offset that
   /// can be usefully and legally folded into it as a post-indexing operation.
   ///
   /// \returns true if a candidate is found.
   bool findPostIndexCandidate(GLoadStore &MI, Register &Addr, Register &Base,
-                              Register &Offset);
+                              Register &Offset, bool &RematOffset);
 
   /// Given a non-indexed load or store instruction \p MI, find an offset that
   /// can be usefully and legally folded into it as a pre-indexing operation.
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 6c36b1bbcf8649b..b34b90fd24eb602 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -97,7 +97,7 @@ class GIndexedLoad : public GMemOperation {
   /// Get the offset register of the pointer value.
   Register getOffsetReg() const { return getOperand(3).getReg(); }
 
-  bool isPre() const { return getOperand(5).getImm() == 1; }
+  bool isPre() const { return getOperand(4).getImm() == 1; }
   bool isPost() const { return !isPre(); }
 
   static bool classof(const MachineInstr *MI) {
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 7e0691e1ee95048..bb8223ba3486a8d 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1248,7 +1248,7 @@ def constant_fold_binops : GICombineGroup<[constant_fold_binop,
 
 def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     extract_vec_elt_combines, combines_for_extload,
-    combine_indexed_load_store, undef_combines, identity_combines, phi_combines,
+    undef_combines, identity_combines, phi_combines,
     simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big,
     reassocs, ptr_add_immed_chain,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 9efb70f28fee3ee..a8425db6584f61c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -945,42 +945,171 @@ void CombinerHelper::applySextInRegOfLoad(
   MI.eraseFromParent();
 }
 
+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+  if (Ty.isVector())
+    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+                                Ty.getNumElements());
+  return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+/// Return true if 'MI' is a load or a store that may fold its address
+/// operand into the load / store addressing mode.
+static bool canFoldInAddressingMode(GLoadStore *MI,
+                                    const TargetLowering &TLI,
+                                    MachineRegisterInfo &MRI) {
+  TargetLowering::AddrMode AM;
+  auto *MF = MI->getMF();
+  auto *Addr = getOpcodeDef<GPtrAdd>(MI->getPointerReg(), MRI);
+  if (!Addr)
+    return false;
+
+  AM.HasBaseReg = true;
+  auto CstOff = getIConstantVRegVal(Addr->getOffsetReg(), MRI);
+  if (CstOff)
+    AM.BaseOffs = CstOff->getSExtValue(); // [reg +/- imm]
+  else
+    AM.Scale = 1; // [reg +/- reg]
+
+  return TLI.isLegalAddressingMode(
+      MF->getDataLayout(), AM,
+      getTypeForLLT(MI->getMMO().getMemoryType(),
+                    MF->getFunction().getContext()),
+      MI->getMMO().getAddrSpace());
+}
+
+namespace {
+unsigned getIndexedOpc(unsigned LdStOpc) {
+  switch (LdStOpc) {
+  case TargetOpcode::G_LOAD:
+    return TargetOpcode::G_INDEXED_LOAD;
+  case TargetOpcode::G_STORE:
+    return TargetOpcode::G_INDEXED_STORE;
+  case TargetOpcode::G_ZEXTLOAD:
+    return TargetOpcode::G_INDEXED_ZEXTLOAD;
+  case TargetOpcode::G_SEXTLOAD:
+    return TargetOpcode::G_INDEXED_SEXTLOAD;
+  default:
+    llvm_unreachable("Unexpected opcode");
+  }
+}
+} // namespace
+
+bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
+  // Check for legality.
+  LLT PtrTy = MRI.getType(LdSt.getPointerReg());
+  LLT Ty = MRI.getType(LdSt.getReg(0));
+  LLT MemTy = LdSt.getMMO().getMemoryType();
+  SmallVector<LegalityQuery::MemDesc, 2> MemDescrs(
+      {{MemTy, MemTy.getSizeInBits(), AtomicOrdering::NotAtomic}});
+  unsigned IndexedOpc = getIndexedOpc(LdSt.getOpcode());
+  SmallVector<LLT> OpTys;
+  if (IndexedOpc == TargetOpcode::G_INDEXED_STORE)
+    OpTys = {PtrTy, Ty, Ty};
+  else
+    OpTys = {Ty, PtrTy}; // For G_INDEXED_LOAD, G_INDEXED_[SZ]EXTLOAD
+
+  LegalityQuery Q(IndexedOpc, OpTys, MemDescrs);
+  return isLegal(Q);
+}
+
+static cl::opt<unsigned> PostIndexUseThreshold(
+    "post-index-use-threshold", cl::Hidden, cl::init(32),
+    cl::desc("Number of uses of a base pointer to check before it is no longer "
+             "considered for post-indexing."));
+
 bool CombinerHelper::findPostIndexCandidate(GLoadStore &LdSt, Register &Addr,
-                                            Register &Base, Register &Offset) {
+                                            Register &Base, Register &Offset,
+                                            bool &RematOffset) {
+  // We're looking for the following pattern, for either load or store:
+  // %baseptr:_(p0) = ...
+  // G_STORE %val(s64), %baseptr(p0)
+  // %offset:_(s64) = G_CONSTANT i64 -256
+  // %new_addr:_(p0) = G_PTR_ADD %baseptr, %offset(s64)
   auto &MF = *LdSt.getParent()->getParent();
   const auto &TLI = *MF.getSubtarget().getTargetLowering();
 
-  Base = LdSt.getPointerReg();
+  Register Ptr = LdSt.getPointerReg();
+  // If the load/store is the only use of the pointer, don't bother.
+  if (MRI.hasOneNonDBGUse(Ptr))
+    return false;
+
+  if (!isIndexedLoadStoreLegal(LdSt))
+    return false;
 
-  if (getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Base, MRI))
+  if (getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Ptr, MRI))
     return false;
 
-  // FIXME: The following use traversal needs a bail out for patholigical cases.
-  for (auto &Use : MRI.use_nodbg_instructions(Base)) {
+  MachineInstr *StoredValDef = getDefIgnoringCopies(LdSt.getReg(0), MRI);
+  auto *PtrDef = MRI.getVRegDef(Ptr);
+
+  unsigned NumUsesChecked = 0;
+  for (auto &Use : MRI.use_nodbg_instructions(Ptr)) {
+    if (++NumUsesChecked > PostIndexUseThreshold)
+      return false; // Try to avoid exploding compile time.
+
     auto *PtrAdd = dyn_cast<GPtrAdd>(&Use);
-    if (!PtrAdd)
+    // The use itself might be dead. This can happen during combines if DCE
+    // hasn't had a chance to run yet. Don't allow it to form an indexed op.
+    if (!PtrAdd || MRI.use_nodbg_empty(PtrAdd->getReg(0)))
+      continue;
+
+    // Check the user of this isn't the store, otherwise we'd generate an
+    // indexed store defining its own use.
+    if (StoredValDef == &Use)
       continue;
 
     Offset = PtrAdd->getOffsetReg();
     if (!ForceLegalIndexing &&
-        !TLI.isIndexingLegal(LdSt, Base, Offset, /*IsPre*/ false, MRI))
+        !TLI.isIndexingLegal(LdSt, PtrAdd->getBaseReg(), Offset,
+                             /*IsPre*/ false, MRI))
       continue;
 
     // Make sure the offset calculation is before the potentially indexed op.
     MachineInstr *OffsetDef = MRI.getVRegDef(Offset);
-    if (!dominates(*OffsetDef, LdSt))
-      continue;
+    if (!dominates(*OffsetDef, LdSt)) {
+      // If the offset however is just a G_CONSTANT, we can always just
+      // rematerialize it where we need it.
+      if (OffsetDef->getOpcode() != TargetOpcode::G_CONSTANT)
+        continue;
+      RematOffset = true;
+    }
 
-    // FIXME: check whether all uses of Base are load/store with foldable
-    // addressing modes. If so, using the normal addr-modes is better than
-    // forming an indexed one.
-    if (any_of(MRI.use_nodbg_instructions(PtrAdd->getReg(0)),
-               [&](MachineInstr &PtrAddUse) {
-                 return !dominates(LdSt, PtrAddUse);
-               }))
-      continue;
+    for (auto &BasePtrUse : MRI.use_nodbg_instructions(PtrAdd->getBaseReg())) {
+      if (&BasePtrUse == PtrDef)
+        continue;
+
+      // If the user is a later load/store that can be post-indexed, then don't
+      // combine this one.
+      auto *BasePtrLdSt = dyn_cast<GLoadStore>(&BasePtrUse);
+      if (BasePtrLdSt && BasePtrLdSt != &LdSt) {
+        if (dominates(LdSt, *BasePtrLdSt)) {
+          if (isIndexedLoadStoreLegal(*BasePtrLdSt))
+            return false;
+        }
+      }
+
+      // Now we're looking for the key G_PTR_ADD instruction, which contains
+      // the offset add that we want to fold.
+      if (auto *BasePtrUseDef = dyn_cast<GPtrAdd>(&BasePtrUse)) {
+        Register PtrAddDefReg = BasePtrUseDef->getReg(0);
+        for (auto &BaseUseUse : MRI.use_nodbg_instructions(PtrAddDefReg)) {
+          // If the use is in a different block, then we may produce worse code
+          // due to the extra register pressure.
+          if (BaseUseUse.getParent() != LdSt.getParent())
+            return false;
+
+          if (auto *UseUseLdSt = dyn_cast<GLoadStore>(&BaseUseUse)) {
+            if (canFoldInAddressingMode(UseUseLdSt, TLI, MRI))
+              return false;
+          }
+        }
+        if (!dominates(LdSt, BasePtrUse))
+          return false; // All uses must be dominated by the load/store.
+      }
+    }
 
     Addr = PtrAdd->getReg(0);
+    Base = PtrAdd->getBaseReg();
     return true;
   }
 
@@ -1001,6 +1130,9 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr,
       !TLI.isIndexingLegal(LdSt, Base, Offset, /*IsPre*/ true, MRI))
     return false;
 
+  if (!isIndexedLoadStoreLegal(LdSt))
+    return false;
+
   MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
   if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
     return false;
@@ -1027,16 +1159,14 @@ bool CombinerHelper::matchCombineIndexedLoadStore(
     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
   auto &LdSt = cast<GLoadStore>(MI);
 
-  // For now, no targets actually support these opcodes so don't waste time
-  // running these unless we're forced to for testing.
-  if (!ForceLegalIndexing)
+  if (LdSt.isAtomic())
     return false;
 
   MatchInfo.IsPre = findPreIndexCandidate(LdSt, MatchInfo.Addr, MatchInfo.Base,
                                           MatchInfo.Offset);
   if (!MatchInfo.IsPre &&
       !findPostIndexCandidate(LdSt, MatchInfo.Addr, MatchInfo.Base,
-                              MatchInfo.Offset))
+                              MatchInfo.Offset, MatchInfo.RematOffset))
     return false;
 
   return true;
@@ -1045,28 +1175,21 @@ bool CombinerHelper::matchCombineIndexedLoadStore(
 void CombinerHelper::applyCombineIndexedLoadStore(
     MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo) {
   MachineInstr &AddrDef = *MRI.getUniqueVRegDef(MatchInfo.Addr);
-  MachineIRBuilder MIRBuilder(MI);
+  Builder.setInstrAndDebugLoc(MI);
   unsigned Opcode = MI.getOpcode();
   bool IsStore = Opcode == TargetOpcode::G_STORE;
-  unsigned NewOpcode;
-  switch (Opcode) {
-  case TargetOpcode::G_LOAD:
-    NewOpcode = TargetOpcode::G_INDEXED_LOAD;
-    break;
-  case TargetOpcode::G_SEXTLOAD:
-    NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
-    break;
-  case TargetOpcode::G_ZEXTLOAD:
-    NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
-    break;
-  case TargetOpcode::G_STORE:
-    NewOpcode = TargetOpcode::G_INDEXED_STORE;
-    break;
-  default:
-    llvm_unreachable("Unknown load/store opcode");
+  unsigned NewOpcode = getIndexedOpc(Opcode);
+
+  // If the offset constant didn't happen to dominate the load/store, we can
+  // just clone it as needed.
+  if (MatchInfo.RematOffset) {
+    auto *OldCst = MRI.getVRegDef(MatchInfo.Offset);
+    auto NewCst = Builder.buildConstant(MRI.getType(MatchInfo.Offset),
+                                        *OldCst->getOperand(1).getCImm());
+    MatchInfo.Offset = NewCst.getReg(0);
   }
 
-  auto MIB = MIRBuilder.buildInstr(NewOpcode);
+  auto MIB = Builder.buildInstr(NewOpcode);
   if (IsStore) {
     MIB.addDef(MatchInfo.Addr);
     MIB.addUse(MI.getOperand(0).getReg());
@@ -1245,13 +1368,7 @@ void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI,
   Observer.changedInstr(*BrCond);
 }
 
-static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
-  if (Ty.isVector())
-    return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
-                                Ty.getNumElements());
-  return IntegerType::get(C, Ty.getSizeInBits());
-}
-
+ 
 bool CombinerHelper::tryEmitMemcpyInline(MachineInstr &MI) {
   MachineIRBuilder HelperBuilder(MI);
   GISelObserverWrapper DummyObserver;
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index f7b55cad4269944..017c4523c23a184 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -246,6 +246,7 @@ def AArch64PostLegalizerLowering
 def AArch64PostLegalizerCombiner
     : GICombiner<"AArch64PostLegalizerCombinerImpl",
                        [copy_prop, combines_for_extload,
+                        combine_indexed_load_store,
                         sext_trunc_sextload, mutate_anyext_to_zext,
                         hoist_logic_op_with_same_opcode_hands,
                         redundant_and, xor_of_and_with_same_reg,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a16a102e472e709..b20c5823371c226 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -37,6 +37,8 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -23615,6 +23617,23 @@ bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return CI->isTailCall();
 }
 
+bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
+                                            Register Offset, bool IsPre,
+                                            MachineRegisterInfo &MRI) const {
+  // HACK
+  if (IsPre)
+    return false; // Until we implement.
+  
+  auto CstOffset = getIConstantVRegVal(Offset, MRI);
+  if (!CstOffset || CstOffset->isZero())
+    return false;
+
+  // All of the indexed addressing mode instructions take a signed 9 bit
+  // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
+  // encodes the sign/indexing direction.
+  return isInt<9>(CstOffset->getSExtValue());
+}
+
 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
                                                    SDValue &Base,
                                                    SDValue &Offset,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9dcfba3a229cccd..52e519cd8a0c93c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1201,6 +1201,8 @@ class AArch64TargetLowering : public TargetLowering {
   bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
                                   SDValue &Offset, ISD::MemIndexedMode &AM,
                                   SelectionDAG &DAG) const override;
+  bool isIndexingLegal(MachineInstr &MI, Register Base, Register Offset,
+                       bool IsPre, MachineRegisterInfo &MRI) const override;
 
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 1c7a09696e853e2..152a6bfab21faf2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
@@ -224,6 +225,9 @@ class AArch64InstructionSelector : public InstructionSelector {
   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
 
+  bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);
+
   unsigned emitConstantPoolEntry(const Constant *CPVal,
                                  MachineFunction &MF) const;
   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
@@ -3038,6 +3042,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
   }
 
+  case TargetOpcode::G_INDEXED_LOAD:
+    return selectIndexedLoad(I, MRI);
+  case TargetOpcode::G_INDEXED_STORE:
+    return selectIndexedStore(cast<GIndexedStore>(I), MRI);
+
   case TargetOpcode::G_SMULH:
   case TargetOpcode::G_UMULH: {
     // Reject the various things we don't support yet.
@@ -5621,6 +5630,82 @@ MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
   return &*Mov;
 }
 
+bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
+                                                   MachineRegisterInfo &MRI) {
+  // TODO: extending loads.
+  if (isa<GIndexedExtLoad>(MI))
+    return false;
+
+  auto &Ld = cast<GIndexedLoad>(MI);
+  Register Dst = Ld.getDstReg();
+  Register WriteBack = Ld.getWritebackReg();
+  Register Base = Ld.getBaseReg();
+  Register Offset = Ld.getOffsetReg();
+
+  if (Ld.isPre())
+    return false; // TODO: add pre-inc support
+
+  unsigned Opc = 0;
+  static constexpr unsigned GPROpcodes[] = {
+      AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
+      AArch64::LDRXpost};
+  static constexpr unsigned FPROpcodes[] = {
+      AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
+      AArch64::LDRDpost, AArch64::LDRQpost};
+
+  unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
+  if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
+    Opc = FPROpcodes[Log2_32(MemSize)];
+  else
+    Opc = GPROpcodes[Log2_32(MemSize)];
+
+  auto Cst = getIConstantVRegVal(Offset, MRI);
+  if (!Cst)
+    return false; // Shouldn't happen, but just in case.
+  auto LdMI =
+      MIB.buildInstr(Opc, {WriteBack, Dst}, {Base}).addImm(Cst->getSExtValue());
+  LdMI.cloneMemRefs(Ld);
+  constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
+                                                    MachineRegisterInfo &MRI) {
+  Register Dst = I.getWritebackReg();
+  Register Val = I.getValueReg();
+  Register Base = I.getBaseReg();
+  Register Offset = I.getOffsetReg();
+  LLT ValTy = MRI.getType(Val);
+
+  if (I.isPre())
+    return false; // TODO: add pre-inc support
+
+  unsigned Opc = 0;
+  static constexpr unsigned GPROpcodes[] = {
+      AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
+      AArch64::STRXpost};
+  static constexpr unsigned FPROpcodes[] = {
+      AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
+      AArch64::STRDpost, AArch64::STRQpost};
+
+  assert(ValTy.getSizeInBits() <= 128);
+  if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
+    Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())];
+  else
+    Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())];
+
+  auto Cst = getIConstantVRegVal(Offset, MRI);
+  if (!Cst)
+    return false; // Shouldn't happen, but just in case.
+  auto Str =
+      MIB.buildInstr(Opc, {Dst}, {Val, Base}).addImm(Cst->getSExtValue());
+  Str.cloneMemRefs(I);
+  constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
+  I.eraseFromParent();
+  return true;
+}
+
 MachineInstr *
 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
                                                MachineIRBuilder &MIRBuilder,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index ddc27bebb767693..bb396cb26afc793 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -14,6 +14,7 @@
 #include "AArch64LegalizerInfo.h"
 #include "AArch64RegisterBankInfo.h"
 #include "AArch64Subtarget.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
@@ -65,6 +66,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
                                                         /* End 128bit types */
                                                         /* Begin 64bit types */
                                                         v8s8, v4s16, v2s32};
+  std::initializer_list<LLT> ScalarAndPtrTypesList = {s8, s16, s32, s64, p0};
+  SmallVector<LLT, 8> PackedVectorAllTypesVec(PackedVectorAllTypeList);
+  SmallVector<LLT, 8> ScalarAndPtrTypesVec(ScalarAndPtrTypesList);
 
   const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
 
@@ -413,6 +417,50 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .customIf(IsPtrVecPred)
       .scalarizeIf(typeIs(0, v2s16), 0);
 
+  getActionDefinitionsBuilder(G_INDEXED_STORE)
+      // Idx 0 == Ptr, Idx 1 == Val
+      // TODO: we can implement legalizations but as of now these are
+      // generated in a very specific way.
+      .legalForTypesWithMemDesc({
+          {p0, s8, s8, 8},
+          {p0, s16, s16, 8},
+          {p0, s32, s8, 8},
+          {p0, s32, s16, 8},
+          {p0, s32, s32, 8},
+          {p0, s64, s64, 8},
+          {p0, p0, p0, 8},
+          {p0, v8s8, v8s8, 8},
+          {p0, v16s8, v16s8, 8},
+          {p0, v4s16, v4s16, 8},
+          {p0, v8s16, v8s16, 8},
+          {p0, v2s32, v2s32, 8},
+          {p0, v4s32, v4s32, 8},
+          {p0, v2s64, v2s64, 8},
+          {p0, v2p0, v2p0, 8},
+          {p0, s128, s128, 8},
+      })
+      .unsupported();
+
+  auto IndexedLoadBasicPred = [=](const LegalityQuery &Query) {
+    LLT LdTy = Query.Types[0];
+    LLT PtrTy = Query.Types[1];
+    if (llvm::find(PackedVectorAllTypesVec, LdTy) ==
+            PackedVectorAllTypesVec.end() &&
+        llvm::find(ScalarAndPtrTypesVec, LdTy) == ScalarAndPtrTypesVec.end() &&
+        LdTy != s128)
+      return false;
+    if (PtrTy != p0)
+      return false;
+    return true;
+  };
+  getActionDefinitionsBuilder(G_INDEXED_LOAD)
+      .unsupportedIf(
+          atomicOrderingAtLeastOrStrongerThan(0, AtomicOrdering::Unordered))
+      .legalIf(IndexedLoadBasicPred)
+      .unsupported();
+  getActionDefinitionsBuilder({G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD})
+      .unsupported(); // TODO: implement
+
   // Constants
   getActionDefinitionsBuilder(G_CONSTANT)
       .legalFor({p0, s8, s16, s32, s64})
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 26954c62e03f1fc..5e96bc67d8ded13 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -605,6 +605,35 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
   return hasFPConstraints(MI, MRI, TRI, Depth);
 }
 
+bool AArch64RegisterBankInfo::isLoadFromFPType(const MachineInstr &MI) const {
+  // GMemOperation because we also want to match indexed loads. Use cast<>
+  // rather than dyn_cast<>: the result is dereferenced unconditionally.
+  auto *Load = cast<GMemOperation>(&MI);
+  const auto &MMO = Load->getMMO();
+  const Value *LdVal = MMO.getValue();
+  if (!LdVal)
+    return false;
+
+  Type *EltTy = nullptr;
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(LdVal)) {
+    EltTy = GV->getValueType();
+  } else {
+    // FIXME: grubbing around uses is pretty ugly, but with no more
+    // `getPointerElementType` there's not much else we can do.
+    for (const auto *LdUser : LdVal->users()) {
+      if (isa<LoadInst>(LdUser)) {
+        EltTy = LdUser->getType();
+        break;
+      }
+      if (isa<StoreInst>(LdUser) && LdUser->getOperand(1) == LdVal) {
+        EltTy = LdUser->getOperand(0)->getType();
+        break;
+      }
+    }
+  }
+  return EltTy && EltTy->isFPOrFPVectorTy();
+}
+
 const RegisterBankInfo::InstructionMapping &
 AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   const unsigned Opc = MI.getOpcode();
@@ -814,30 +843,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
 
     // Try to guess the type of the load from the MMO.
-    const auto &MMO = **MI.memoperands_begin();
-    const Value *LdVal = MMO.getValue();
-    if (LdVal) {
-      Type *EltTy = nullptr;
-      if (const GlobalValue *GV = dyn_cast<GlobalValue>(LdVal)) {
-        EltTy = GV->getValueType();
-      } else {
-        // FIXME: grubbing around uses is pretty ugly, but with no more
-        // `getPointerElementType` there's not much else we can do.
-        for (const auto *LdUser : LdVal->users()) {
-          if (isa<LoadInst>(LdUser)) {
-            EltTy = LdUser->getType();
-            break;
-          }
-          if (isa<StoreInst>(LdUser) && LdUser->getOperand(1) == LdVal) {
-            EltTy = LdUser->getOperand(0)->getType();
-            break;
-          }
-        }
-      }
-      if (EltTy && EltTy->isFPOrFPVectorTy()) {
-        OpRegBankIdx[0] = PMI_FirstFPR;
-        break;
-      }
+    if (isLoadFromFPType(MI)) {
+      OpRegBankIdx[0] = PMI_FirstFPR;
+      break;
     }
 
     // Check if that load feeds fp instructions.
@@ -870,6 +878,24 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       break;
     }
     break;
+  case TargetOpcode::G_INDEXED_STORE:
+    if (OpRegBankIdx[1] == PMI_FirstGPR) {
+      Register VReg = MI.getOperand(1).getReg();
+      if (!VReg)
+        break;
+      MachineInstr *DefMI = MRI.getVRegDef(VReg);
+      if (onlyDefinesFP(*DefMI, MRI, TRI))
+        OpRegBankIdx[1] = PMI_FirstFPR;
+      break;
+    }
+    break;
+  case TargetOpcode::G_INDEXED_LOAD:
+  case TargetOpcode::G_INDEXED_SEXTLOAD:
+  case TargetOpcode::G_INDEXED_ZEXTLOAD: {
+    if (isLoadFromFPType(MI))
+      OpRegBankIdx[0] = PMI_FirstFPR;
+    break;
+  }
   case TargetOpcode::G_SELECT: {
     // If the destination is FPR, preserve that.
     if (OpRegBankIdx[0] != PMI_FirstGPR)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index f8b16e3177cc4bd..4d40efb5ac92485 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
 #define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H
 
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
 #include "llvm/CodeGen/RegisterBankInfo.h"
 
 #define GET_REGBANK_DECLARATIONS
@@ -131,6 +132,10 @@ class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
   bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI,
                      const TargetRegisterInfo &TRI, unsigned Depth = 0) const;
 
+  /// \returns true if the load \p MI is likely loading from a floating-point
+  /// type.
+  bool isLoadFromFPType(const MachineInstr &MI) const;
+
 public:
   AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combiner-load-store-indexing.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combiner-load-store-indexing.ll
index 05d0ef9551bb230..cae1b3949dca7bf 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combiner-load-store-indexing.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combiner-load-store-indexing.ll
@@ -1,63 +1,53 @@
-; RUN: llc -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -verify-machineinstrs -stop-after=aarch64-prelegalizer-combiner -force-legal-indexing %s -o - | FileCheck %s
-; RUN: llc -debugify-and-strip-all-safe -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -verify-machineinstrs -stop-after=aarch64-prelegalizer-combiner -force-legal-indexing %s -o - | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -verify-machineinstrs -stop-after=aarch64-postlegalizer-combiner -force-legal-indexing %s -o - | FileCheck %s
+; RUN: llc -debugify-and-strip-all-safe -mtriple=arm64-apple-ios -global-isel -global-isel-abort=1 -verify-machineinstrs -stop-after=aarch64-postlegalizer-combiner -force-legal-indexing %s -o - | FileCheck %s
 
 define ptr @test_simple_load_pre(ptr %ptr) {
-; CHECK-LABEL: name: test_simple_load_pre
-; CHECK: [[BASE:%.*]]:_(p0) = COPY $x0
-; CHECK: [[OFFSET:%.*]]:_(s64) = G_CONSTANT i64 42
-; CHECK-NOT: G_PTR_ADD
-; CHECK: {{%.*}}:_(s8), [[NEXT:%.*]]:_(p0) = G_INDEXED_LOAD [[BASE]], [[OFFSET]](s64), 1
-; CHECK: $x0 = COPY [[NEXT]](p0)
 
+  ; CHECK-LABEL: name: test_simple_load_pre
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[INDEXED_LOAD:%[0-9]+]]:_(s8), [[INDEXED_LOAD1:%[0-9]+]]:_(p0) = G_INDEXED_LOAD [[COPY]], [[C]](s64), 1 :: (volatile load (s8) from %ir.next)
+  ; CHECK-NEXT:   $x0 = COPY [[INDEXED_LOAD1]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %next = getelementptr i8, ptr %ptr, i32 42
   load volatile i8, ptr %next
   ret ptr %next
 }
 
 define ptr @test_unused_load_pre(ptr %ptr) {
-; CHECK-LABEL: name: test_unused_load_pre
-; CHECK-NOT: G_INDEXED_LOAD
 
+  ; CHECK-LABEL: name: test_unused_load_pre
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p0) :: (volatile load (s8) from %ir.next)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   $x0 = COPY [[C1]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %next = getelementptr i8, ptr %ptr, i32 42
   load volatile i8, ptr %next
   ret ptr null
 }
 
-define void @test_load_multiple_dominated(ptr %ptr, i1 %tst, i1 %tst2) {
-; CHECK-LABEL: name: test_load_multiple_dominated
-; CHECK: [[BASE:%.*]]:_(p0) = COPY $x0
-; CHECK: [[OFFSET:%.*]]:_(s64) = G_CONSTANT i64 42
-; CHECK-NOT: G_PTR_ADD
-; CHECK: {{%.*}}:_(s8), [[NEXT:%.*]]:_(p0) = G_INDEXED_LOAD [[BASE]], [[OFFSET]](s64), 1
-; CHECK: $x0 = COPY [[NEXT]](p0)
-  %next = getelementptr i8, ptr %ptr, i32 42
-  br i1 %tst, label %do_load, label %end
-
-do_load:
-  load volatile i8, ptr %next
-  br i1 %tst2, label %bb1, label %bb2
-
-bb1:
-  store volatile ptr %next, ptr undef
-  ret void
-
-bb2:
-  call void @bar(ptr %next)
-  ret void
-
-end:
-  ret void
-}
-
 define ptr @test_simple_store_pre(ptr %ptr) {
-; CHECK-LABEL: name: test_simple_store_pre
-; CHECK: [[BASE:%.*]]:_(p0) = COPY $x0
-; CHECK: [[VAL:%.*]]:_(s8) = G_CONSTANT i8 0
-; CHECK: [[OFFSET:%.*]]:_(s64) = G_CONSTANT i64 42
-; CHECK-NOT: G_PTR_ADD
-; CHECK: [[NEXT:%.*]]:_(p0) = G_INDEXED_STORE [[VAL]](s8), [[BASE]], [[OFFSET]](s64), 1
-; CHECK: $x0 = COPY [[NEXT]](p0)
 
+  ; CHECK-LABEL: name: test_simple_store_pre
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+  ; CHECK-NEXT:   [[INDEXED_STORE:%[0-9]+]]:_(p0) = G_INDEXED_STORE [[C1]](s8), [[COPY]], [[C]](s64), 1 :: (volatile store (s8) into %ir.next)
+  ; CHECK-NEXT:   $x0 = COPY [[INDEXED_STORE]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %next = getelementptr i8, ptr %ptr, i32 42
   store volatile i8 0, ptr %next
   ret ptr %next
@@ -66,10 +56,17 @@ define ptr @test_simple_store_pre(ptr %ptr) {
 ; The potentially pre-indexed address is used as the value stored. Converting
 ; would produce the value too late but only by one instruction.
 define ptr @test_store_pre_val_loop(ptr %ptr) {
-; CHECK-LABEL: name: test_store_pre_val_loop
-; CHECK: G_PTR_ADD
-; CHECK: G_STORE %
 
+  ; CHECK-LABEL: name: test_store_pre_val_loop
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 336
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+  ; CHECK-NEXT:   G_STORE [[PTR_ADD]](p0), [[PTR_ADD]](p0) :: (volatile store (p0) into %ir.next)
+  ; CHECK-NEXT:   $x0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %next = getelementptr ptr, ptr %ptr, i32 42
   store volatile ptr %next, ptr %next
   ret ptr %next
@@ -77,11 +74,21 @@ define ptr @test_store_pre_val_loop(ptr %ptr) {
 
 ; Potentially pre-indexed address is used between GEP computing it and load.
 define ptr @test_load_pre_before(ptr %ptr) {
-; CHECK-LABEL: name: test_load_pre_before
-; CHECK: G_PTR_ADD
-; CHECK: BL @bar
-; CHECK: G_LOAD %
 
+  ; CHECK-LABEL: name: test_load_pre_before
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+  ; CHECK-NEXT:   $x0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   BL @bar, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p0) :: (volatile load (s8) from %ir.next)
+  ; CHECK-NEXT:   $x0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %next = getelementptr i8, ptr %ptr, i32 42
   call void @bar(ptr %next)
   load volatile i8, ptr %next
@@ -91,56 +98,51 @@ define ptr @test_load_pre_before(ptr %ptr) {
 ; Materializing the base into a writable register (from sp/fp) would be just as
 ; bad as the original GEP.
 define ptr @test_alloca_load_pre() {
-; CHECK-LABEL: name: test_alloca_load_pre
-; CHECK: G_PTR_ADD
-; CHECK: G_LOAD %
 
+  ; CHECK-LABEL: name: test_alloca_load_pre
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.ptr
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64)
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[PTR_ADD]](p0) :: (volatile load (s8) from %ir.next)
+  ; CHECK-NEXT:   $x0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %ptr = alloca i8, i32 128
   %next = getelementptr i8, ptr %ptr, i32 42
   load volatile i8, ptr %next
   ret ptr %next
 }
 
-; Load does not dominate use of its address. No indexing.
-define ptr @test_pre_nodom(ptr %in, i1 %tst) {
-; CHECK-LABEL: name: test_pre_nodom
-; CHECK: G_PTR_ADD
-; CHECK: G_LOAD %
-
-  %next = getelementptr i8, ptr %in, i32 16
-  br i1 %tst, label %do_indexed, label %use_addr
-
-do_indexed:
-  %val = load i8, ptr %next
-  store i8 %val, ptr @var
-  store ptr %next, ptr @varp8
-  br label %use_addr
-
-use_addr:
-  ret ptr %next
-}
-
 define ptr @test_simple_load_post(ptr %ptr) {
-; CHECK-LABEL: name: test_simple_load_post
-; CHECK: [[BASE:%.*]]:_(p0) = COPY $x0
-; CHECK: [[OFFSET:%.*]]:_(s64) = G_CONSTANT i64 42
-; CHECK-NOT: G_PTR_ADD
-; CHECK: {{%.*}}:_(s8), [[NEXT:%.*]]:_(p0) = G_INDEXED_LOAD [[BASE]], [[OFFSET]](s64), 0
-; CHECK: $x0 = COPY [[NEXT]](p0)
 
+  ; CHECK-LABEL: name: test_simple_load_post
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[INDEXED_LOAD:%[0-9]+]]:_(s8), [[INDEXED_LOAD1:%[0-9]+]]:_(p0) = G_INDEXED_LOAD [[COPY]], [[C]](s64), 0 :: (volatile load (s8) from %ir.ptr)
+  ; CHECK-NEXT:   $x0 = COPY [[INDEXED_LOAD1]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %next = getelementptr i8, ptr %ptr, i32 42
   load volatile i8, ptr %ptr
   ret ptr %next
 }
 
 define ptr @test_simple_load_post_gep_after(ptr %ptr) {
-; CHECK-LABEL: name: test_simple_load_post_gep_after
-; CHECK: [[BASE:%.*]]:_(p0) = COPY $x0
-; CHECK: BL @get_offset
-; CHECK: [[OFFSET:%.*]]:_(s64) = COPY $x0
-; CHECK: {{%.*}}:_(s8), [[ADDR:%.*]]:_(p0) = G_INDEXED_LOAD [[BASE]], [[OFFSET]](s64), 0
-; CHECK: $x0 = COPY [[ADDR]](p0)
 
+  ; CHECK-LABEL: name: test_simple_load_post_gep_after
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   BL @get_offset, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $x0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+  ; CHECK-NEXT:   [[INDEXED_LOAD:%[0-9]+]]:_(s8), [[INDEXED_LOAD1:%[0-9]+]]:_(p0) = G_INDEXED_LOAD [[COPY]], [[COPY1]](s64), 0 :: (volatile load (s8) from %ir.ptr)
+  ; CHECK-NEXT:   $x0 = COPY [[INDEXED_LOAD1]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %offset = call i64 @get_offset()
   load volatile i8, ptr %ptr
   %next = getelementptr i8, ptr %ptr, i64 %offset
@@ -148,9 +150,24 @@ define ptr @test_simple_load_post_gep_after(ptr %ptr) {
 }
 
 define ptr @test_load_post_keep_looking(ptr %ptr) {
-; CHECK: name: test_load_post_keep_looking
-; CHECK: G_INDEXED_LOAD
 
+  ; CHECK-LABEL: name: test_load_post_keep_looking
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   BL @get_offset, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $x0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+  ; CHECK-NEXT:   [[INDEXED_LOAD:%[0-9]+]]:_(s8), [[INDEXED_LOAD1:%[0-9]+]]:_(p0) = G_INDEXED_LOAD [[COPY]], [[COPY1]](s64), 0 :: (volatile load (s8) from %ir.ptr)
+  ; CHECK-NEXT:   [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0)
+  ; CHECK-NEXT:   [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[PTRTOINT]](s64)
+  ; CHECK-NEXT:   [[ADRP:%[0-9]+]]:gpr64(p0) = ADRP target-flags(aarch64-page) @var
+  ; CHECK-NEXT:   [[ADD_LOW:%[0-9]+]]:_(p0) = G_ADD_LOW [[ADRP]](p0), target-flags(aarch64-pageoff, aarch64-nc) @var
+  ; CHECK-NEXT:   G_STORE [[TRUNC]](s8), [[ADD_LOW]](p0) :: (store (s8) into @var)
+  ; CHECK-NEXT:   $x0 = COPY [[INDEXED_LOAD1]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %offset = call i64 @get_offset()
   load volatile i8, ptr %ptr
   %intval = ptrtoint ptr %ptr to i8
@@ -162,10 +179,15 @@ define ptr @test_load_post_keep_looking(ptr %ptr) {
 
 ; Base is frame index. Using indexing would need copy anyway.
 define ptr @test_load_post_alloca() {
-; CHECK-LABEL: name: test_load_post_alloca
-; CHECK: G_PTR_ADD
-; CHECK: G_LOAD %
 
+  ; CHECK-LABEL: name: test_load_post_alloca
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.ptr
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX]](p0) :: (volatile load (s8) from %ir.ptr)
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 42
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s64)
+  ; CHECK-NEXT:   $x0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   %ptr = alloca i8, i32 128
   %next = getelementptr i8, ptr %ptr, i32 42
   load volatile i8, ptr %ptr
@@ -174,11 +196,20 @@ define ptr @test_load_post_alloca() {
 
 ; Offset computation does not dominate the load we might be indexing.
 define ptr @test_load_post_gep_offset_after(ptr %ptr) {
-; CHECK-LABEL: name: test_load_post_gep_offset_after
-; CHECK: G_LOAD %
-; CHECK: BL @get_offset
-; CHECK: G_PTR_ADD
 
+  ; CHECK-LABEL: name: test_load_post_gep_offset_after
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p0) = COPY $x0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (volatile load (s8) from %ir.ptr)
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   BL @get_offset, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def $x0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s64) = COPY $x0
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[COPY1]](s64)
+  ; CHECK-NEXT:   $x0 = COPY [[PTR_ADD]](p0)
+  ; CHECK-NEXT:   RET_ReallyLR implicit $x0
   load volatile i8, ptr %ptr
   %offset = call i64 @get_offset()
   %next = getelementptr i8, ptr %ptr, i64 %offset
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir
new file mode 100644
index 000000000000000..e82a0c219068fde
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir
@@ -0,0 +1,89 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel-abort=1 %s -o - | FileCheck %s
+
+---
+name:            post_store_s64
+body: |
+  bb.0:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: post_store_s64
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: %val:_(s64) = COPY $x1
+    ; CHECK-NEXT: %offset:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: %writeback:_(p0) = G_INDEXED_STORE %val(s64), %ptr, %offset(s64), 0 :: (store (s64))
+    ; CHECK-NEXT: $x0 = COPY %writeback(p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %ptr:_(p0) = COPY $x0
+    %val:_(s64) = COPY $x1
+    %offset:_(s64) = G_CONSTANT i64 8
+    %writeback:_(p0) = G_INDEXED_STORE %val, %ptr, %offset, 0 :: (store (s64), align 8)
+    $x0 = COPY %writeback
+    RET_ReallyLR implicit $x0
+...
+---
+name:            post_store_v2s64
+body: |
+  bb.0:
+    liveins: $x0, $q0
+
+    ; CHECK-LABEL: name: post_store_v2s64
+    ; CHECK: liveins: $x0, $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: %val:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: %offset:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: %writeback:_(p0) = G_INDEXED_STORE %val(<2 x s64>), %ptr, %offset(s64), 0 :: (store (<2 x s64>), align 8)
+    ; CHECK-NEXT: $x0 = COPY %writeback(p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %ptr:_(p0) = COPY $x0
+    %val:_(<2 x s64>) = COPY $q0
+    %offset:_(s64) = G_CONSTANT i64 8
+    %writeback:_(p0) = G_INDEXED_STORE %val, %ptr, %offset, 0 :: (store (<2 x s64>), align 8)
+    $x0 = COPY %writeback
+    RET_ReallyLR implicit $x0
+...
+---
+name:            post_load_s64
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: post_load_s64
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: %offset:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: %dst:_(s64), %writeback:_(p0) = G_INDEXED_LOAD %ptr, %offset(s64), 0 :: (load (s64))
+    ; CHECK-NEXT: $x0 = COPY %writeback(p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %ptr:_(p0) = COPY $x0
+    %offset:_(s64) = G_CONSTANT i64 8
+    %dst:_(s64), %writeback:_(p0) = G_INDEXED_LOAD %ptr, %offset, 0 :: (load (s64), align 8)
+    $x0 = COPY %writeback
+    RET_ReallyLR implicit $x0
+...
+---
+name:            post_load_v2s64
+body: |
+  bb.0:
+    liveins: $x0
+
+    ; CHECK-LABEL: name: post_load_v2s64
+    ; CHECK: liveins: $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %ptr:_(p0) = COPY $x0
+    ; CHECK-NEXT: %offset:_(s64) = G_CONSTANT i64 8
+    ; CHECK-NEXT: %dst:_(<2 x s64>), %writeback:_(p0) = G_INDEXED_LOAD %ptr, %offset(s64), 0 :: (load (s64))
+    ; CHECK-NEXT: $x0 = COPY %writeback(p0)
+    ; CHECK-NEXT: $q0 = COPY %dst(<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0, implicit $q0
+    %ptr:_(p0) = COPY $x0
+    %offset:_(s64) = G_CONSTANT i64 8
+    %dst:_(<2 x s64>), %writeback:_(p0) = G_INDEXED_LOAD %ptr, %offset, 0 :: (load (s64), align 8)
+    $x0 = COPY %writeback
+    $q0 = COPY %dst
+    RET_ReallyLR implicit $x0, implicit $q0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 549f36b2afd066f..d5f7507ec5dd767 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -174,20 +174,21 @@
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_INDEXED_LOAD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_INDEXED_SEXTLOAD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_INDEXED_ZEXTLOAD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_STORE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_INDEXED_STORE (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
-# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_ATOMIC_CMPXCHG_WITH_SUCCESS (opcode {{[0-9]+}}): 3 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e40063def477ccb..42a40a62cc6d374 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -4,22 +4,10 @@
 ; RUN: llc < %s -mtriple=arm64_32-apple-ios -aarch64-redzone | FileCheck %s --check-prefixes=CHECK,CHECK32
 
 define ptr @store64(ptr %ptr, i64 %index, i64 %spacing) {
-; CHECK64-LABEL: store64:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str x2, [x0], #8
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store64:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #8
-; GISEL-NEXT:    str x2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store64:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str x2, [x0], #8
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str x2, [x0], #8
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i64, ptr %ptr, i64 1
   store i64 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
@@ -38,44 +26,20 @@ define ptr @store64idxpos256(ptr %ptr, i64 %index, i64 %spacing) {
 }
 
 define ptr @store64idxneg256(ptr %ptr, i64 %index, i64 %spacing) {
-; CHECK64-LABEL: store64idxneg256:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str x2, [x0], #-256
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store64idxneg256:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    sub x0, x0, #256
-; GISEL-NEXT:    str x2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store64idxneg256:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str x2, [x0], #-256
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store64idxneg256:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str x2, [x0], #-256
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i64, ptr %ptr, i64 -32
   store i64 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
 }
 
 define ptr @store32(ptr %ptr, i32 %index, i32 %spacing) {
-; CHECK64-LABEL: store32:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str w2, [x0], #4
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #4
-; GISEL-NEXT:    str w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store32:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str w2, [x0], #4
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str w2, [x0], #4
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i32, ptr %ptr, i64 1
   store i32 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
@@ -94,44 +58,20 @@ define ptr @store32idxpos256(ptr %ptr, i32 %index, i32 %spacing) {
 }
 
 define ptr @store32idxneg256(ptr %ptr, i32 %index, i32 %spacing) {
-; CHECK64-LABEL: store32idxneg256:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str w2, [x0], #-256
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store32idxneg256:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    sub x0, x0, #256
-; GISEL-NEXT:    str w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store32idxneg256:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str w2, [x0], #-256
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store32idxneg256:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str w2, [x0], #-256
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i32, ptr %ptr, i64 -64
   store i32 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
 }
 
 define ptr @store16(ptr %ptr, i16 %index, i16 %spacing) {
-; CHECK64-LABEL: store16:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    strh w2, [x0], #2
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #2
-; GISEL-NEXT:    strh w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store16:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    strh w2, [x0], #2
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    strh w2, [x0], #2
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i16, ptr %ptr, i64 1
   store i16 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
@@ -150,44 +90,20 @@ define ptr @store16idxpos256(ptr %ptr, i16 %index, i16 %spacing) {
 }
 
 define ptr @store16idxneg256(ptr %ptr, i16 %index, i16 %spacing) {
-; CHECK64-LABEL: store16idxneg256:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    strh w2, [x0], #-256
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store16idxneg256:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    sub x0, x0, #256
-; GISEL-NEXT:    strh w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store16idxneg256:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    strh w2, [x0], #-256
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store16idxneg256:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    strh w2, [x0], #-256
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i16, ptr %ptr, i64 -128
   store i16 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
 }
 
 define ptr @store8(ptr %ptr, i8 %index, i8 %spacing) {
-; CHECK64-LABEL: store8:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    strb w2, [x0], #1
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #1
-; GISEL-NEXT:    strb w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store8:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    strb w2, [x0], #1
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    strb w2, [x0], #1
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i8, ptr %ptr, i64 1
   store i8 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
@@ -206,44 +122,20 @@ define ptr @store8idxpos256(ptr %ptr, i8 %index, i8 %spacing) {
 }
 
 define ptr @store8idxneg256(ptr %ptr, i8 %index, i8 %spacing) {
-; CHECK64-LABEL: store8idxneg256:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    strb w2, [x0], #-256
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: store8idxneg256:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    sub x0, x0, #256
-; GISEL-NEXT:    strb w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: store8idxneg256:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    strb w2, [x0], #-256
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: store8idxneg256:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    strb w2, [x0], #-256
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i8, ptr %ptr, i64 -256
   store i8 %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
 }
 
 define ptr @truncst64to32(ptr %ptr, i32 %index, i64 %spacing) {
-; CHECK64-LABEL: truncst64to32:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str w2, [x0], #4
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: truncst64to32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #4
-; GISEL-NEXT:    str w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: truncst64to32:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str w2, [x0], #4
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: truncst64to32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str w2, [x0], #4
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i32, ptr %ptr, i64 1
   %trunc = trunc i64 %spacing to i32
   store i32 %trunc, ptr %ptr, align 4
@@ -251,22 +143,10 @@ define ptr @truncst64to32(ptr %ptr, i32 %index, i64 %spacing) {
 }
 
 define ptr @truncst64to16(ptr %ptr, i16 %index, i64 %spacing) {
-; CHECK64-LABEL: truncst64to16:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    strh w2, [x0], #2
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: truncst64to16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #2
-; GISEL-NEXT:    strh w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: truncst64to16:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    strh w2, [x0], #2
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: truncst64to16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    strh w2, [x0], #2
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i16, ptr %ptr, i64 1
   %trunc = trunc i64 %spacing to i16
   store i16 %trunc, ptr %ptr, align 4
@@ -274,22 +154,10 @@ define ptr @truncst64to16(ptr %ptr, i16 %index, i64 %spacing) {
 }
 
 define ptr @truncst64to8(ptr %ptr, i8 %index, i64 %spacing) {
-; CHECK64-LABEL: truncst64to8:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    strb w2, [x0], #1
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: truncst64to8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #1
-; GISEL-NEXT:    strb w2, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: truncst64to8:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    strb w2, [x0], #1
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: truncst64to8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    strb w2, [x0], #1
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds i8, ptr %ptr, i64 1
   %trunc = trunc i64 %spacing to i8
   store i8 %trunc, ptr %ptr, align 4
@@ -298,66 +166,30 @@ define ptr @truncst64to8(ptr %ptr, i8 %index, i64 %spacing) {
 
 
 define ptr @storef16(ptr %ptr, half %index, half %spacing) nounwind {
-; CHECK64-LABEL: storef16:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str h1, [x0], #2
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: storef16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #2
-; GISEL-NEXT:    str h1, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: storef16:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str h1, [x0], #2
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: storef16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str h1, [x0], #2
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds half, ptr %ptr, i64 1
   store half %spacing, ptr %ptr, align 2
   ret ptr %incdec.ptr
 }
 
 define ptr @storef32(ptr %ptr, float %index, float %spacing) {
-; CHECK64-LABEL: storef32:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str s1, [x0], #4
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: storef32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #4
-; GISEL-NEXT:    str s1, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: storef32:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str s1, [x0], #4
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: storef32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str s1, [x0], #4
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds float, ptr %ptr, i64 1
   store float %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
 }
 
 define ptr @storef64(ptr %ptr, double %index, double %spacing) {
-; CHECK64-LABEL: storef64:
-; CHECK64:       ; %bb.0:
-; CHECK64-NEXT:    str d1, [x0], #8
-; CHECK64-NEXT:    ret
-;
-; GISEL-LABEL: storef64:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #8
-; GISEL-NEXT:    str d1, [x8]
-; GISEL-NEXT:    ret
-;
-; CHECK32-LABEL: storef64:
-; CHECK32:       ; %bb.0:
-; CHECK32-NEXT:    str d1, [x0], #8
-; CHECK32-NEXT:    ret
+; CHECK-LABEL: storef64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    str d1, [x0], #8
+; CHECK-NEXT:    ret
   %incdec.ptr = getelementptr inbounds double, ptr %ptr, i64 1
   store double %spacing, ptr %ptr, align 4
   ret ptr %incdec.ptr
@@ -1112,8 +944,8 @@ define ptr @postidx_clobber(ptr %addr) nounwind noinline ssp {
 ; GISEL-LABEL: postidx_clobber:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    mov x8, x0
-; GISEL-NEXT:    add x0, x0, #8
-; GISEL-NEXT:    str x8, [x8]
+; GISEL-NEXT:    str x0, [x8], #8
+; GISEL-NEXT:    mov x0, x8
 ; GISEL-NEXT:    ret
 ;
 ; CHECK32-LABEL: postidx_clobber:
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 2cab4932def0724..46563f6a8e089c4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -647,20 +647,12 @@ define <8 x i8> @test_v8i8_pre_load(ptr %addr) {
 }
 
 define <8 x i8> @test_v8i8_post_load(ptr %addr) {
-; SDAG-LABEL: test_v8i8_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr d0, [x0], #40
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v8i8_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v8i8_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr d0, [x0], #40
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <8 x i8>, ptr %addr, i32 5
   %val = load <8 x i8>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -689,20 +681,12 @@ define void @test_v8i8_pre_store(<8 x i8> %in, ptr %addr) {
 }
 
 define void @test_v8i8_post_store(<8 x i8> %in, ptr %addr) {
-; SDAG-LABEL: test_v8i8_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str d0, [x0], #40
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v8i8_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str d0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v8i8_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str d0, [x0], #40
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <8 x i8>, ptr %addr, i32 5
   store <8 x i8> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -731,20 +715,12 @@ define <4 x i16> @test_v4i16_pre_load(ptr %addr) {
 }
 
 define <4 x i16> @test_v4i16_post_load(ptr %addr) {
-; SDAG-LABEL: test_v4i16_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr d0, [x0], #40
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v4i16_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v4i16_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr d0, [x0], #40
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <4 x i16>, ptr %addr, i32 5
   %val = load <4 x i16>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -773,20 +749,12 @@ define void @test_v4i16_pre_store(<4 x i16> %in, ptr %addr) {
 }
 
 define void @test_v4i16_post_store(<4 x i16> %in, ptr %addr) {
-; SDAG-LABEL: test_v4i16_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str d0, [x0], #40
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v4i16_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str d0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v4i16_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str d0, [x0], #40
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <4 x i16>, ptr %addr, i32 5
   store <4 x i16> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -815,20 +783,12 @@ define <2 x i32> @test_v2i32_pre_load(ptr %addr) {
 }
 
 define <2 x i32> @test_v2i32_post_load(ptr %addr) {
-; SDAG-LABEL: test_v2i32_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr d0, [x0], #40
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2i32_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2i32_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr d0, [x0], #40
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x i32>, ptr %addr, i32 5
   %val = load <2 x i32>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -857,20 +817,12 @@ define void @test_v2i32_pre_store(<2 x i32> %in, ptr %addr) {
 }
 
 define void @test_v2i32_post_store(<2 x i32> %in, ptr %addr) {
-; SDAG-LABEL: test_v2i32_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str d0, [x0], #40
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2i32_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str d0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2i32_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str d0, [x0], #40
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x i32>, ptr %addr, i32 5
   store <2 x i32> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -899,20 +851,12 @@ define <2 x float> @test_v2f32_pre_load(ptr %addr) {
 }
 
 define <2 x float> @test_v2f32_post_load(ptr %addr) {
-; SDAG-LABEL: test_v2f32_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr d0, [x0], #40
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2f32_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2f32_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr d0, [x0], #40
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x float>, ptr %addr, i32 5
   %val = load <2 x float>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -941,20 +885,12 @@ define void @test_v2f32_pre_store(<2 x float> %in, ptr %addr) {
 }
 
 define void @test_v2f32_post_store(<2 x float> %in, ptr %addr) {
-; SDAG-LABEL: test_v2f32_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str d0, [x0], #40
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2f32_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str d0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2f32_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str d0, [x0], #40
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x float>, ptr %addr, i32 5
   store <2 x float> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -992,10 +928,10 @@ define <1 x i64> @test_v1i64_post_load(ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v1i64_post_load:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
+; CHECK-GISEL-NEXT:    ldr x8, [x0], #40
+; CHECK-GISEL-NEXT:    adrp x9, _ptr@PAGE
+; CHECK-GISEL-NEXT:    str x0, [x9, _ptr@PAGEOFF]
+; CHECK-GISEL-NEXT:    fmov d0, x8
 ; CHECK-GISEL-NEXT:    ret
   %newaddr = getelementptr <1 x i64>, ptr %addr, i32 5
   %val = load <1 x i64>, ptr %addr, align 8
@@ -1025,20 +961,12 @@ define void @test_v1i64_pre_store(<1 x i64> %in, ptr %addr) {
 }
 
 define void @test_v1i64_post_store(<1 x i64> %in, ptr %addr) {
-; SDAG-LABEL: test_v1i64_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str d0, [x0], #40
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v1i64_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #40
-; CHECK-GISEL-NEXT:    str d0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v1i64_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str d0, [x0], #40
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <1 x i64>, ptr %addr, i32 5
   store <1 x i64> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1067,20 +995,12 @@ define <16 x i8> @test_v16i8_pre_load(ptr %addr) {
 }
 
 define <16 x i8> @test_v16i8_post_load(ptr %addr) {
-; SDAG-LABEL: test_v16i8_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr q0, [x0], #80
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v16i8_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v16i8_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr q0, [x0], #80
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <16 x i8>, ptr %addr, i32 5
   %val = load <16 x i8>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1109,20 +1029,12 @@ define void @test_v16i8_pre_store(<16 x i8> %in, ptr %addr) {
 }
 
 define void @test_v16i8_post_store(<16 x i8> %in, ptr %addr) {
-; SDAG-LABEL: test_v16i8_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str q0, [x0], #80
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v16i8_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str q0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v16i8_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str q0, [x0], #80
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <16 x i8>, ptr %addr, i32 5
   store <16 x i8> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1151,20 +1063,12 @@ define <8 x i16> @test_v8i16_pre_load(ptr %addr) {
 }
 
 define <8 x i16> @test_v8i16_post_load(ptr %addr) {
-; SDAG-LABEL: test_v8i16_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr q0, [x0], #80
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v8i16_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v8i16_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr q0, [x0], #80
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <8 x i16>, ptr %addr, i32 5
   %val = load <8 x i16>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1193,20 +1097,12 @@ define void @test_v8i16_pre_store(<8 x i16> %in, ptr %addr) {
 }
 
 define void @test_v8i16_post_store(<8 x i16> %in, ptr %addr) {
-; SDAG-LABEL: test_v8i16_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str q0, [x0], #80
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v8i16_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str q0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v8i16_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str q0, [x0], #80
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <8 x i16>, ptr %addr, i32 5
   store <8 x i16> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1235,20 +1131,12 @@ define <4 x i32> @test_v4i32_pre_load(ptr %addr) {
 }
 
 define <4 x i32> @test_v4i32_post_load(ptr %addr) {
-; SDAG-LABEL: test_v4i32_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr q0, [x0], #80
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v4i32_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v4i32_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr q0, [x0], #80
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <4 x i32>, ptr %addr, i32 5
   %val = load <4 x i32>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1277,20 +1165,12 @@ define void @test_v4i32_pre_store(<4 x i32> %in, ptr %addr) {
 }
 
 define void @test_v4i32_post_store(<4 x i32> %in, ptr %addr) {
-; SDAG-LABEL: test_v4i32_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str q0, [x0], #80
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v4i32_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str q0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v4i32_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str q0, [x0], #80
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <4 x i32>, ptr %addr, i32 5
   store <4 x i32> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1320,20 +1200,12 @@ define <4 x float> @test_v4f32_pre_load(ptr %addr) {
 }
 
 define <4 x float> @test_v4f32_post_load(ptr %addr) {
-; SDAG-LABEL: test_v4f32_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr q0, [x0], #80
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v4f32_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v4f32_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr q0, [x0], #80
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <4 x float>, ptr %addr, i32 5
   %val = load <4 x float>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1362,20 +1234,12 @@ define void @test_v4f32_pre_store(<4 x float> %in, ptr %addr) {
 }
 
 define void @test_v4f32_post_store(<4 x float> %in, ptr %addr) {
-; SDAG-LABEL: test_v4f32_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str q0, [x0], #80
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v4f32_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str q0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v4f32_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str q0, [x0], #80
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <4 x float>, ptr %addr, i32 5
   store <4 x float> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1405,20 +1269,12 @@ define <2 x i64> @test_v2i64_pre_load(ptr %addr) {
 }
 
 define <2 x i64> @test_v2i64_post_load(ptr %addr) {
-; SDAG-LABEL: test_v2i64_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr q0, [x0], #80
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2i64_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2i64_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr q0, [x0], #80
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x i64>, ptr %addr, i32 5
   %val = load <2 x i64>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1447,20 +1303,12 @@ define void @test_v2i64_pre_store(<2 x i64> %in, ptr %addr) {
 }
 
 define void @test_v2i64_post_store(<2 x i64> %in, ptr %addr) {
-; SDAG-LABEL: test_v2i64_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str q0, [x0], #80
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2i64_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str q0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2i64_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str q0, [x0], #80
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x i64>, ptr %addr, i32 5
   store <2 x i64> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1490,20 +1338,12 @@ define <2 x double> @test_v2f64_pre_load(ptr %addr) {
 }
 
 define <2 x double> @test_v2f64_post_load(ptr %addr) {
-; SDAG-LABEL: test_v2f64_post_load:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    ldr q0, [x0], #80
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2f64_post_load:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr q0, [x0]
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2f64_post_load:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    ldr q0, [x0], #80
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x double>, ptr %addr, i32 5
   %val = load <2 x double>, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1532,20 +1372,12 @@ define void @test_v2f64_pre_store(<2 x double> %in, ptr %addr) {
 }
 
 define void @test_v2f64_post_store(<2 x double> %in, ptr %addr) {
-; SDAG-LABEL: test_v2f64_post_store:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    adrp x8, _ptr@PAGE
-; SDAG-NEXT:    str q0, [x0], #80
-; SDAG-NEXT:    str x0, [x8, _ptr@PAGEOFF]
-; SDAG-NEXT:    ret
-;
-; CHECK-GISEL-LABEL: test_v2f64_post_store:
-; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    adrp x8, _ptr@PAGE
-; CHECK-GISEL-NEXT:    add x9, x0, #80
-; CHECK-GISEL-NEXT:    str q0, [x0]
-; CHECK-GISEL-NEXT:    str x9, [x8, _ptr@PAGEOFF]
-; CHECK-GISEL-NEXT:    ret
+; CHECK-LABEL: test_v2f64_post_store:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    adrp x8, _ptr@PAGE
+; CHECK-NEXT:    str q0, [x0], #80
+; CHECK-NEXT:    str x0, [x8, _ptr@PAGEOFF]
+; CHECK-NEXT:    ret
   %newaddr = getelementptr <2 x double>, ptr %addr, i32 5
   store <2 x double> %in, ptr %addr, align 8
   store ptr %newaddr, ptr @ptr
@@ -1560,9 +1392,8 @@ define ptr @test_v16i8_post_imm_st1_lane(<16 x i8> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v16i8_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #1
-; CHECK-GISEL-NEXT:    st1.b { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov b0, v0[3]
+; CHECK-GISEL-NEXT:    str b0, [x0], #1
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <16 x i8> %in, i32 3
   store i8 %elt, ptr %addr
@@ -1580,9 +1411,8 @@ define ptr @test_v16i8_post_reg_st1_lane(<16 x i8> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v16i8_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #2
-; CHECK-GISEL-NEXT:    st1.b { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov b0, v0[3]
+; CHECK-GISEL-NEXT:    str b0, [x0], #2
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <16 x i8> %in, i32 3
   store i8 %elt, ptr %addr
@@ -1600,9 +1430,8 @@ define ptr @test_v8i16_post_imm_st1_lane(<8 x i16> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i16_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #2
-; CHECK-GISEL-NEXT:    st1.h { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov h0, v0[3]
+; CHECK-GISEL-NEXT:    str h0, [x0], #2
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <8 x i16> %in, i32 3
   store i16 %elt, ptr %addr
@@ -1620,9 +1449,8 @@ define ptr @test_v8i16_post_reg_st1_lane(<8 x i16> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i16_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #4
-; CHECK-GISEL-NEXT:    st1.h { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov h0, v0[3]
+; CHECK-GISEL-NEXT:    str h0, [x0], #4
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <8 x i16> %in, i32 3
   store i16 %elt, ptr %addr
@@ -1639,9 +1467,8 @@ define ptr @test_v4i32_post_imm_st1_lane(<4 x i32> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4i32_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #4
-; CHECK-GISEL-NEXT:    st1.s { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[3]
+; CHECK-GISEL-NEXT:    str s0, [x0], #4
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <4 x i32> %in, i32 3
   store i32 %elt, ptr %addr
@@ -1659,9 +1486,8 @@ define ptr @test_v4i32_post_reg_st1_lane(<4 x i32> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4i32_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #8
-; CHECK-GISEL-NEXT:    st1.s { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[3]
+; CHECK-GISEL-NEXT:    str s0, [x0], #8
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <4 x i32> %in, i32 3
   store i32 %elt, ptr %addr
@@ -1678,9 +1504,8 @@ define ptr @test_v4f32_post_imm_st1_lane(<4 x float> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4f32_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #4
-; CHECK-GISEL-NEXT:    st1.s { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[3]
+; CHECK-GISEL-NEXT:    str s0, [x0], #4
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <4 x float> %in, i32 3
   store float %elt, ptr %addr
@@ -1698,9 +1523,8 @@ define ptr @test_v4f32_post_reg_st1_lane(<4 x float> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4f32_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #8
-; CHECK-GISEL-NEXT:    st1.s { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[3]
+; CHECK-GISEL-NEXT:    str s0, [x0], #8
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <4 x float> %in, i32 3
   store float %elt, ptr %addr
@@ -1717,9 +1541,8 @@ define ptr @test_v2i64_post_imm_st1_lane(<2 x i64> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2i64_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #8
-; CHECK-GISEL-NEXT:    st1.d { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov d0, v0[1]
+; CHECK-GISEL-NEXT:    str d0, [x0], #8
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x i64> %in, i64 1
   store i64 %elt, ptr %addr
@@ -1737,9 +1560,8 @@ define ptr @test_v2i64_post_reg_st1_lane(<2 x i64> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2i64_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #16
-; CHECK-GISEL-NEXT:    st1.d { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov d0, v0[1]
+; CHECK-GISEL-NEXT:    str d0, [x0], #16
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x i64> %in, i64 1
   store i64 %elt, ptr %addr
@@ -1756,9 +1578,8 @@ define ptr @test_v2f64_post_imm_st1_lane(<2 x double> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2f64_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #8
-; CHECK-GISEL-NEXT:    st1.d { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov d0, v0[1]
+; CHECK-GISEL-NEXT:    str d0, [x0], #8
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x double> %in, i32 1
   store double %elt, ptr %addr
@@ -1776,9 +1597,8 @@ define ptr @test_v2f64_post_reg_st1_lane(<2 x double> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2f64_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #16
-; CHECK-GISEL-NEXT:    st1.d { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov d0, v0[1]
+; CHECK-GISEL-NEXT:    str d0, [x0], #16
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x double> %in, i32 1
   store double %elt, ptr %addr
@@ -1796,10 +1616,9 @@ define ptr @test_v8i8_post_imm_st1_lane(<8 x i8> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #1
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.b { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov b0, v0[3]
+; CHECK-GISEL-NEXT:    str b0, [x0], #1
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <8 x i8> %in, i32 3
   store i8 %elt, ptr %addr
@@ -1818,10 +1637,9 @@ define ptr @test_v8i8_post_reg_st1_lane(<8 x i8> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #2
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.b { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov b0, v0[3]
+; CHECK-GISEL-NEXT:    str b0, [x0], #2
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <8 x i8> %in, i32 3
   store i8 %elt, ptr %addr
@@ -1839,10 +1657,9 @@ define ptr @test_v4i16_post_imm_st1_lane(<4 x i16> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #2
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.h { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov h0, v0[3]
+; CHECK-GISEL-NEXT:    str h0, [x0], #2
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <4 x i16> %in, i32 3
   store i16 %elt, ptr %addr
@@ -1861,10 +1678,9 @@ define ptr @test_v4i16_post_reg_st1_lane(<4 x i16> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #4
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.h { v0 }[3], [x8]
+; CHECK-GISEL-NEXT:    mov h0, v0[3]
+; CHECK-GISEL-NEXT:    str h0, [x0], #4
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <4 x i16> %in, i32 3
   store i16 %elt, ptr %addr
@@ -1882,10 +1698,9 @@ define ptr @test_v2i32_post_imm_st1_lane(<2 x i32> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #4
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.s { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[1]
+; CHECK-GISEL-NEXT:    str s0, [x0], #4
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x i32> %in, i32 1
   store i32 %elt, ptr %addr
@@ -1904,10 +1719,9 @@ define ptr @test_v2i32_post_reg_st1_lane(<2 x i32> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #8
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.s { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[1]
+; CHECK-GISEL-NEXT:    str s0, [x0], #8
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x i32> %in, i32 1
   store i32 %elt, ptr %addr
@@ -1925,10 +1739,9 @@ define ptr @test_v2f32_post_imm_st1_lane(<2 x float> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_imm_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #4
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.s { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[1]
+; CHECK-GISEL-NEXT:    str s0, [x0], #4
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x float> %in, i32 1
   store float %elt, ptr %addr
@@ -1947,10 +1760,9 @@ define ptr @test_v2f32_post_reg_st1_lane(<2 x float> %in, ptr %addr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_reg_st1_lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    mov x8, x0
-; CHECK-GISEL-NEXT:    add x0, x0, #8
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    st1.s { v0 }[1], [x8]
+; CHECK-GISEL-NEXT:    mov s0, v0[1]
+; CHECK-GISEL-NEXT:    str s0, [x0], #8
 ; CHECK-GISEL-NEXT:    ret
   %elt = extractelement <2 x float> %in, i32 1
   store float %elt, ptr %addr
@@ -13791,9 +13603,9 @@ define <16 x i8> @test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v16i8_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.16b { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #1
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldrb w8, [x0], #1
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.16b v0, w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
@@ -13861,9 +13673,9 @@ define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.8b { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #1
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldrb w8, [x0], #1
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.8b v0, w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
@@ -13915,9 +13727,9 @@ define <8 x i16> @test_v8i16_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i16_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.8h { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #2
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldrh w8, [x0], #2
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.8h v0, w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i16, ptr %bar
   %tmp2 = insertelement <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
@@ -13970,9 +13782,9 @@ define <4 x i16> @test_v4i16_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.4h { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #2
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldrh w8, [x0], #2
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.4h v0, w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i16, ptr %bar
   %tmp2 = insertelement <4 x i16> <i16 undef, i16 undef, i16 undef, i16 undef>, i16 %tmp1, i32 0
@@ -14017,9 +13829,9 @@ define <4 x i32> @test_v4i32_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4i32_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.4s { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr w8, [x0], #4
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.4s v0, w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i32, ptr %bar
   %tmp2 = insertelement <4 x i32> <i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp1, i32 0
@@ -14064,9 +13876,9 @@ define <2 x i32> @test_v2i32_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.2s { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr w8, [x0], #4
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.2s v0, w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i32, ptr %bar
   %tmp2 = insertelement <2 x i32> <i32 undef, i32 undef>, i32 %tmp1, i32 0
@@ -14107,9 +13919,9 @@ define <2 x i64> @test_v2i64_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2i64_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.2d { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #8
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr x8, [x0], #8
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.2d v0, x8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i64, ptr %bar
   %tmp2 = insertelement <2 x i64> <i64 undef, i64 undef>, i64 %tmp1, i32 0
@@ -14150,9 +13962,9 @@ define <4 x float> @test_v4f32_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v4f32_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.4s { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr s0, [x0], #4
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.4s v0, v0[0]
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load float, ptr %bar
   %tmp2 = insertelement <4 x float> <float undef, float undef, float undef, float undef>, float %tmp1, i32 0
@@ -14197,9 +14009,9 @@ define <2 x float> @test_v2f32_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.2s { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr s0, [x0], #4
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.2s v0, v0[0]
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load float, ptr %bar
   %tmp2 = insertelement <2 x float> <float undef, float undef>, float %tmp1, i32 0
@@ -14240,9 +14052,9 @@ define <2 x double> @test_v2f64_post_imm_ld1r(ptr %bar, ptr %ptr) {
 ;
 ; CHECK-GISEL-LABEL: test_v2f64_post_imm_ld1r:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ld1r.2d { v0 }, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #8
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr d0, [x0], #8
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    dup.2d v0, v0[0]
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load double, ptr %bar
   %tmp2 = insertelement <2 x double> <double undef, double undef>, double %tmp1, i32 0
@@ -14283,10 +14095,9 @@ define <16 x i8> @test_v16i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <16 x i8> %A)
 ;
 ; CHECK-GISEL-LABEL: test_v16i8_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr b1, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #1
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.b v0[1], v1[0]
+; CHECK-GISEL-NEXT:    ldrb w8, [x0], #1
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.b v0[1], w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
   %tmp2 = insertelement <16 x i8> %A, i8 %tmp1, i32 1
@@ -14327,11 +14138,10 @@ define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) {
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr b1, [x0]
+; CHECK-GISEL-NEXT:    ldrb w8, [x0], #1
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    add x8, x0, #1
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.b v0[1], v1[0]
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.b v0[1], w8
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i8, ptr %bar
@@ -14375,10 +14185,9 @@ define <8 x i16> @test_v8i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i16> %A)
 ;
 ; CHECK-GISEL-LABEL: test_v8i16_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr h1, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #2
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.h v0[1], v1[0]
+; CHECK-GISEL-NEXT:    ldrh w8, [x0], #2
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.h v0[1], w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i16, ptr %bar
   %tmp2 = insertelement <8 x i16> %A, i16 %tmp1, i32 1
@@ -14420,11 +14229,10 @@ define <4 x i16> @test_v4i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i16> %A)
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr h1, [x0]
+; CHECK-GISEL-NEXT:    ldrh w8, [x0], #2
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    add x8, x0, #2
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.h v0[1], v1[0]
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.h v0[1], w8
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i16, ptr %bar
@@ -14469,10 +14277,9 @@ define <4 x i32> @test_v4i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i32> %A)
 ;
 ; CHECK-GISEL-LABEL: test_v4i32_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr s1, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.s v0[1], v1[0]
+; CHECK-GISEL-NEXT:    ldr w8, [x0], #4
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.s v0[1], w8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i32, ptr %bar
   %tmp2 = insertelement <4 x i32> %A, i32 %tmp1, i32 1
@@ -14514,11 +14321,10 @@ define <2 x i32> @test_v2i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i32> %A)
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr s1, [x0]
+; CHECK-GISEL-NEXT:    ldr w8, [x0], #4
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.s v0[1], v1[0]
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.s v0[1], w8
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i32, ptr %bar
@@ -14563,10 +14369,9 @@ define <2 x i64> @test_v2i64_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i64> %A)
 ;
 ; CHECK-GISEL-LABEL: test_v2i64_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d1, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #8
-; CHECK-GISEL-NEXT:    str x8, [x1]
-; CHECK-GISEL-NEXT:    mov.d v0[1], v1[0]
+; CHECK-GISEL-NEXT:    ldr x8, [x0], #8
+; CHECK-GISEL-NEXT:    str x0, [x1]
+; CHECK-GISEL-NEXT:    mov.d v0[1], x8
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load i64, ptr %bar
   %tmp2 = insertelement <2 x i64> %A, i64 %tmp1, i32 1
@@ -14606,9 +14411,8 @@ define <4 x float> @test_v4f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x float>
 ;
 ; CHECK-GISEL-LABEL: test_v4f32_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr s1, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr s1, [x0], #4
+; CHECK-GISEL-NEXT:    str x0, [x1]
 ; CHECK-GISEL-NEXT:    mov.s v0[1], v1[0]
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load float, ptr %bar
@@ -14651,10 +14455,9 @@ define <2 x float> @test_v2f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x float>
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr s1, [x0]
+; CHECK-GISEL-NEXT:    ldr s1, [x0], #4
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 def $q0
-; CHECK-GISEL-NEXT:    add x8, x0, #4
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    str x0, [x1]
 ; CHECK-GISEL-NEXT:    mov.s v0[1], v1[0]
 ; CHECK-GISEL-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-GISEL-NEXT:    ret
@@ -14700,9 +14503,8 @@ define <2 x double> @test_v2f64_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x double
 ;
 ; CHECK-GISEL-LABEL: test_v2f64_post_imm_ld1lane:
 ; CHECK-GISEL:       ; %bb.0:
-; CHECK-GISEL-NEXT:    ldr d1, [x0]
-; CHECK-GISEL-NEXT:    add x8, x0, #8
-; CHECK-GISEL-NEXT:    str x8, [x1]
+; CHECK-GISEL-NEXT:    ldr d1, [x0], #8
+; CHECK-GISEL-NEXT:    str x0, [x1]
 ; CHECK-GISEL-NEXT:    mov.d v0[1], v1[0]
 ; CHECK-GISEL-NEXT:    ret
   %tmp1 = load double, ptr %bar

>From 109ad5ecfc11f8b5683d9ba8ea4888b62ab90bec Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Wed, 18 Oct 2023 16:24:40 -0700
Subject: [PATCH 2/3] clang-format fixes

---
 llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp  | 5 ++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index a8425db6584f61c..552601a90862c0a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -954,8 +954,7 @@ static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
 
 /// Return true if 'MI' is a load or a store that may be fold it's address
 /// operand into the load / store addressing mode.
-static bool canFoldInAddressingMode(GLoadStore *MI,
-                                    const TargetLowering &TLI,
+static bool canFoldInAddressingMode(GLoadStore *MI, const TargetLowering &TLI,
                                     MachineRegisterInfo &MRI) {
   TargetLowering::AddrMode AM;
   auto *MF = MI->getMF();
@@ -995,7 +994,7 @@ unsigned getIndexedOpc(unsigned LdStOpc) {
 } // namespace
 
 bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
-    // Check for legality.
+  // Check for legality.
   LLT PtrTy = MRI.getType(LdSt.getPointerReg());
   LLT Ty = MRI.getType(LdSt.getReg(0));
   LLT MemTy = LdSt.getMMO().getMemoryType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b20c5823371c226..576d89255b64b3d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23623,7 +23623,7 @@ bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
   // HACK
   if (IsPre)
     return false; // Until we implement.
-  
+
   auto CstOffset = getIConstantVRegVal(Offset, MRI);
   if (!CstOffset || CstOffset->isZero())
     return false;

>From 1b3a6f263d3fdbbc60819290a02f012346f58a47 Mon Sep 17 00:00:00 2001
From: Amara Emerson <amara at apple.com>
Date: Thu, 19 Oct 2023 11:39:11 -0700
Subject: [PATCH 3/3] Address review comments.

---
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 24 +++++++------------
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  6 ++---
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 552601a90862c0a..1cccddfd972221c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -963,8 +963,7 @@ static bool canFoldInAddressingMode(GLoadStore *MI, const TargetLowering &TLI,
     return false;
 
   AM.HasBaseReg = true;
-  auto CstOff = getIConstantVRegVal(Addr->getOffsetReg(), MRI);
-  if (CstOff)
+  if (auto CstOff = getIConstantVRegVal(Addr->getOffsetReg(), MRI))
     AM.BaseOffs = CstOff->getSExtValue(); // [reg +/- imm]
   else
     AM.Scale = 1; // [reg +/- reg]
@@ -976,8 +975,7 @@ static bool canFoldInAddressingMode(GLoadStore *MI, const TargetLowering &TLI,
       MI->getMMO().getAddrSpace());
 }
 
-namespace {
-unsigned getIndexedOpc(unsigned LdStOpc) {
+static unsigned getIndexedOpc(unsigned LdStOpc) {
   switch (LdStOpc) {
   case TargetOpcode::G_LOAD:
     return TargetOpcode::G_INDEXED_LOAD;
@@ -991,7 +989,6 @@ unsigned getIndexedOpc(unsigned LdStOpc) {
     llvm_unreachable("Unexpected opcode");
   }
 }
-} // namespace
 
 bool CombinerHelper::isIndexedLoadStoreLegal(GLoadStore &LdSt) const {
   // Check for legality.
@@ -1024,8 +1021,7 @@ bool CombinerHelper::findPostIndexCandidate(GLoadStore &LdSt, Register &Addr,
   // G_STORE %val(s64), %baseptr(p0)
   // %offset:_(s64) = G_CONSTANT i64 -256
   // %new_addr:_(p0) = G_PTR_ADD %baseptr, %offset(s64)
-  auto &MF = *LdSt.getParent()->getParent();
-  const auto &TLI = *MF.getSubtarget().getTargetLowering();
+  const auto &TLI = getTargetLowering();
 
   Register Ptr = LdSt.getPointerReg();
   // If the store is the only use, don't bother.
@@ -1065,6 +1061,7 @@ bool CombinerHelper::findPostIndexCandidate(GLoadStore &LdSt, Register &Addr,
 
     // Make sure the offset calculation is before the potentially indexed op.
     MachineInstr *OffsetDef = MRI.getVRegDef(Offset);
+    RematOffset = false;
     if (!dominates(*OffsetDef, LdSt)) {
       // If the offset however is just a G_CONSTANT, we can always just
       // rematerialize it where we need it.
@@ -1080,12 +1077,10 @@ bool CombinerHelper::findPostIndexCandidate(GLoadStore &LdSt, Register &Addr,
       // If the user is a later load/store that can be post-indexed, then don't
       // combine this one.
       auto *BasePtrLdSt = dyn_cast<GLoadStore>(&BasePtrUse);
-      if (BasePtrLdSt && BasePtrLdSt != &LdSt) {
-        if (dominates(LdSt, *BasePtrLdSt)) {
-          if (isIndexedLoadStoreLegal(*BasePtrLdSt))
-            return false;
-        }
-      }
+      if (BasePtrLdSt && BasePtrLdSt != &LdSt &&
+          dominates(LdSt, *BasePtrLdSt) &&
+          isIndexedLoadStoreLegal(*BasePtrLdSt))
+        return false;
 
       // Now we're looking for the key G_PTR_ADD instruction, which contains
       // the offset add that we want to fold.
@@ -1097,10 +1092,9 @@ bool CombinerHelper::findPostIndexCandidate(GLoadStore &LdSt, Register &Addr,
           if (BaseUseUse.getParent() != LdSt.getParent())
             return false;
 
-          if (auto *UseUseLdSt = dyn_cast<GLoadStore>(&BaseUseUse)) {
+          if (auto *UseUseLdSt = dyn_cast<GLoadStore>(&BaseUseUse))
             if (canFoldInAddressingMode(UseUseLdSt, TLI, MRI))
               return false;
-          }
         }
         if (!dominates(LdSt, BasePtrUse))
           return false; // All use must be dominated by the load/store.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 5e96bc67d8ded13..b74c4021d3e4efc 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -607,10 +607,8 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI,
 
 bool AArch64RegisterBankInfo::isLoadFromFPType(const MachineInstr &MI) const {
   // GMemOperation because we also want to match indexed loads.
-  auto *Load = dyn_cast<GMemOperation>(&MI);
-
-  const auto &MMO = Load->getMMO();
-  const Value *LdVal = MMO.getValue();
+  auto *MemOp = cast<GMemOperation>(&MI);
+  const Value *LdVal = MemOp->getMMO().getValue();
   if (!LdVal)
     return false;
 



More information about the llvm-commits mailing list