[llvm] [AArch64] merge index address with large offset into base address (PR #75343)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 20 03:07:35 PST 2023
https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/75343
From ce8a0dec50c67d70dc7aa3fdf3b4608425d8cc8b Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Fri, 10 Nov 2023 07:29:03 -0500
Subject: [PATCH 1/3] [AArch64] merge index address with large offset into base
address
A motivating case for this transformation: https://gcc.godbolt.org/z/nhYcWq1WE
Fold
mov w8, #56952
movk w8, #15, lsl #16
ldrb w0, [x0, x8]
into
add x0, x0, 1036288
ldrb w0, [x0, 3704]
Only LDRBBroX is supported initially.
Fixes https://github.com/llvm/llvm-project/issues/71917
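For reference, a minimal standalone sketch (an illustration added for clarity,
not code from this patch) of the offset split performed above; the constants
come from the example:

  // offset_split.cpp - how 1039992 splits into an ADD-encodable high part
  // (imm12 shifted left by 12) and a 12-bit low part for the ui-form load.
  #include <cassert>
  #include <cstdio>

  int main() {
    unsigned Offset = 1039992;       // 0xFDE78
    unsigned Mask = (1u << 12) - 1;  // Scale = 1 for LDRBBui
    unsigned Low = Offset & Mask;    // 3704
    unsigned High = Offset - Low;    // 1036288 == 253 << 12
    assert((Offset >> 24) == 0);     // must fit "add ..., #imm12, lsl #12"
    printf("add  x0, x0, #%u, lsl #12\n", High >> 12); // 253
    printf("ldrb w0, [x0, #%u]\n", Low);               // 3704
    return 0;
  }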
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 10 +
llvm/lib/Target/AArch64/AArch64InstrInfo.h | 3 +
.../AArch64/AArch64LoadStoreOptimizer.cpp | 229 ++++++++++++++++++
llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 5 +-
.../AArch64/large-offset-ldr-merge.mir | 47 ++++
5 files changed, 291 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 50cbd3672fbd0d..dbc4885057401c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4089,6 +4089,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
return MI.getOperand(Idx);
}
+const MachineOperand &
+AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case AArch64::LDRBBroX:
+ return MI.getOperand(4);
+ }
+}
+
static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
Register Reg) {
if (MI.getParent() == nullptr)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index e97ff0a9758d69..6ad1ef643a6897 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
/// Returns the immediate offset operator of a load/store.
static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
+ /// Returns the shift amount operand of a load/store.
+ static const MachineOperand &getLdStAmountOp(const MachineInstr &MI);
+
/// Returns whether the instruction is FP or NEON.
static bool isFpOrNEON(const MachineInstr &MI);
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index dc6d5b8950c34a..b435b3ce03e7ee 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+ "Number of const offset of index address folded");
DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
"Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
+// The LdStConstLimit limits how far we search backward for a constant-offset
+// instruction when forming index-address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+ cl::init(10), cl::Hidden);
+
// Enable register renaming to find additional store pairing opportunities.
static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
int UnscaledOffset, unsigned Limit);
+ // Scan the instruction list to find a register assigned with a const
+ // value that can be combined with the current instruction (a load or store)
+ // using base addressing with writeback. Scan forwards.
+ MachineBasicBlock::iterator
+ findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+ unsigned &Offset);
+
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
unsigned BaseReg, int Offset);
+ bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+ unsigned IndexReg, unsigned &Offset);
+
// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
+ MachineBasicBlock::iterator
+ mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update, unsigned Offset,
+ int Scale);
+
// Find and merge zero store instructions.
bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+ // Find and merge a index ldr/st instructions into a base ld/st instruction.
+ bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
}
}
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+ // TODO: Add more index address loads/stores.
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no base address equivalent!");
+ case AArch64::LDRBBroX:
+ return AArch64::LDRBBui;
+ }
+}
+
static unsigned getPostIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
}
}
+// Make sure this is a reg+reg Ld/St
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ return false;
+ // Scaled instructions.
+ // TODO: Add more index address loads/stores.
+ case AArch64::LDRBBroX:
+ Scale = 1;
+ return true;
+ }
+}
+
static bool isRewritableImplicitDef(unsigned Opc) {
switch (Opc) {
default:
@@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator Update,
+ unsigned Offset, int Scale) {
+ assert((Update->getOpcode() == AArch64::MOVKWi) &&
+ "Unexpected const mov instruction to merge!");
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+ MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+ MachineInstr &MemMI = *I;
+ unsigned Mask = (1 << 12) * Scale - 1;
+ unsigned Low = Offset & Mask;
+ unsigned High = Offset - Low;
+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+ MachineInstrBuilder AddMIB, MemMIB;
+
+ // Add IndexReg, BaseReg, High (the BaseReg may be SP)
+ AddMIB =
+ BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addDef(IndexReg)
+ .addUse(BaseReg)
+ .addImm(High >> 12) // shifted value
+ .addImm(12); // shift 12
+ (void)AddMIB;
+ // Ld/St DestReg, IndexReg, Imm12
+ unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+ MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+ .add(getLdStRegOp(MemMI))
+ .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+ .addImm(Low / Scale)
+ .setMemRefs(I->memoperands())
+ .setMIFlags(I->mergeFlagsWith(*Update));
+ (void)MemMIB;
+
+ ++NumConstOffsetFolded;
+ LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+ LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
+ LLVM_DEBUG(PrevI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(Update->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " with instruction:\n ");
+ LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << " ");
+ LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions for the block.
+ I->eraseFromParent();
+ PrevI->eraseFromParent();
+ Update->eraseFromParent();
+
+ return NextI;
+}
+
bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
MachineInstr &MI,
unsigned BaseReg, int Offset) {
@@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
return false;
}
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+ MachineInstr &MI,
+ unsigned IndexReg,
+ unsigned &Offset) {
+ // The update instruction source and destination register must be the
+ // same as the load/store index register.
+ if (MI.getOpcode() == AArch64::MOVKWi &&
+ TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+ // A movz + movk pair holds the large offset of a Ld/St instruction.
+ MachineBasicBlock::iterator B = MI.getParent()->begin();
+ MachineBasicBlock::iterator MBBI = &MI;
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MovzMI = *MBBI;
+ if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+ unsigned Low = MovzMI.getOperand(1).getImm();
+ unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+ Offset = High + Low;
+ // 12-bit optionally shifted immediates are legal for adds.
+ return Offset >> 24 == 0;
+ }
+ }
+ return false;
+}
+
MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+ MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+ MachineBasicBlock::iterator B = I->getParent()->begin();
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineInstr &MemMI = *I;
+ MachineBasicBlock::iterator MBBI = I;
+
+ // If the load is the first instruction in the block, there's obviously
+ // no matching constant-offset instruction to combine with.
+ if (MBBI == B)
+ return E;
+
+ // Make sure the IndexReg is killed and the shift amount is zero.
+ // TODO: Relax this restriction to handle extends; kept simple for now.
+ if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+ !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+ (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+ return E;
+
+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+ // Track which register units have been modified and used between the first
+ // insn (inclusive) and the second insn.
+ ModifiedRegUnits.clear();
+ UsedRegUnits.clear();
+ unsigned Count = 0;
+ do {
+ MBBI = prev_nodbg(MBBI, B);
+ MachineInstr &MI = *MBBI;
+
+ // Don't count transient instructions towards the search limit since there
+ // may be different numbers of them if e.g. debug information is present.
+ if (!MI.isTransient())
+ ++Count;
+
+ // If we found a match, return it.
+ if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+ return MBBI;
+ }
+
+ // Update the status of what the instruction clobbered and used.
+ LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+ // Otherwise, if the index register is used or modified, we have no match,
+ // so return early.
+ if (!ModifiedRegUnits.available(IndexReg) ||
+ !UsedRegUnits.available(IndexReg))
+ return E;
+
+ } while (MBBI != B && Count < Limit);
+ return E;
+}
+
bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
return false;
}
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+ int Scale) {
+ MachineInstr &MI = *MBBI;
+ MachineBasicBlock::iterator E = MI.getParent()->end();
+ MachineBasicBlock::iterator Update;
+
+ // Don't know how to handle unscaled pre/post-index versions below, so bail.
+ if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+ return false;
+
+ // Look back to try to find a const offset for an index LdSt instruction. For
+ // example,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // merged into:
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ unsigned Offset;
+ Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
+ if (Update != E && (Offset & (Scale - 1)) == 0) {
+ // Merge the imm12 into the ld/st.
+ MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
@@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
}
+ // 5) Find a register assigned with a const value that can be combined
+ //    with the load or store. e.g.,
+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
+ // ldr x1, [x0, x8]
+ // ; becomes
+ // add x8, x0, a * (1<<12)
+ // ldr x1, [x8, imm12]
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ int Scale;
+ if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+ Modified = true;
+ else
+ ++MBBI;
+ }
+
return Modified;
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 3d4749a7b8e7df..e457fbdc34e3c4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -213,9 +213,8 @@ define void @t17(i64 %a) {
define i32 @LdOffset_i8(ptr %a) {
; CHECK-LABEL: LdOffset_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #56952 // =0xde78
-; CHECK-NEXT: movk w8, #15, lsl #16
-; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
new file mode 100644
index 00000000000000..15b6700398ea08
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64 -run-pass aarch64-ldst-opt %s -o - | FileCheck %s
+
+
+---
+name: LdOffset
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: LdOffset
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12
+ ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704
+ ; CHECK-NEXT: RET undef $lr, implicit $w0
+ renamable $w8 = MOVZWi 56952, 0
+ renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0
+ RET undef $lr, implicit $w0
+...
+
+# Negative test: the IndexReg is missing the killed flag
+---
+name: LdOffset_missing_killed
+tracksRegLiveness: true
+liveins:
+ - { reg: '$x0', virtual-reg: '' }
+body: |
+ bb.0.entry:
+ liveins: $x0
+
+ ; CHECK-LABEL: name: LdOffset_missing_killed
+ ; CHECK: liveins: $x0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0
+ ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
+ ; CHECK-NEXT: RET undef $lr, implicit $w0
+ renamable $w8 = MOVZWi 56952, 0
+ renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8
+ renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0
+ RET undef $lr, implicit $w0
+...
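To make the matching logic in isMatchingMovConstInsn concrete, here is a small
standalone sketch (my illustration, not part of the patch; the helper name is
made up) of how the constant is rebuilt from a MOVZWi/MOVKWi pair and why only
offsets below 1 << 24 are accepted:

  #include <cstdio>

  // Hypothetical helper mirroring the arithmetic in isMatchingMovConstInsn:
  // Offset = (movk imm16 << shift) + movz imm16. The ADD consumes bits
  // [23:12] and the ui-form load consumes bits [11:0], so bit 24 and above
  // cannot be folded.
  bool fitsAddPlusImm12(unsigned MovzImm, unsigned MovkImm, unsigned MovkShift,
                        unsigned &Offset) {
    Offset = (MovkImm << MovkShift) + MovzImm;
    return (Offset >> 24) == 0;
  }

  int main() {
    unsigned Offset;
    // mov w8, #56952 ; movk w8, #15, lsl #16  ==>  0xFDE78
    if (fitsAddPlusImm12(56952, 15, 16, Offset))
      printf("foldable offset: %u\n", Offset); // prints 1039992
    return 0;
  }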
From b11ed555c31328a824e3b50ae8ed16a8d2482b66 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Mon, 18 Dec 2023 00:51:48 -0500
Subject: [PATCH 2/3] [AArch64] Fold more load.x into load.i with large offset
The list of load.x opcodes is taken from canFoldIntoAddrMode in D152828.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 12 +++++
.../AArch64/AArch64LoadStoreOptimizer.cpp | 46 ++++++++++++++++++-
llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 10 ++--
3 files changed, 61 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index dbc4885057401c..41a0907ae0310d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4094,7 +4094,19 @@ AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode");
+ case AArch64::LDRBroX:
case AArch64::LDRBBroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRHroX:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRWroX:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRDroX:
+ case AArch64::LDRXroX:
+ case AArch64::LDRQroX:
return MI.getOperand(4);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index b435b3ce03e7ee..e19c1e1a9395a9 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -511,8 +511,32 @@ static unsigned getBaseAddressOpcode(unsigned Opc) {
switch (Opc) {
default:
llvm_unreachable("Opcode has no base address equivalent!");
+ case AArch64::LDRBroX:
+ return AArch64::LDRBui;
case AArch64::LDRBBroX:
return AArch64::LDRBBui;
+ case AArch64::LDRSBXroX:
+ return AArch64::LDRSBXui;
+ case AArch64::LDRSBWroX:
+ return AArch64::LDRSBWui;
+ case AArch64::LDRHroX:
+ return AArch64::LDRHui;
+ case AArch64::LDRHHroX:
+ return AArch64::LDRHHui;
+ case AArch64::LDRSHXroX:
+ return AArch64::LDRSHXui;
+ case AArch64::LDRSHWroX:
+ return AArch64::LDRSHWui;
+ case AArch64::LDRWroX:
+ return AArch64::LDRWui;
+ case AArch64::LDRSWroX:
+ return AArch64::LDRSWui;
+ case AArch64::LDRDroX:
+ return AArch64::LDRDui;
+ case AArch64::LDRXroX:
+ return AArch64::LDRXui;
+ case AArch64::LDRQroX:
+ return AArch64::LDRQui;
}
}
@@ -764,10 +788,30 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
default:
return false;
// Scaled instructions.
- // TODO: Add more index address loads/stores.
+ // TODO: Add more index address stores.
+ case AArch64::LDRBroX:
case AArch64::LDRBBroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSBWroX:
Scale = 1;
return true;
+ case AArch64::LDRHroX:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSHWroX:
+ Scale = 2;
+ return true;
+ case AArch64::LDRWroX:
+ case AArch64::LDRSWroX:
+ Scale = 4;
+ return true;
+ case AArch64::LDRDroX:
+ case AArch64::LDRXroX:
+ Scale = 8;
+ return true;
+ case AArch64::LDRQroX:
+ Scale = 16;
+ return true;
}
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index e457fbdc34e3c4..36c8c0d7db03b7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -225,9 +225,8 @@ define i32 @LdOffset_i8(ptr %a) {
define i32 @LdOffset_i16(ptr %a) {
; CHECK-LABEL: LdOffset_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #48368 // =0xbcf0
-; CHECK-NEXT: movk w8, #31, lsl #16
-; CHECK-NEXT: ldrsh w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
+; CHECK-NEXT: ldrsh w0, [x8, #7408]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
%val = load i16, ptr %arrayidx, align 2
@@ -238,9 +237,8 @@ define i32 @LdOffset_i16(ptr %a) {
define i32 @LdOffset_i32(ptr %a) {
; CHECK-LABEL: LdOffset_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #31200 // =0x79e0
-; CHECK-NEXT: movk w8, #63, lsl #16
-; CHECK-NEXT: ldr w0, [x0, x8]
+; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
+; CHECK-NEXT: ldr w0, [x8, #14816]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
%val = load i32, ptr %arrayidx, align 4
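A note on the scaled variants added here (my own explanation, not from the
patch): the ui-form imm12 is scaled by the access size, so tryToMergeIndexLdSt
only folds when the offset is Scale-aligned, per its (Offset & (Scale - 1)) == 0
guard. A standalone sketch using the i16 numbers from the tests:

  #include <cstdio>

  // Hypothetical helper: computes the imm12 field value for a Scale-aligned
  // offset, mirroring the Low / Scale computation in mergeConstOffsetInsn.
  bool foldableScaledOffset(unsigned Offset, int Scale, unsigned &Imm12) {
    if ((Offset & (Scale - 1)) != 0)  // e.g. an odd byte offset on ldrsh
      return false;
    unsigned Low = Offset & ((1u << 12) * Scale - 1);
    Imm12 = Low / Scale;
    return true;
  }

  int main() {
    unsigned Imm12;
    // Byte offset of i16 index 1039992 is 2079984 = (506 << 12) + 7408.
    if (foldableScaledOffset(2079984, /*Scale=*/2, Imm12))
      printf("ldrsh w0, [x8, #%u]\n", Imm12 * 2); // byte offset 7408
    return 0;
  }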
From cec531962aba54c444d1da43178aa5aa3c4048ae Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Wed, 20 Dec 2023 03:33:16 -0500
Subject: [PATCH 3/3] [AArch64] Address comment and add more tests
---
.../AArch64/AArch64LoadStoreOptimizer.cpp | 4 +-
llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 195 +++++++++++++++++-
2 files changed, 195 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index e19c1e1a9395a9..4a95bee30c8bf3 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -180,7 +180,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Scan the instruction list to find a register assigned with a const
// value that can be combined with the current instruction (a load or store)
- // using base addressing with writeback. Scan forwards.
+ // using base addressing with writeback. Scan backwards.
MachineBasicBlock::iterator
findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
unsigned &Offset);
@@ -221,7 +221,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge a base register updates before or after a ld/st instruction.
bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
- // Find and merge a index ldr/st instructions into a base ld/st instruction.
+ // Find and merge an index ldr/st instruction into a base ld/st instruction.
bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 36c8c0d7db03b7..a69d641b6512f8 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -210,11 +210,24 @@ define void @t17(i64 %a) {
ret void
}
-define i32 @LdOffset_i8(ptr %a) {
+; LDRBBroX
+define i8 @LdOffset_i8(ptr %a) {
; CHECK-LABEL: LdOffset_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
; CHECK-NEXT: ldrb w0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ ret i8 %val
+}
+
+; LDRBBroX
+define i32 @LdOffset_i8_zext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_zext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
%val = load i8, ptr %arrayidx, align 1
@@ -222,10 +235,75 @@ define i32 @LdOffset_i8(ptr %a) {
ret i32 %conv
}
-define i32 @LdOffset_i16(ptr %a) {
+; LDRSBWroX
+define i32 @LdOffset_i8_sext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_sext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrsb w0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %val to i32
+ ret i32 %conv
+}
+
+; LDRBBroX
+define i64 @LdOffset_i8_zext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_zext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrb w0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %val to i64
+ ret i64 %conv
+}
+
+; LDRSBXroX
+define i64 @LdOffset_i8_sext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_sext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT: ldrsb x0, [x8, #3704]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+ %val = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %val to i64
+ ret i64 %conv
+}
+
+; LDRHHroX
+define i16 @LdOffset_i16(ptr %a) {
; CHECK-LABEL: LdOffset_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
+; CHECK-NEXT: ldrh w0, [x8, #7408]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ ret i16 %val
+}
+
+; LDRHHroX
+define i32 @LdOffset_i16_zext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_zext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
+; CHECK-NEXT: ldrh w0, [x8, #7408]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = zext i16 %val to i32
+ ret i32 %conv
+}
+
+; LDRSHWroX
+define i32 @LdOffset_i16_sext32(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_sext32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
; CHECK-NEXT: ldrsh w0, [x8, #7408]
; CHECK-NEXT: ret
%arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
@@ -234,6 +312,33 @@ define i32 @LdOffset_i16(ptr %a) {
ret i32 %conv
}
+; LDRHHroX
+define i64 @LdOffset_i16_zext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_zext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
+; CHECK-NEXT: ldrh w0, [x8, #7408]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = zext i16 %val to i64
+ ret i64 %conv
+}
+
+; LDRSHXroX
+define i64 @LdOffset_i16_sext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i16_sext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576
+; CHECK-NEXT: ldrsh x0, [x8, #7408]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+ %val = load i16, ptr %arrayidx, align 2
+ %conv = sext i16 %val to i64
+ ret i64 %conv
+}
+
+; LDRWroX
define i32 @LdOffset_i32(ptr %a) {
; CHECK-LABEL: LdOffset_i32:
; CHECK: // %bb.0:
@@ -245,6 +350,68 @@ define i32 @LdOffset_i32(ptr %a) {
ret i32 %val
}
+; LDRWroX
+define i64 @LdOffset_i32_zext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i32_zext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
+; CHECK-NEXT: ldr w0, [x8, #14816]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
+ %val = load i32, ptr %arrayidx, align 2
+ %conv = zext i32 %val to i64
+ ret i64 %conv
+}
+
+; LDRSWroX
+define i64 @LdOffset_i32_sext64(ptr %a) {
+; CHECK-LABEL: LdOffset_i32_sext64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152
+; CHECK-NEXT: ldrsw x0, [x8, #14816]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
+ %val = load i32, ptr %arrayidx, align 2
+ %conv = sext i32 %val to i64
+ ret i64 %conv
+}
+
+; LDRXroX
+define i64 @LdOffset_i64(ptr %a) {
+; CHECK-LABEL: LdOffset_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304
+; CHECK-NEXT: ldr x0, [x8, #29632]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
+ %val = load i64, ptr %arrayidx, align 4
+ ret i64 %val
+}
+
+; LDRDroX
+define <2 x i32> @LdOffset_v2i32(ptr %a) {
+; CHECK-LABEL: LdOffset_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304
+; CHECK-NEXT: ldr d0, [x8, #29632]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds <2 x i32>, ptr %a, i64 1039992
+ %val = load <2 x i32>, ptr %arrayidx, align 4
+ ret <2 x i32> %val
+}
+
+; LDRQroX
+define <2 x i64> @LdOffset_v2i64(ptr %a) {
+; CHECK-LABEL: LdOffset_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #4048, lsl #12 // =16580608
+; CHECK-NEXT: ldr q0, [x8, #59264]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds <2 x i64>, ptr %a, i64 1039992
+ %val = load <2 x i64>, ptr %arrayidx, align 4
+ ret <2 x i64> %val
+}
+
define i64 @LdOffset_i64_multi_offset(ptr %a) {
; CHECK-LABEL: LdOffset_i64_multi_offset:
; CHECK: // %bb.0:
@@ -292,3 +459,27 @@ define i32 @LdOffset_i16_odd_offset(ptr nocapture noundef readonly %a) {
ret i32 %conv
}
+; Already encoded with a single mov (MOVNWi)
+define i8 @LdOffset_i8_movnwi(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_movnwi:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #16777215 // =0xffffff
+; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 16777215
+ %val = load i8, ptr %arrayidx, align 1
+ ret i8 %val
+}
+
+; Negative test: the offset is too large to be encoded with an add
+define i8 @LdOffset_i8_too_large(ptr %a) {
+; CHECK-LABEL: LdOffset_i8_too_large:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #1 // =0x1
+; CHECK-NEXT: movk w8, #256, lsl #16
+; CHECK-NEXT: ldrb w0, [x0, x8]
+; CHECK-NEXT: ret
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 16777217
+ %val = load i8, ptr %arrayidx, align 1
+ ret i8 %val
+}
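A closing observation on the two negative tests (my summary, not from the
patch): 0xFFFFFF is materialized by a single MOVNWi, so there is no
MOVZWi/MOVKWi pair for the pass to match at all, while 0x1000001 sets bit 24
and is rejected by the (Offset >> 24) == 0 guard:

  #include <cassert>

  int main() {
    unsigned TooLarge = (256u << 16) + 1; // mov w8, #1 ; movk w8, #256, lsl #16
    assert(TooLarge == 16777217 && (TooLarge >> 24) != 0); // not foldable
    return 0;
  }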