[llvm] [AArch64] merge index address with large offset into base address (PR #72187)

Mon Dec 4 04:38:56 PST 2023

https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/72187

>From 1dcb12bd4302e876830cf945d50732241b5ff54c Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Fri, 1 Dec 2023 14:32:27 +0800
Subject: [PATCH 1/3] [AArch64] Precommit tests for PR71917

---
 llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 87 +++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 69c558d9d5599..72c979c5a844d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -209,3 +209,90 @@ define void @t17(i64 %a) {
   %3 = load volatile i64, ptr %2, align 8
   ret void
 }
+
+define i32 @LdOffset_i8(ptr %a)  {
+; CHECK-LABEL: LdOffset_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #56952 // =0xde78
+; CHECK-NEXT:    movk w8, #15, lsl #16
+; CHECK-NEXT:    ldrb w0, [x0, x8]
+; CHECK-NEXT:    ret
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
+  %val = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %val to i32
+  ret i32 %conv
+}
+
+define i32 @LdOffset_i16(ptr %a)  {
+; CHECK-LABEL: LdOffset_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #48368 // =0xbcf0
+; CHECK-NEXT:    movk w8, #31, lsl #16
+; CHECK-NEXT:    ldrsh w0, [x0, x8]
+; CHECK-NEXT:    ret
+  %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
+  %val = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %val to i32
+  ret i32 %conv
+}
+
+define i32 @LdOffset_i32(ptr %a)  {
+; CHECK-LABEL: LdOffset_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #31200 // =0x79e0
+; CHECK-NEXT:    movk w8, #63, lsl #16
+; CHECK-NEXT:    ldr w0, [x0, x8]
+; CHECK-NEXT:    ret
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
+  %val = load i32, ptr %arrayidx, align 4
+  ret i32 %val
+}
+
+define i64 @LdOffset_i64_multi_offset(ptr %a) {
+; CHECK-LABEL: LdOffset_i64_multi_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #2031, lsl #12 // =8318976
+; CHECK-NEXT:    add x8, x8, #960
+; CHECK-NEXT:    ldr x9, [x8]
+; CHECK-NEXT:    ldr x8, [x8, #2056]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
+  %val0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %a, i64 1040249
+  %val1 = load i64, ptr %arrayidx1, align 8
+  %add = add nsw i64 %val1, %val0
+  ret i64 %add
+}
+
+define i64 @LdOffset_i64_multi_offset_with_commmon_base(ptr %a) {
+; CHECK-LABEL: LdOffset_i64_multi_offset_with_commmon_base:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #507, lsl #12 // =2076672
+; CHECK-NEXT:    ldr x9, [x8, #26464]
+; CHECK-NEXT:    ldr x8, [x8, #26496]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+  %b = getelementptr inbounds i16, ptr %a, i64 1038336
+  %arrayidx = getelementptr inbounds i64, ptr %b, i64 3308
+  %val0 = load i64, ptr %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds i64, ptr %b, i64 3312
+  %val1 = load i64, ptr %arrayidx1, align 8
+  %add = add nsw i64 %val1, %val0
+  ret i64 %add
+}
+
+; Negative test: the offset is odd
+define i32 @LdOffset_i16_odd_offset(ptr nocapture noundef readonly %a)  {
+; CHECK-LABEL: LdOffset_i16_odd_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #56953 // =0xde79
+; CHECK-NEXT:    movk w8, #15, lsl #16
+; CHECK-NEXT:    ldrsh w0, [x0, x8]
+; CHECK-NEXT:    ret
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039993
+  %val = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %val to i32
+  ret i32 %conv
+}
+

>From 275bbce64f74138d23d45e261c1aa92b5d84357a Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Tue, 28 Nov 2023 07:05:51 -0500
Subject: [PATCH 2/3] [CGP][AArch64] Rebase the common base offset for better
 ISel

When all the large const offsets masked with the same value from bit-12 to bit-23.
Fold
  add     x8, x0, #2031, lsl #12
  add     x8, x8, #960
  ldr     x9, [x8, x8]
  ldr     x8, [x8, #2056]

into
  add     x8, x0, #2031, lsl #12
  ldr     x9, [x8, #960]
  ldr     x8, [x8, #3016]
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  4 +
 llvm/include/llvm/CodeGen/TargetLowering.h    |  8 +-
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 79 ++++++++++++-------
 .../Target/AArch64/AArch64ISelLowering.cpp    | 14 ++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 +
 llvm/test/CodeGen/AArch64/arm64-addrmode.ll   |  5 +-
 .../AArch64/large-offset-gep.ll               | 57 ++++++-------
 7 files changed, 105 insertions(+), 65 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 7a8f36da58cec..6cba7146e8296 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -342,6 +342,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
   }
 
+  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
+    return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
+  }
+
   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                              Type *ScalarValTy) const {
     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9ebcc28c38ae6..db802bddbcc23 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -31,8 +31,8 @@
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/LowLevelTypeUtils.h"
-#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -2720,6 +2720,12 @@ class TargetLoweringBase {
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
+  /// Return the prefered common base offset.
+  virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
+                                                 int64_t MaxOffset) const {
+    return 0;
+  }
+
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 07dc718ee3a38..dd3a0af5d789e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -6099,6 +6099,55 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
     int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
     Value *NewBaseGEP = nullptr;
 
+    auto createNewBase = [&](int64_t BaseOffset, Value *OldBase,
+                             GetElementPtrInst *GEP) {
+      LLVMContext &Ctx = GEP->getContext();
+      Type *PtrIdxTy = DL->getIndexType(GEP->getType());
+      Type *I8PtrTy =
+          PointerType::get(Ctx, GEP->getType()->getPointerAddressSpace());
+      Type *I8Ty = Type::getInt8Ty(Ctx);
+
+      BasicBlock::iterator NewBaseInsertPt;
+      BasicBlock *NewBaseInsertBB;
+      if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
+        // If the base of the struct is an instruction, the new base will be
+        // inserted close to it.
+        NewBaseInsertBB = BaseI->getParent();
+        if (isa<PHINode>(BaseI))
+          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+        else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+          NewBaseInsertBB =
+              SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
+          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+        } else
+          NewBaseInsertPt = std::next(BaseI->getIterator());
+      } else {
+        // If the current base is an argument or global value, the new base
+        // will be inserted to the entry block.
+        NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
+        NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+      }
+      IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+      // Create a new base.
+      Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
+      NewBaseGEP = OldBase;
+      if (NewBaseGEP->getType() != I8PtrTy)
+        NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+      NewBaseGEP =
+          NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+      NewGEPBases.insert(NewBaseGEP);
+      return;
+    };
+
+    // Check whether all the offsets can be encoded with prefered common base.
+    if (int64_t PreferBase = TLI->getPreferredLargeGEPBaseOffset(
+            LargeOffsetGEPs.front().second, LargeOffsetGEPs.back().second)) {
+      BaseOffset = PreferBase;
+      // Create a new base if the offset of the BaseGEP can be decoded with one
+      // instruction.
+      createNewBase(BaseOffset, OldBase, BaseGEP);
+    }
+
     auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
     while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
       GetElementPtrInst *GEP = LargeOffsetGEP->first;
@@ -6131,35 +6180,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
       if (!NewBaseGEP) {
         // Create a new base if we don't have one yet.  Find the insertion
         // pointer for the new base first.
-        BasicBlock::iterator NewBaseInsertPt;
-        BasicBlock *NewBaseInsertBB;
-        if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
-          // If the base of the struct is an instruction, the new base will be
-          // inserted close to it.
-          NewBaseInsertBB = BaseI->getParent();
-          if (isa<PHINode>(BaseI))
-            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
-            NewBaseInsertBB =
-                SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
-            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          } else
-            NewBaseInsertPt = std::next(BaseI->getIterator());
-        } else {
-          // If the current base is an argument or global value, the new base
-          // will be inserted to the entry block.
-          NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
-          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-        }
-        IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
-        // Create a new base.
-        Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
-        NewBaseGEP = OldBase;
-        if (NewBaseGEP->getType() != I8PtrTy)
-          NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
-        NewBaseGEP =
-            NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
-        NewGEPBases.insert(NewBaseGEP);
+        createNewBase(BaseOffset, OldBase, GEP);
       }
 
       IRBuilder<> Builder(GEP);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d42ae4ff93a44..ab2ebe3fe3e17 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15977,6 +15977,20 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                           AM.Scale);
 }
 
+// Check whether the 2 offsets belong to the same imm24 range, and their high
+// 12bits are same, then their high part can be decoded with the offset of add.
+int64_t
+AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
+                                                      int64_t MaxOffset) const {
+  int64_t HighPart = MinOffset & ~0xfffULL;
+  if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
+    // Rebase the value to an integer multiple of imm12.
+    return HighPart;
+  }
+
+  return 0;
+}
+
 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
   // Consider splitting large offset of struct or array.
   return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2a039488f2a9a..6fd33472e1d39 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -679,6 +679,9 @@ class AArch64TargetLowering : public TargetLowering {
                              unsigned AS,
                              Instruction *I = nullptr) const override;
 
+  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
+                                         int64_t MaxOffset) const override;
+
   /// Return true if an FMA operation is faster than a pair of fmul and fadd
   /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
   /// returns true, otherwise fmuladd is expanded to fmul + fadd.
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 72c979c5a844d..3d4749a7b8e7d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -252,9 +252,8 @@ define i64 @LdOffset_i64_multi_offset(ptr %a) {
 ; CHECK-LABEL: LdOffset_i64_multi_offset:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, #2031, lsl #12 // =8318976
-; CHECK-NEXT:    add x8, x8, #960
-; CHECK-NEXT:    ldr x9, [x8]
-; CHECK-NEXT:    ldr x8, [x8, #2056]
+; CHECK-NEXT:    ldr x9, [x8, #960]
+; CHECK-NEXT:    ldr x8, [x8, #3016]
 ; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
index 080b3dd75ee9a..097575ca86bcc 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -6,18 +6,17 @@
 define void @test1(ptr %s, i32 %n) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #40000 // =0x9c40
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    add x9, x9, x10
-; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    add x8, x8, #9, lsl #12 // =36864
+; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.ge .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w8, [x9, #4]
-; CHECK-NEXT:    add w8, w8, #1
-; CHECK-NEXT:    str w8, [x9]
-; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    str w9, [x8, #3140]
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    str w9, [x8, #3136]
+; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.lt .LBB0_1
 ; CHECK-NEXT:  .LBB0_2: // %while_end
 ; CHECK-NEXT:    ret
@@ -47,16 +46,15 @@ define void @test2(ptr %struct, i32 %n) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cbz x0, .LBB1_3
 ; CHECK-NEXT:  // %bb.1: // %while_cond.preheader
-; CHECK-NEXT:    mov w8, #40000 // =0x9c40
 ; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    add x8, x0, x8
+; CHECK-NEXT:    add x8, x0, #9, lsl #12 // =36864
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.ge .LBB1_3
 ; CHECK-NEXT:  .LBB1_2: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    str w9, [x8, #3140]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    str w9, [x8, #3136]
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.lt .LBB1_2
 ; CHECK-NEXT:  .LBB1_3: // %while_end
@@ -89,16 +87,15 @@ define void @test3(ptr %s1, ptr %s2, i1 %cond, i32 %n) {
 ; CHECK-NEXT:    csel x8, x1, x0, ne
 ; CHECK-NEXT:    cbz x8, .LBB2_3
 ; CHECK-NEXT:  // %bb.1: // %while_cond.preheader
-; CHECK-NEXT:    mov w10, #40000 // =0x9c40
 ; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    add x8, x8, #9, lsl #12 // =36864
 ; CHECK-NEXT:    cmp w9, w3
 ; CHECK-NEXT:    b.ge .LBB2_3
 ; CHECK-NEXT:  .LBB2_2: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    str w9, [x8, #3140]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    str w9, [x8, #3136]
 ; CHECK-NEXT:    cmp w9, w3
 ; CHECK-NEXT:    b.lt .LBB2_2
 ; CHECK-NEXT:  .LBB2_3: // %while_end
@@ -141,17 +138,15 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler {
 ; CHECK-NEXT:    .cfi_personality 156, DW.ref.__FrameHandler
 ; CHECK-NEXT:    .cfi_lsda 28, .Lexception0
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_remember_state
 ; CHECK-NEXT:    mov w19, w0
-; CHECK-NEXT:    mov w21, wzr
-; CHECK-NEXT:    mov w20, #40000 // =0x9c40
+; CHECK-NEXT:    mov w20, wzr
 ; CHECK-NEXT:  .LBB3_1: // %while_cond
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:  .Ltmp0:
@@ -159,23 +154,22 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler {
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:  // %bb.2: // %while_cond_x.split
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    add x8, x0, x20
-; CHECK-NEXT:    cmp w21, w19
-; CHECK-NEXT:    str wzr, [x8]
+; CHECK-NEXT:    add x8, x0, #9, lsl #12 // =36864
+; CHECK-NEXT:    cmp w20, w19
+; CHECK-NEXT:    str wzr, [x8, #3136]
 ; CHECK-NEXT:    b.ge .LBB3_4
 ; CHECK-NEXT:  // %bb.3: // %while_body
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    str w21, [x8, #4]
-; CHECK-NEXT:    add w21, w21, #1
-; CHECK-NEXT:    str w21, [x8]
+; CHECK-NEXT:    str w20, [x8, #3140]
+; CHECK-NEXT:    add w20, w20, #1
+; CHECK-NEXT:    str w20, [x8, #3136]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_4: // %while_end
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
 ; CHECK-NEXT:    .cfi_restore w20
-; CHECK-NEXT:    .cfi_restore w21
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB3_5: // %cleanup
@@ -223,14 +217,13 @@ define void @test5(ptr %s, i32 %n) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    mov w9, wzr
 ; CHECK-NEXT:    add x8, x8, #19, lsl #12 // =77824
-; CHECK-NEXT:    add x8, x8, #2176
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.ge .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    str w9, [x8, #2180]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    str w9, [x8, #2176]
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.lt .LBB4_1
 ; CHECK-NEXT:  .LBB4_2: // %while_end

>From 50da4ca9487821d2f3697206ff8276a333d6a7a5 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Mon, 20 Nov 2023 01:13:43 -0500
Subject: [PATCH 3/3] [AArch64] merge index address with large offset into base
 address

A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE
```
Fold
  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]
into
  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]
```
Only support single use base, multi-use scenes are supported by PR74046.
Fix https://github.com/llvm/llvm-project/issues/71917

TODO: support the multiple-uses with reuseing common base offset.
https://gcc.godbolt.org/z/Mr7srTjnz
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 62 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/arm64-addrmode.ll   | 15 ++---
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 7617dccdeee39..bfec3d17e2cd7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AArch64ExpandImm.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -1069,6 +1070,41 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSigned
   return true;
 }
 
+// 16-bit optionally shifted immediates are legal for single mov.
+static bool isLegalSingleMOVImmediate(int64_t Immed) {
+  if (Immed == std::numeric_limits<int64_t>::min()) {
+    LLVM_DEBUG(dbgs() << "Illegal single mov imm " << Immed
+                      << ": avoid UB for INT64_MIN\n");
+    return false;
+  }
+
+  // Calculate how many moves we will need to materialize this constant.
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(Immed, 64, Insn);
+  return Insn.size() == 1;
+}
+
+// Check whether a unsigned vaule is not in the immediate range of mov but in
+// the immediate range of imm24. The "Size" argument is the size in bytes of the
+// memory reference.
+static bool isPreferredBaseAddrMode(const TargetLowering *TLI, int64_t ImmOff,
+                                    unsigned Size) {
+  if ((ImmOff & (Size - 1)) != 0 || ImmOff < 0)
+    return false;
+
+  // If the immediate already can be encoded in mov, then just keep the existing
+  // logic.
+  if (isLegalSingleMOVImmediate(ImmOff))
+    return false;
+
+  // For a imm24, its low imm12 can be fold as the immediate of load or store,
+  // and its high part can be encoded in an add.
+  int64_t HighPart = ImmOff & ~0xfffULL;
+  if (TLI->isLegalAddImmediate(HighPart))
+    return true;
+  return false;
+}
+
 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
 /// immediate" address.  The "Size" argument is the size in bytes of the memory
 /// reference, which determines the scale.
@@ -1110,6 +1146,24 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
         OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
         return true;
       }
+
+      // Perfer [Reg + imm] mode.
+      //     ADD  BaseReg, WideImmediate & 0x0fff000
+      //     LDR  X2, [BaseReg, WideImmediate & 0x0fff]
+      SDValue LHS = N.getOperand(0);
+      if (isPreferredBaseAddrMode(TLI, RHSC, Size)) {
+        int64_t ImmOffUnScale = RHSC;
+        int64_t ImmOffLow = ImmOffUnScale & 0x0fff;
+        int64_t ImmOffHigh = RHSC - ImmOffLow;
+        SDValue ImmHighSDV =
+            CurDAG->getTargetConstant(ImmOffHigh >> 12, dl, MVT::i64);
+        Base = SDValue(CurDAG->getMachineNode(
+                           AArch64::ADDXri, dl, MVT::i64, LHS, ImmHighSDV,
+                           CurDAG->getTargetConstant(12, dl, MVT::i32)),
+                       0);
+        OffImm = CurDAG->getTargetConstant(ImmOffLow >> Scale, dl, MVT::i64);
+        return true;
+      }
     }
   }
 
@@ -1351,6 +1405,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
     return true;
   }
 
+  // Perfer [Reg + imm] mode, so skip this scenarios.
+  if (auto *OffsetC = dyn_cast<ConstantSDNode>(RHS)) {
+    int64_t ImmOff = (int64_t)OffsetC->getZExtValue();
+    const TargetLowering *TLI = getTargetLowering();
+    if (isPreferredBaseAddrMode(TLI, ImmOff, Size)) {
+      return false;
+    }
+  }
   // Match any non-shifted, non-extend, non-immediate add expression.
   Base = LHS;
   Offset = RHS;
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 3d4749a7b8e7d..b0b26c8836d85 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -213,9 +213,8 @@ define void @t17(i64 %a) {
 define i32 @LdOffset_i8(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #56952 // =0xde78
-; CHECK-NEXT:    movk w8, #15, lsl #16
-; CHECK-NEXT:    ldrb w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT:    ldrb w0, [x8, #3704]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
   %val = load i8, ptr %arrayidx, align 1
@@ -226,9 +225,8 @@ define i32 @LdOffset_i8(ptr %a)  {
 define i32 @LdOffset_i16(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #48368 // =0xbcf0
-; CHECK-NEXT:    movk w8, #31, lsl #16
-; CHECK-NEXT:    ldrsh w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #507, lsl #12 // =2076672
+; CHECK-NEXT:    ldrsh w0, [x8, #3312]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
   %val = load i16, ptr %arrayidx, align 2
@@ -239,9 +237,8 @@ define i32 @LdOffset_i16(ptr %a)  {
 define i32 @LdOffset_i32(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31200 // =0x79e0
-; CHECK-NEXT:    movk w8, #63, lsl #16
-; CHECK-NEXT:    ldr w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #1015, lsl #12 // =4157440
+; CHECK-NEXT:    ldr w0, [x8, #2528]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
   %val = load i32, ptr %arrayidx, align 4