[llvm] [AArch64] merge index address with large offset into base address (PR #72187)

Mon Dec 4 22:17:58 PST 2023

https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/72187

>From e5296d3648ed08eba3e9b48f0323a177afe97d38 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Mon, 20 Nov 2023 01:13:43 -0500
Subject: [PATCH] [AArch64] merge index address with large offset into base
 address

A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE
```
Fold
  mov     w8, #56952
  movk    w8, #15, lsl #16
  ldrb    w0, [x0, x8]
into
  add     x0, x0, 1036288
  ldrb    w0, [x0, 3704]
```
Only support single use base, multi-use scenes are supported by PR74046.
Fix https://github.com/llvm/llvm-project/issues/71917

TODO: support the multiple-uses with reuseing common base offset.
https://gcc.godbolt.org/z/Mr7srTjnz
---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 62 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/arm64-addrmode.ll   | 15 ++---
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 2f49e9a6b37cc..70ba5784edec2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AArch64ExpandImm.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
@@ -1074,6 +1075,41 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedBitWidth(SDValue N, bool IsSigned
   return true;
 }
 
+// 16-bit optionally shifted immediates are legal for single mov.
+static bool isLegalSingleMOVImmediate(int64_t Immed) {
+  if (Immed == std::numeric_limits<int64_t>::min()) {
+    LLVM_DEBUG(dbgs() << "Illegal single mov imm " << Immed
+                      << ": avoid UB for INT64_MIN\n");
+    return false;
+  }
+
+  // Calculate how many moves we will need to materialize this constant.
+  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+  AArch64_IMM::expandMOVImm(Immed, 64, Insn);
+  return Insn.size() == 1;
+}
+
+// Check whether a unsigned vaule is not in the immediate range of mov but in
+// the immediate range of imm24. The "Size" argument is the size in bytes of the
+// memory reference.
+static bool isPreferredBaseAddrMode(const TargetLowering *TLI, int64_t ImmOff,
+                                    unsigned Size) {
+  if ((ImmOff & (Size - 1)) != 0 || ImmOff < 0)
+    return false;
+
+  // If the immediate already can be encoded in mov, then just keep the existing
+  // logic.
+  if (isLegalSingleMOVImmediate(ImmOff))
+    return false;
+
+  // For a imm24, its low imm12 can be fold as the immediate of load or store,
+  // and its high part can be encoded in an add.
+  int64_t HighPart = ImmOff & ~0xfffULL;
+  if (TLI->isLegalAddImmediate(HighPart))
+    return true;
+  return false;
+}
+
 /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
 /// immediate" address.  The "Size" argument is the size in bytes of the memory
 /// reference, which determines the scale.
@@ -1115,6 +1151,24 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
         OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64);
         return true;
       }
+
+      // Perfer [Reg + imm] mode.
+      //     ADD  BaseReg, WideImmediate & 0x0fff000
+      //     LDR  X2, [BaseReg, WideImmediate & 0x0fff]
+      SDValue LHS = N.getOperand(0);
+      if (isPreferredBaseAddrMode(TLI, RHSC, Size)) {
+        int64_t ImmOffUnScale = RHSC;
+        int64_t ImmOffLow = ImmOffUnScale & 0x0fff;
+        int64_t ImmOffHigh = RHSC - ImmOffLow;
+        SDValue ImmHighSDV =
+            CurDAG->getTargetConstant(ImmOffHigh >> 12, dl, MVT::i64);
+        Base = SDValue(CurDAG->getMachineNode(
+                           AArch64::ADDXri, dl, MVT::i64, LHS, ImmHighSDV,
+                           CurDAG->getTargetConstant(12, dl, MVT::i32)),
+                       0);
+        OffImm = CurDAG->getTargetConstant(ImmOffLow >> Scale, dl, MVT::i64);
+        return true;
+      }
     }
   }
 
@@ -1356,6 +1410,14 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
     return true;
   }
 
+  // Perfer [Reg + imm] mode, so skip this scenarios.
+  if (auto *OffsetC = dyn_cast<ConstantSDNode>(RHS)) {
+    int64_t ImmOff = (int64_t)OffsetC->getZExtValue();
+    const TargetLowering *TLI = getTargetLowering();
+    if (isPreferredBaseAddrMode(TLI, ImmOff, Size)) {
+      return false;
+    }
+  }
   // Match any non-shifted, non-extend, non-immediate add expression.
   Base = LHS;
   Offset = RHS;
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 3d4749a7b8e7d..b0b26c8836d85 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -213,9 +213,8 @@ define void @t17(i64 %a) {
 define i32 @LdOffset_i8(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #56952 // =0xde78
-; CHECK-NEXT:    movk w8, #15, lsl #16
-; CHECK-NEXT:    ldrb w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #253, lsl #12 // =1036288
+; CHECK-NEXT:    ldrb w0, [x8, #3704]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992
   %val = load i8, ptr %arrayidx, align 1
@@ -226,9 +225,8 @@ define i32 @LdOffset_i8(ptr %a)  {
 define i32 @LdOffset_i16(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #48368 // =0xbcf0
-; CHECK-NEXT:    movk w8, #31, lsl #16
-; CHECK-NEXT:    ldrsh w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #507, lsl #12 // =2076672
+; CHECK-NEXT:    ldrsh w0, [x8, #3312]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992
   %val = load i16, ptr %arrayidx, align 2
@@ -239,9 +237,8 @@ define i32 @LdOffset_i16(ptr %a)  {
 define i32 @LdOffset_i32(ptr %a)  {
 ; CHECK-LABEL: LdOffset_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #31200 // =0x79e0
-; CHECK-NEXT:    movk w8, #63, lsl #16
-; CHECK-NEXT:    ldr w0, [x0, x8]
+; CHECK-NEXT:    add x8, x0, #1015, lsl #12 // =4157440
+; CHECK-NEXT:    ldr w0, [x8, #2528]
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992
   %val = load i32, ptr %arrayidx, align 4