[llvm] [AMDGPU] Introduce a DAG combine for folding offsets into addresses (PR #80264)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 1 02:35:01 PST 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/80264
This can reduce code size when there are loads from the same global with
different constant offsets.
Ported from the AArch64 equivalent: https://reviews.llvm.org/D45199
TODO: add more compelling test cases
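To make the motivation concrete, here is a hypothetical source-level example (not taken from the patch or its tests; the names are made up): both loads go through the same global at small constant offsets, so with the combine the compiler can materialize the base address once and fold the per-load difference into the memory instruction's immediate offset field.

  // Hypothetical example, not from this patch: two loads from the same
  // global at different constant offsets. The combine lets them share one
  // materialized base address instead of computing a full address per load.
  extern const int lut[64];

  int sum_two(void) {
    return lut[1] + lut[2];
  }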
From 953ea03d6727a3ebdd3675ea5666362a15cb0714 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 1 Feb 2024 10:03:34 +0000
Subject: [PATCH] [AMDGPU] Introduce a DAG combine for folding offsets into
addresses
This can reduce code size when there are loads from the same global with
different constant offsets.
Ported from the AArch64 equivalent: https://reviews.llvm.org/D45199
TODO: add more compelling test cases
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 74 ++++++++++++++-----
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 8 +-
.../AMDGPU/lower-module-lds-via-hybrid.ll | 6 +-
.../AMDGPU/lower-module-lds-via-table.ll | 41 +++++-----
5 files changed, 81 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3d4adb16a2716..6fb8608703fd6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -910,7 +910,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SIGN_EXTEND_INREG,
ISD::EXTRACT_VECTOR_ELT,
ISD::INSERT_VECTOR_ELT,
- ISD::FCOPYSIGN});
+ ISD::FCOPYSIGN,
+ ISD::GlobalAddress});
if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
setTargetDAGCombine(ISD::FP_ROUND);
@@ -7133,22 +7134,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
- // OSes that use ELF REL relocations (instead of RELA) can only store a
- // 32-bit addend in the instruction, so it is not safe to allow offset folding
- // which can create arbitrary 64-bit addends. (This is only a problem for
- // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
- // the high 32 bits of the addend.)
- //
- // This should be kept in sync with how HasRelocationAddend is initialized in
- // the constructor of ELFAMDGPUAsmBackend.
- if (!Subtarget->isAmdHsaOS())
- return false;
-
- // We can fold offsets for anything that doesn't require a GOT relocation.
- return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
- !shouldEmitGOTReloc(GA->getGlobal());
+ return false;
}
static SDValue
@@ -13303,6 +13289,58 @@ SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
}
+// If all users of the globaladdr are of the form (globaladdr + constant), find
+// the smallest constant, fold it into the globaladdr's offset and rewrite the
+// globaladdr as (globaladdr + constant) - constant.
+SDValue
+SITargetLowering::performGlobalAddressCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ auto *GA = cast<GlobalAddressSDNode>(N);
+
+ // We can fold offsets for anything that doesn't require a GOT relocation.
+ if ((GA->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+ GA->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ GA->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ shouldEmitGOTReloc(GA->getGlobal()))
+ return SDValue();
+
+ uint64_t MinOffset = -1ull;
+ for (SDNode *N : GA->uses()) {
+ if (N->getOpcode() != ISD::ADD)
+ return SDValue();
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
+ if (!C)
+ C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ MinOffset = std::min(MinOffset, C->getZExtValue());
+ }
+ uint64_t Offset = MinOffset + GA->getOffset();
+
+ // Require that the new offset is larger than the existing one. Otherwise, we
+ // can end up oscillating between two possible DAGs, for example,
+ // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
+ if (Offset <= (uint64_t)GA->getOffset())
+ return SDValue();
+
+ // OSes that use ELF REL relocations (instead of RELA) can only store an
+ // unsigned 32-bit addend in the instruction, so it is not safe to allow
+ // offset folding which can create arbitrary 64-bit addends. (This is only a
+ // problem for R_AMDGPU_*32_HI relocations since other relocation types are
+ // unaffected by the high 32 bits of the addend.)
+ //
+ // This should be kept in sync with how HasRelocationAddend is initialized in
+ // the constructor of ELFAMDGPUAsmBackend.
+ if (!Subtarget->isAmdHsaOS() && !isUInt<32>(Offset))
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(GA);
+ SDValue Result = DAG.getGlobalAddress(GA->getGlobal(), DL, MVT::i64, Offset);
+ return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
+ DAG.getConstant(MinOffset, DL, MVT::i64));
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -14489,6 +14527,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performInsertVectorEltCombine(N, DCI);
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
+ case ISD::GlobalAddress:
+ return performGlobalAddressCombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
return Widened;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e436c23af5bca..299c6a9d79cbe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -206,6 +206,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performGlobalAddressCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 66f31bbf7afe0..59c2749067cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -46,8 +46,8 @@ define void @use_extern_normal() #0 {
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_addc_u32 s5, s5, s7
+; CHECK-NEXT: s_add_u32 s4, s6, s4
+; CHECK-NEXT: s_addc_u32 s5, s7, s5
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
@@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 {
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_addc_u32 s5, s5, s7
+; CHECK-NEXT: s_add_u32 s4, s6, s4
+; CHECK-NEXT: s_addc_u32 s5, s7, s5
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index bb7c43f76c8a1..bb3ed67485451 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -84,8 +84,8 @@ define void @f2() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
@@ -206,14 +206,12 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 1
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 4d73436c519bd..ab0e1a39096d8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -49,8 +49,8 @@ define void @f0() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
@@ -87,12 +87,12 @@ define void @f1() {
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8
-; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16
+; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 m0, -1
@@ -128,12 +128,12 @@ define void @f2() {
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12
-; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20
+; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: s_mov_b32 m0, -1
@@ -169,12 +169,12 @@ define void @f3() {
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16
-; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24
+; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 m0, -1
@@ -237,18 +237,15 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
-
call void @f0()
call void @f1()
ret void
@@ -276,14 +273,12 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 2
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
@@ -318,7 +313,6 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 1
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
@@ -331,7 +325,6 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GCN-NEXT: ds_write_b8 v0, v1 offset:2
-
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
call void @f1()
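For readers of the patch, the net arithmetic of the new combine can be summarized outside SelectionDAG terms. The following standalone sketch is not part of the patch and all names in it are invented for illustration; it assumes every use of the global address carries a constant addend and shows what each use is left with once the (globaladdr + MinOffset) - MinOffset rewrite has been cleaned up by the usual add/sub folds.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Hypothetical model, not the patch itself: every use addresses
  // GlobalOffset + Addends[i]. The combine folds the smallest addend into
  // the global's own offset, so each use keeps only Addends[i] - MinOffset,
  // which can later be absorbed into the load's immediate offset field.
  struct FoldedOffsets {
    uint64_t NewGlobalOffset;         // offset carried by the rewritten globaladdr
    std::vector<uint64_t> UseOffsets; // per-use leftover addends
  };

  static FoldedOffsets foldMinOffset(uint64_t GlobalOffset,
                                     const std::vector<uint64_t> &Addends) {
    assert(!Addends.empty() && "assumes at least one use with a constant addend");
    uint64_t MinOffset = *std::min_element(Addends.begin(), Addends.end());
    FoldedOffsets Result;
    Result.NewGlobalOffset = GlobalOffset + MinOffset;
    for (uint64_t Addend : Addends)
      Result.UseOffsets.push_back(Addend - MinOffset);
    return Result;
  }

This mirrors what the lower-module-lds-via-table.ll changes show: the rel32 addend on the table becomes uniform while the s_load_dword instructions pick up immediate offsets 0x1, 0x2 and 0x3.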