[llvm] [AMDGPU] Introduce a DAG combine for folding offsets into addresses (PR #80264)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 1 02:35:01 PST 2024
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/80264
This can reduce code size when there are loads from the same global with
different constant offsets.
Ported from the AArch64 equivalent: https://reviews.llvm.org/D45199
TODO: add more compelling test cases
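To make the motivation concrete, here is a hypothetical source-level example (not taken from the patch or its tests; the names are made up): both loads go through the same global at small constant offsets, so with the combine the compiler can materialize the base address once and fold the per-load difference into the memory instruction's immediate offset field.

  // Hypothetical example, not from this patch: two loads from the same
  // global at different constant offsets. The combine lets them share one
  // materialized base address instead of computing a full address per load.
  extern const int lut[64];

  int sum_two(void) {
    return lut[1] + lut[2];
  }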
From 953ea03d6727a3ebdd3675ea5666362a15cb0714 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 1 Feb 2024 10:03:34 +0000
Subject: [PATCH] [AMDGPU] Introduce a DAG combine for folding offsets into
addresses
This can reduce code size when there are loads from the same global with
different constant offsets.
Ported from the AArch64 equivalent: https://reviews.llvm.org/D45199
TODO: add more compelling test cases
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 74 ++++++++++++++-----
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 8 +-
.../AMDGPU/lower-module-lds-via-hybrid.ll | 6 +-
.../AMDGPU/lower-module-lds-via-table.ll | 41 +++++-----
5 files changed, 81 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3d4adb16a2716..6fb8608703fd6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -910,7 +910,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SIGN_EXTEND_INREG,
ISD::EXTRACT_VECTOR_ELT,
ISD::INSERT_VECTOR_ELT,
- ISD::FCOPYSIGN});
+ ISD::FCOPYSIGN,
+ ISD::GlobalAddress});
if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
setTargetDAGCombine(ISD::FP_ROUND);
@@ -7133,22 +7134,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
- // OSes that use ELF REL relocations (instead of RELA) can only store a
- // 32-bit addend in the instruction, so it is not safe to allow offset folding
- // which can create arbitrary 64-bit addends. (This is only a problem for
- // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
- // the high 32 bits of the addend.)
- //
- // This should be kept in sync with how HasRelocationAddend is initialized in
- // the constructor of ELFAMDGPUAsmBackend.
- if (!Subtarget->isAmdHsaOS())
- return false;
-
- // We can fold offsets for anything that doesn't require a GOT relocation.
- return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
- !shouldEmitGOTReloc(GA->getGlobal());
+ return false;
}
static SDValue
@@ -13303,6 +13289,58 @@ SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
}
+// If all users of the globaladdr are of the form (globaladdr + constant), find
+// the smallest constant, fold it into the globaladdr's offset and rewrite the
+// globaladdr as (globaladdr + constant) - constant.
+SDValue
+SITargetLowering::performGlobalAddressCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ auto *GA = cast<GlobalAddressSDNode>(N);
+
+ // We can fold offsets for anything that doesn't require a GOT relocation.
+ if ((GA->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS &&
+ GA->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ GA->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
+ shouldEmitGOTReloc(GA->getGlobal()))
+ return SDValue();
+
+ uint64_t MinOffset = -1ull;
+ for (SDNode *N : GA->uses()) {
+ if (N->getOpcode() != ISD::ADD)
+ return SDValue();
+ auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
+ if (!C)
+ C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!C)
+ return SDValue();
+ MinOffset = std::min(MinOffset, C->getZExtValue());
+ }
+ uint64_t Offset = MinOffset + GA->getOffset();
+
+ // Require that the new offset is larger than the existing one. Otherwise, we
+ // can end up oscillating between two possible DAGs, for example,
+ // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
+ if (Offset <= (uint64_t)GA->getOffset())
+ return SDValue();
+
+ // OSes that use ELF REL relocations (instead of RELA) can only store an
+ // unsigned 32-bit addend in the instruction, so it is not safe to allow
+ // offset folding which can create arbitrary 64-bit addends. (This is only a
+ // problem for R_AMDGPU_*32_HI relocations since other relocation types are
+ // unaffected by the high 32 bits of the addend.)
+ //
+ // This should be kept in sync with how HasRelocationAddend is initialized in
+ // the constructor of ELFAMDGPUAsmBackend.
+ if (!Subtarget->isAmdHsaOS() && !isUInt<32>(Offset))
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(GA);
+ SDValue Result = DAG.getGlobalAddress(GA->getGlobal(), DL, MVT::i64, Offset);
+ return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
+ DAG.getConstant(MinOffset, DL, MVT::i64));
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -14489,6 +14527,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performInsertVectorEltCombine(N, DCI);
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
+ case ISD::GlobalAddress:
+ return performGlobalAddressCombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
return Widened;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index e436c23af5bca..299c6a9d79cbe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -206,6 +206,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performGlobalAddressCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 66f31bbf7afe0..59c2749067cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -46,8 +46,8 @@ define void @use_extern_normal() #0 {
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_addc_u32 s5, s5, s7
+; CHECK-NEXT: s_add_u32 s4, s6, s4
+; CHECK-NEXT: s_addc_u32 s5, s7, s5
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
@@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 {
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_addc_u32 s5, s5, s7
+; CHECK-NEXT: s_add_u32 s4, s6, s4
+; CHECK-NEXT: s_addc_u32 s5, s7, s5
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index bb7c43f76c8a1..bb3ed67485451 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -84,8 +84,8 @@ define void @f2() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
@@ -206,14 +206,12 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 1
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 4d73436c519bd..ab0e1a39096d8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -49,8 +49,8 @@ define void @f0() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
@@ -87,12 +87,12 @@ define void @f1() {
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8
-; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16
+; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 m0, -1
@@ -128,12 +128,12 @@ define void @f2() {
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12
-; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20
+; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: s_mov_b32 m0, -1
@@ -169,12 +169,12 @@ define void @f3() {
; GCN-NEXT: s_mov_b32 s4, s15
; GCN-NEXT: s_ashr_i32 s5, s15, 31
; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16
-; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24
+; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
-; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
+; GCN-NEXT: s_load_dword s4, s[4:5], 0x3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_mov_b32 m0, -1
@@ -237,18 +237,15 @@ define amdgpu_kernel void @k01() {
; GCN-NEXT: s_addc_u32 s5, s5, f0@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f1@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
-
call void @f0()
call void @f1()
ret void
@@ -276,14 +273,12 @@ define amdgpu_kernel void @k23() {
; GCN-NEXT: s_addc_u32 s5, s5, f2@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 2
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, f3@gotpcrel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, f3@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
@@ -318,7 +313,6 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_addc_u32 s5, s5, f1@gotpcrel32@hi+12
; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GCN-NEXT: s_mov_b32 s15, 1
-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, 0
@@ -331,7 +325,6 @@ define amdgpu_kernel void @k123() {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GCN-NEXT: ds_write_b8 v0, v1 offset:2
-
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: s_endpgm
call void @f1()
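For readers of the patch, the net arithmetic of the new combine can be summarized outside SelectionDAG terms. The following standalone sketch is not part of the patch and all names in it are invented for illustration; it assumes every use of the global address carries a constant addend and shows what each use is left with once the (globaladdr + MinOffset) - MinOffset rewrite has been cleaned up by the usual add/sub folds.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Hypothetical model, not the patch itself: every use addresses
  // GlobalOffset + Addends[i]. The combine folds the smallest addend into
  // the global's own offset, so each use keeps only Addends[i] - MinOffset,
  // which can later be absorbed into the load's immediate offset field.
  struct FoldedOffsets {
    uint64_t NewGlobalOffset;         // offset carried by the rewritten globaladdr
    std::vector<uint64_t> UseOffsets; // per-use leftover addends
  };

  static FoldedOffsets foldMinOffset(uint64_t GlobalOffset,
                                     const std::vector<uint64_t> &Addends) {
    assert(!Addends.empty() && "assumes at least one use with a constant addend");
    uint64_t MinOffset = *std::min_element(Addends.begin(), Addends.end());
    FoldedOffsets Result;
    Result.NewGlobalOffset = GlobalOffset + MinOffset;
    for (uint64_t Addend : Addends)
      Result.UseOffsets.push_back(Addend - MinOffset);
    return Result;
  }

This mirrors what the lower-module-lds-via-table.ll changes show: the rel32 addend on the table becomes uniform while the s_load_dword instructions pick up immediate offsets 0x1, 0x2 and 0x3.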