[llvm-branch-commits] [llvm] [AMDGPU] Support one immediate folding for global load (PR #178608)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jan 29 06:24:35 PST 2026
https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/178608
>From 0b11a343e00706e2a207c35f3aac2321d73de60b Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Thu, 29 Jan 2026 14:36:01 +0800
Subject: [PATCH 1/3] [AMDGPU] Support one immediate folding for global load
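The address calculation may be performed in i32 and then sign-extended to
the i64 offset. When the i32 value is an add with a constant that is a legal
global offset, fold that constant into the instruction's immediate offset.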
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 30 +++++++++-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 59 +++++++++++++++----
.../CodeGen/AMDGPU/load-saddr-offset-imm.ll | 16 ++---
3 files changed, 80 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1446c84ef733b..1d6a7b4988528 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1981,6 +1981,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
bool NeedIOffset) const {
int64_t ImmOffset = 0;
ScaleOffset = false;
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
// Match the immediate offset first, which canonically is moved as low as
// possible.
@@ -1988,7 +1989,6 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
SDValue LHS, RHS;
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (NeedIOffset &&
TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
@@ -2037,13 +2037,37 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
LHS = Addr.getOperand(0);
if (!LHS->isDivergent()) {
- // add (i64 sgpr), (*_extend (i32 vgpr))
RHS = Addr.getOperand(1);
- ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
+
if (SDValue ExtRHS = matchExtFromI32orI32(
RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
+ // add (i64 sgpr), (*_extend (scale (i32 vgpr)))
SAddr = LHS;
VOffset = ExtRHS;
+ if (NeedIOffset && !ImmOffset &&
+ CurDAG->isBaseWithConstantOffset(ExtRHS)) {
+ // add (i64 sgpr), (*_extend (add (scale (i32 vgpr)), (i32 imm)))
+ int64_t COffset =
+ cast<ConstantSDNode>(ExtRHS.getOperand(1))->getSExtValue();
+ if (TII->isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
+ VOffset = ExtRHS.getOperand(0);
+ ImmOffset =
+ cast<ConstantSDNode>(ExtRHS.getOperand(1))->getSExtValue();
+ }
+ }
+
+ ScaleOffset =
+ SelectScaleOffset(N, VOffset, Subtarget->hasSignedGVSOffset());
+ } else {
+ // add (i64 sgpr), (scale (*_extend (i32 vgpr)))
+ ScaleOffset =
+ SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtRHS = matchExtFromI32orI32(
+ RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
+ SAddr = LHS;
+ VOffset = ExtRHS;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e239e6f56cb44..6fdcca3443588 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5843,24 +5843,59 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
- bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
- Subtarget->hasSignedGVSOffset());
if (Register VOffset = matchExtendFromS32OrS32(
PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
+ if (NeedIOffset && !ImmOffset) {
+ MachineInstr *VOffsetDef = getDefIgnoringCopies(VOffset, *MRI);
+ if (VOffsetDef->getOpcode() == TargetOpcode::G_ADD) {
+ Register RHS = VOffsetDef->getOperand(2).getReg();
+ std::optional<ValueAndVReg> RHSValReg =
+ getIConstantVRegValWithLookThrough(RHS, *MRI);
+ if (RHSValReg &&
+ TII.isLegalFLATOffset(RHSValReg->Value.getSExtValue(),
+ AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
+ VOffset = VOffsetDef->getOperand(1).getReg();
+ ImmOffset = RHSValReg->Value.getSExtValue();
+ }
+ }
+ }
+
+ bool ScaleOffset =
+ selectScaleOffset(Root, VOffset, Subtarget->hasSignedGVSOffset());
if (NeedIOffset)
- return {{[=](MachineInstrBuilder &MIB) { // saddr
- MIB.addReg(SAddr);
- },
- [=](MachineInstrBuilder &MIB) { // voffset
- MIB.addReg(VOffset);
- },
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- },
- [=](MachineInstrBuilder &MIB) { // cpol
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); },
+ [=](MachineInstrBuilder &MIB) {
MIB.addImm(CPolBits |
(ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
+ }}};
+ } else {
+ bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
+ Subtarget->hasSignedGVSOffset());
+ if (Register VOffset = matchExtendFromS32OrS32(
+ PtrBaseOffset, Subtarget->hasSignedGVSOffset()))
+ if (NeedIOffset)
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
+ }}};
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
diff --git a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll
index a1e229d09b777..fd26847d83cf8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll
@@ -10,8 +10,8 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80
-; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
@@ -19,9 +19,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80
-; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 scale_offset
; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX1250-SDAG-NEXT: ; return to shader part epilog
;
@@ -29,8 +27,8 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80
-; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: ; return to shader part epilog
;
@@ -38,9 +36,7 @@ define amdgpu_ps <2 x float> @global_load_saddr_offset_imm(ptr addrspace(1) inre
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80
-; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 scale_offset
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: ; return to shader part epilog
%v = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
>From 1da76c3735d9741b62f7724ead42a8eaa01bd674 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Thu, 29 Jan 2026 22:17:43 +0800
Subject: [PATCH 2/3] [AMDGPU] Add back missing braces
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 23 ++++++++++---------
1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 6fdcca3443588..fe9ac1bed1741 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5881,7 +5881,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
Subtarget->hasSignedGVSOffset());
if (Register VOffset = matchExtendFromS32OrS32(
- PtrBaseOffset, Subtarget->hasSignedGVSOffset()))
+ PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
if (NeedIOffset)
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
@@ -5896,16 +5896,17 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
MIB.addImm(CPolBits |
(ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
- return {{[=](MachineInstrBuilder &MIB) { // saddr
- MIB.addReg(SAddr);
- },
- [=](MachineInstrBuilder &MIB) { // voffset
- MIB.addReg(VOffset);
- },
- [=](MachineInstrBuilder &MIB) { // cpol
- MIB.addImm(CPolBits |
- (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
- }}};
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
+ }}};
+ }
}
}
}
>From 068c3d289ab29c7493c14e8dfd5be2c718cb9c28 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Thu, 29 Jan 2026 22:23:37 +0800
Subject: [PATCH 3/3] [AMDGPU] Address review comment
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1d6a7b4988528..e8c722bd6f500 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2052,8 +2052,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
if (TII->isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
VOffset = ExtRHS.getOperand(0);
- ImmOffset =
- cast<ConstantSDNode>(ExtRHS.getOperand(1))->getSExtValue();
+ ImmOffset = COffset;
}
}
More information about the llvm-branch-commits
mailing list