[llvm] [AMDGPU] Optimize the register uses if offset inlinable (PR #101676)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 2 11:32:35 PDT 2024
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/101676
>From ddf67cf7c2626478c16b85f7f7055592a63f0aa3 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Fri, 2 Aug 2024 19:20:04 +0530
Subject: [PATCH 1/2] [AMDGPU] Fold the frame index offset into v_mad if inlinable
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 45 ++++++++++++-------
.../eliminate-frame-index-s-mov-b32.mir | 3 +-
.../AMDGPU/materialize-frame-index-sgpr.ll | 6 +--
3 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ba49f4a309ebb..14428bfb1ac45 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2568,26 +2568,39 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
} else
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
- TmpResultReg)
- .addImm(Offset);
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
- // We start with a frame pointer with a wave space value, and an
- // offset in lane-space. We are materializing a lane space
- // value. We can either do a right shift of the frame pointer to
- // get to lane space, or a left shift of the offset to get to
- // wavespace. We can right shift after the computation to get
- // back to the desired per-lane value.
- // We are using the mad_u32_u24 primarily as an add with no
- // carry out clobber.
+
+ // We start with a frame pointer with a wave space value, and
+ // an offset in lane-space. We are materializing a lane space
+ // value. We can either do a right shift of the frame pointer
+ // to get to lane space, or a left shift of the offset to get
+ // to wavespace. We can right shift after the computation to
+ // get back to the desired per-lane value. We are using the
+ // mad_u32_u24 primarily as an add with no carry out clobber.
+ bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
+ Offset, ST.hasInv2PiInlineImm());
+ if (!IsInlinableLiteral)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ TmpResultReg)
+ .addImm(Offset);
+
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
- TmpResultReg)
- .addReg(TmpResultReg, RegState::Kill)
- .addImm(ST.getWavefrontSize())
- .addReg(FrameReg)
- .addImm(0);
+ TmpResultReg);
+
+ if (!IsInlinableLiteral) {
+ Add.addReg(TmpResultReg, RegState::Kill)
+ .addImm(ST.getWavefrontSize())
+ .addReg(FrameReg)
+ .addImm(0);
+ } else {
+ // We fold the offset into mad itself if its inlinable.
+ Add.addImm(Offset)
+ .addImm(ST.getWavefrontSize())
+ .addReg(FrameReg)
+ .addImm(0);
+ }
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
TmpResultReg)
.addImm(ST.getWavefrontSizeLog2())
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 08c2904c601ad..acc84183b3a27 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -706,8 +706,7 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
- ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 64, 64, $sgpr32, 0, implicit $exec
; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 87cfaec208897..6346406fa8941 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -835,12 +835,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: v_writelane_b32 v21, s56, 25
; GFX7-NEXT: v_writelane_b32 v21, s57, 26
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_mov_b32_e32 v22, 16
; GFX7-NEXT: v_writelane_b32 v21, s58, 27
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX7-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 6, v22
; GFX7-NEXT: v_writelane_b32 v21, s59, 28
; GFX7-NEXT: v_readfirstlane_b32 s59, v22
@@ -918,12 +917,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: v_writelane_b32 v21, s56, 25
; GFX8-NEXT: v_writelane_b32 v21, s57, 26
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_mov_b32_e32 v22, 16
; GFX8-NEXT: v_writelane_b32 v21, s58, 27
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX8-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX8-NEXT: v_lshrrev_b32_e32 v22, 6, v22
; GFX8-NEXT: v_writelane_b32 v21, s59, 28
; GFX8-NEXT: v_readfirstlane_b32 s59, v22
>From 0082e0656d35ec54aeed187a30553b480193133c Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Sat, 3 Aug 2024 00:02:18 +0530
Subject: [PATCH 2/2] [WIP] refactored
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 14428bfb1ac45..dd4e0d53202d4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2581,26 +2581,22 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// mad_u32_u24 primarily as an add with no carry out clobber.
bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
Offset, ST.hasInv2PiInlineImm());
- if (!IsInlinableLiteral)
+ if (!IsInlinableLiteral) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
TmpResultReg)
.addImm(Offset);
+ }
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
TmpResultReg);
if (!IsInlinableLiteral) {
- Add.addReg(TmpResultReg, RegState::Kill)
- .addImm(ST.getWavefrontSize())
- .addReg(FrameReg)
- .addImm(0);
+ Add.addReg(TmpResultReg, RegState::Kill);
} else {
// We fold the offset into mad itself if its inlinable.
- Add.addImm(Offset)
- .addImm(ST.getWavefrontSize())
- .addReg(FrameReg)
- .addImm(0);
+ Add.addImm(Offset);
}
+ Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
TmpResultReg)
.addImm(ST.getWavefrontSizeLog2())
More information about the llvm-commits mailing list