[llvm] [WIP] Optimize S_MOV frame index elimination support (PR #101322)
Pankaj Dwivedi via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 31 23:46:17 PDT 2024
https://github.com/PankajDwivedi-25 updated https://github.com/llvm/llvm-project/pull/101322
>From fd618c4fa19d9956378183646d53bfac855fedfa Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Wed, 31 Jul 2024 16:46:35 +0530
Subject: [PATCH 1/3] [WIP] fold offset in V_MAD_U32_U24_e64 if inlinable
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 48 ++++++++++++-------
.../eliminate-frame-index-s-mov-b32.mir | 3 +-
2 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e824e95610a65..9b8753e5268bf 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2568,30 +2568,46 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
} else
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
- TmpResultReg)
- .addImm(Offset);
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
- // We start with a frame pointer with a wave space value, and an
- // offset in lane-space. We are materializing a lane space
- // value. We can either do a right shift of the frame pointer to
- // get to lane space, or a left shift of the offset to get to
- // wavespace. We can right shift after the computation to get
- // back to the desired per-lane value.
- // We are using the mad_u32_u24 primarily as an add with no
- // carry out clobber.
- Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+ if (AMDGPU::isInlinableLiteral32(Offset,
+ ST.hasInv2PiInlineImm())) {
+ // We fold the offset into mad itself if its inlinable.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ Add =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+ TmpResultReg)
+ .addReg(TmpResultReg, RegState::Kill)
+ .addImm(1)
+ .addImm(Offset)
+ .addImm(0);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ TmpResultReg)
+ .addImm(Offset);
+ // We start with a frame pointer with a wave space value, and
+ // an offset in lane-space. We are materializing a lane space
+ // value. We can either do a right shift of the frame pointer
+ // to get to lane space, or a left shift of the offset to get
+ // to wavespace. We can right shift after the computation to
+ // get back to the desired per-lane value. We are using the
+ // mad_u32_u24 primarily as an add with no carry out clobber.
+ Add =
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
TmpResultReg)
.addReg(TmpResultReg, RegState::Kill)
.addImm(ST.getWavefrontSize())
.addReg(FrameReg)
.addImm(0);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
- TmpResultReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ }
}
Register NewDest = IsCopy ? ResultReg
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 78fb25a76d25e..7931aab7262e1 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -706,9 +706,8 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
- ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 1, 64, 0, implicit $exec
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
>From 133753207fd165fab27cc792f6008b488b45ae74 Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Wed, 31 Jul 2024 19:05:35 +0530
Subject: [PATCH 2/3] [WIP] refactored
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 49 +++++++------------
.../eliminate-frame-index-s-mov-b32.mir | 2 +-
2 files changed, 20 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9b8753e5268bf..cce1ead4066ca 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2571,42 +2571,31 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
+
+ // We are using the mad_u32_u24 primarily as an add with no
+ // carry out clobber. We start with a frame pointer with a wave
+ // space value, and an offset in lane-space. We are
+ // materializing a lane space value. We can either do a right
+ // shift of the frame pointer to get to lane space, or a left
+ // shift of the offset to get to wavespace. We can right shift
+ // after the computation to get back to the desired per-lane
+ // value.
+ Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+ TmpResultReg)
+ .addReg(TmpResultReg, RegState::Kill);
+ BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
if (AMDGPU::isInlinableLiteral32(Offset,
ST.hasInv2PiInlineImm())) {
// We fold the offset into mad itself if its inlinable.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
- TmpResultReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
- Add =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
- TmpResultReg)
- .addReg(TmpResultReg, RegState::Kill)
- .addImm(1)
- .addImm(Offset)
- .addImm(0);
+ Add.addImm(1).addImm(Offset).addImm(0);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_MOV_B32_e32),
TmpResultReg)
.addImm(Offset);
- // We start with a frame pointer with a wave space value, and
- // an offset in lane-space. We are materializing a lane space
- // value. We can either do a right shift of the frame pointer
- // to get to lane space, or a left shift of the offset to get
- // to wavespace. We can right shift after the computation to
- // get back to the desired per-lane value. We are using the
- // mad_u32_u24 primarily as an add with no carry out clobber.
- Add =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
- TmpResultReg)
- .addReg(TmpResultReg, RegState::Kill)
- .addImm(ST.getWavefrontSize())
- .addReg(FrameReg)
- .addImm(0);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
- TmpResultReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
+ Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 7931aab7262e1..d2826fddd381c 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -900,9 +900,9 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec
; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
- ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
>From 4d1edad3c4ff2faa6b043ec00d5b80d5dde23e4a Mon Sep 17 00:00:00 2001
From: PankajDwivedi-25 <pankajkumar.divedi at amd.com>
Date: Thu, 1 Aug 2024 12:16:01 +0530
Subject: [PATCH 3/3] [WIP] refactored
---
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 38 +++++++++++--------
.../eliminate-frame-index-s-mov-b32.mir | 2 +-
2 files changed, 23 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index cce1ead4066ca..0c4d797366327 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2571,31 +2571,37 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
-
- // We are using the mad_u32_u24 primarily as an add with no
- // carry out clobber. We start with a frame pointer with a wave
- // space value, and an offset in lane-space. We are
- // materializing a lane space value. We can either do a right
- // shift of the frame pointer to get to lane space, or a left
- // shift of the offset to get to wavespace. We can right shift
- // after the computation to get back to the desired per-lane
- // value.
+ if (AMDGPU::isInlinableLiteral32(Offset,
+ ST.hasInv2PiInlineImm()))
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+
+ // We start with a frame pointer with a wave space value, and
+ // an offset in lane-space. We are materializing a lane space
+ // value. We can either do a right shift of the frame pointer
+ // to get to lane space, or a left shift of the offset to get
+ // to wavespace. We can right shift after the computation to
+ // get back to the desired per-lane value. We are using the
+ // mad_u32_u24 primarily as an add with no carry out clobber.
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
TmpResultReg)
.addReg(TmpResultReg, RegState::Kill);
- BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
- TmpResultReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
+
if (AMDGPU::isInlinableLiteral32(Offset,
- ST.hasInv2PiInlineImm())) {
+ ST.hasInv2PiInlineImm()))
// We fold the offset into mad itself if its inlinable.
Add.addImm(1).addImm(Offset).addImm(0);
- } else {
+ else {
+ Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_MOV_B32_e32),
TmpResultReg)
.addImm(Offset);
- Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index d2826fddd381c..7931aab7262e1 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -900,9 +900,9 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 68, implicit $exec
; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
More information about the llvm-commits mailing list