[llvm] [WIP] Optimize S_MOV frame index elimination support (PR #101322)

Wed Jul 31 04:23:09 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Pankaj Dwivedi (PankajDwivedi-25)

<details>
<summary>Changes</summary>

Fold the frame index offset into v_mad if inlinable.

---
Full diff: https://github.com/llvm/llvm-project/pull/101322.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+32-16) 
- (modified) llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir (+1-2) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e824e95610a65..9b8753e5268bf 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2568,30 +2568,46 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                 } else
                   Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
               } else {
-                BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
-                        TmpResultReg)
-                    .addImm(Offset);
                 assert(Offset > 0 &&
                        isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
                        "offset is unsafe for v_mad_u32_u24");
-                // We start with a frame pointer with a wave space value, and an
-                // offset in lane-space. We are materializing a lane space
-                // value. We can either do a right shift of the frame pointer to
-                // get to lane space, or a left shift of the offset to get to
-                // wavespace. We can right shift after the computation to get
-                // back to the desired per-lane value.
-                // We are using the mad_u32_u24 primarily as an add with no
-                // carry out clobber.
-                Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                if (AMDGPU::isInlinableLiteral32(Offset,
+                                                 ST.hasInv2PiInlineImm())) {
+                  // We fold the offset into mad itself if its inlinable.
+                  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                          TmpResultReg)
+                      .addImm(ST.getWavefrontSizeLog2())
+                      .addReg(FrameReg);
+                  Add =
+                      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
+                              TmpResultReg)
+                          .addReg(TmpResultReg, RegState::Kill)
+                          .addImm(1)
+                          .addImm(Offset)
+                          .addImm(0);
+                } else {
+                  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+                          TmpResultReg)
+                      .addImm(Offset);
+                  // We start with a frame pointer with a wave space value, and
+                  // an offset in lane-space. We are materializing a lane space
+                  // value. We can either do a right shift of the frame pointer
+                  // to get to lane space, or a left shift of the offset to get
+                  // to wavespace. We can right shift after the computation to
+                  // get back to the desired per-lane value. We are using the
+                  // mad_u32_u24 primarily as an add with no carry out clobber.
+                  Add =
+                      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
                               TmpResultReg)
                           .addReg(TmpResultReg, RegState::Kill)
                           .addImm(ST.getWavefrontSize())
                           .addReg(FrameReg)
                           .addImm(0);
-                BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
-                        TmpResultReg)
-                    .addImm(ST.getWavefrontSizeLog2())
-                    .addReg(FrameReg);
+                  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                          TmpResultReg)
+                      .addImm(ST.getWavefrontSizeLog2())
+                      .addReg(FrameReg);
+                }
               }
 
               Register NewDest = IsCopy ? ResultReg
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 78fb25a76d25e..7931aab7262e1 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -706,9 +706,8 @@ body:   |
     ; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
     ; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
     ; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
-    ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
-    ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
     ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+    ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 1, 64, 0, implicit $exec
     ; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
     ; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
     ; GFX8-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15

``````````

</details>


https://github.com/llvm/llvm-project/pull/101322