[llvm] 42bae9c - [AMDGPU] Optimize the register uses if offset inlinable (#101676)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 5 00:19:36 PDT 2024
Author: Pankaj Dwivedi
Date: 2024-08-05T12:49:33+05:30
New Revision: 42bae9c542c2995ed2b4555d9e25c5ea0b5dc99f
URL: https://github.com/llvm/llvm-project/commit/42bae9c542c2995ed2b4555d9e25c5ea0b5dc99f
DIFF: https://github.com/llvm/llvm-project/commit/42bae9c542c2995ed2b4555d9e25c5ea0b5dc99f.diff
LOG: [AMDGPU] Optimize the register uses if offset inlinable (#101676)
Fold the frame index offset into v_mad if inlinable.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ba49f4a309ebb..dd4e0d53202d4 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2568,26 +2568,35 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
} else
Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
} else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
- TmpResultReg)
- .addImm(Offset);
assert(Offset > 0 &&
isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
"offset is unsafe for v_mad_u32_u24");
- // We start with a frame pointer with a wave space value, and an
- // offset in lane-space. We are materializing a lane space
- // value. We can either do a right shift of the frame pointer to
- // get to lane space, or a left shift of the offset to get to
- // wavespace. We can right shift after the computation to get
- // back to the desired per-lane value.
- // We are using the mad_u32_u24 primarily as an add with no
- // carry out clobber.
+
+ // We start with a frame pointer with a wave space value, and
+ // an offset in lane-space. We are materializing a lane space
+ // value. We can either do a right shift of the frame pointer
+ // to get to lane space, or a left shift of the offset to get
+ // to wavespace. We can right shift after the computation to
+ // get back to the desired per-lane value. We are using the
+ // mad_u32_u24 primarily as an add with no carry out clobber.
+ bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
+ Offset, ST.hasInv2PiInlineImm());
+ if (!IsInlinableLiteral) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
+ TmpResultReg)
+ .addImm(Offset);
+ }
+
Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
- TmpResultReg)
- .addReg(TmpResultReg, RegState::Kill)
- .addImm(ST.getWavefrontSize())
- .addReg(FrameReg)
- .addImm(0);
+ TmpResultReg);
+
+ if (!IsInlinableLiteral) {
+ Add.addReg(TmpResultReg, RegState::Kill);
+ } else {
+ // We fold the offset into the mad itself if it's inlinable.
+ Add.addImm(Offset);
+ }
+ Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
TmpResultReg)
.addImm(ST.getWavefrontSizeLog2())
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
index 08c2904c601ad..0714def30053d 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-mov-b32.mir
@@ -681,10 +681,10 @@ body: |
---
-name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
tracksRegLiveness: true
stack:
- - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 0, type: default, size: 24, alignment: 16, stack-id: default }
- { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
@@ -693,7 +693,7 @@ body: |
liveins: $sgpr4, $sgpr5, $vgpr0
- ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX8-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
; GFX8: liveins: $sgpr4, $sgpr5, $vgpr0
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -706,8 +706,7 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX8-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX8-NEXT: $vgpr0 = V_MOV_B32_e32 64, implicit $exec
- ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 killed $vgpr0, 64, $sgpr32, 0, implicit $exec
+ ; GFX8-NEXT: $vgpr0 = V_MAD_U32_U24_e64 24, 64, $sgpr32, 0, implicit $exec
; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $vgpr0, implicit $exec
; GFX8-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX8-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -720,7 +719,7 @@ body: |
; GFX8-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX8-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
;
- ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX900-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
; GFX900: liveins: $sgpr4, $sgpr5, $vgpr0
; GFX900-NEXT: {{ $}}
; GFX900-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -734,7 +733,7 @@ body: |
; GFX900-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX900-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
; GFX900-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
- ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX900-NEXT: $vgpr0 = V_ADD_U32_e32 24, killed $vgpr0, implicit $exec
; GFX900-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX900-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX900-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -746,7 +745,7 @@ body: |
; GFX900-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX900-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
;
- ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX90A-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
; GFX90A: liveins: $sgpr4, $sgpr5, $vgpr0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -760,7 +759,7 @@ body: |
; GFX90A-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
; GFX90A-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
- ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX90A-NEXT: $vgpr0 = V_ADD_U32_e32 24, killed $vgpr0, implicit $exec
; GFX90A-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX90A-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX90A-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -772,7 +771,7 @@ body: |
; GFX90A-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX90A-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
;
- ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX1010-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
; GFX1010: liveins: $sgpr4, $sgpr5, $vgpr0
; GFX1010-NEXT: {{ $}}
; GFX1010-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -786,7 +785,7 @@ body: |
; GFX1010-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX1010-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
; GFX1010-NEXT: $vgpr0 = V_LSHRREV_B32_e64 5, $sgpr32, implicit $exec
- ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GFX1010-NEXT: $vgpr0 = V_ADD_U32_e32 24, killed $vgpr0, implicit $exec
; GFX1010-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
; GFX1010-NEXT: S_NOP 0, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX1010-NEXT: S_NOP 0, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
@@ -798,7 +797,7 @@ body: |
; GFX1010-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX1010-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
;
- ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX1100-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
; GFX1100: liveins: $sgpr4, $sgpr5, $vgpr0
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -811,7 +810,7 @@ body: |
; GFX1100-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX1100-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX1100-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+ ; GFX1100-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
; GFX1100-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
; GFX1100-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
; GFX1100-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
@@ -825,7 +824,7 @@ body: |
; GFX1100-NEXT: S_NOP 0, implicit $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX1100-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc_lo
;
- ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc_live_vcc_no_sgpr
+ ; GFX1200-LABEL: name: materialize_fi_s_mov_b32_offset_24_live_scc_live_vcc_no_sgpr
; GFX1200: liveins: $sgpr4, $sgpr5, $vgpr0
; GFX1200-NEXT: {{ $}}
; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
@@ -838,7 +837,7 @@ body: |
; GFX1200-NEXT: S_NOP 0, implicit-def $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63
; GFX1200-NEXT: V_CMP_EQ_U32_e32 0, killed $vgpr0, implicit-def $vcc_lo, implicit $exec
; GFX1200-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
- ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 64, implicit-def $scc, implicit $scc
+ ; GFX1200-NEXT: $vcc_hi = S_ADDC_U32 $sgpr32, 24, implicit-def $scc, implicit $scc
; GFX1200-NEXT: S_BITCMP1_B32 $vcc_hi, 0, implicit-def $scc
; GFX1200-NEXT: $vcc_hi = S_BITSET0_B32 0, $vcc_hi
; GFX1200-NEXT: renamable $sgpr4 = S_MOV_B32 killed $vcc_hi
@@ -1070,3 +1069,4 @@ body: |
S_ENDPGM 0, implicit $sgpr4, implicit $scc, implicit killed $vcc
...
+
diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
index 87cfaec208897..6346406fa8941 100644
--- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll
@@ -835,12 +835,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX7-NEXT: v_writelane_b32 v21, s56, 25
; GFX7-NEXT: v_writelane_b32 v21, s57, 26
; GFX7-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX7-NEXT: v_mov_b32_e32 v22, 16
; GFX7-NEXT: v_writelane_b32 v21, s58, 27
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX7-NEXT: ;;#ASMEND
-; GFX7-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX7-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX7-NEXT: v_lshrrev_b32_e32 v22, 6, v22
; GFX7-NEXT: v_writelane_b32 v21, s59, 28
; GFX7-NEXT: v_readfirstlane_b32 s59, v22
@@ -918,12 +917,11 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs__lowe
; GFX8-NEXT: v_writelane_b32 v21, s56, 25
; GFX8-NEXT: v_writelane_b32 v21, s57, 26
; GFX8-NEXT: s_and_b64 s[4:5], 0, exec
-; GFX8-NEXT: v_mov_b32_e32 v22, 16
; GFX8-NEXT: v_writelane_b32 v21, s58, 27
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], s58, v[0:15], v[16:20], vcc
; GFX8-NEXT: ;;#ASMEND
-; GFX8-NEXT: v_mad_u32_u24 v22, v22, 64, s32
+; GFX8-NEXT: v_mad_u32_u24 v22, 16, 64, s32
; GFX8-NEXT: v_lshrrev_b32_e32 v22, 6, v22
; GFX8-NEXT: v_writelane_b32 v21, s59, 28
; GFX8-NEXT: v_readfirstlane_b32 s59, v22
More information about the llvm-commits
mailing list