[llvm] [AMDGPU] Inplace FI elimination during PEI for scalar copy instruction (PR #99556)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 18 13:00:44 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pankaj Dwivedi (PankajDwivedi-25)
<details>
<summary>Changes</summary>
During the prolog/epilog insertion (PEI) pass, while trying to eliminate the frame index for a scalar copy instruction, we ended up in a case where two spills attempted to use the same emergency stack slot, which leads to a near dead end when only one emergency stack slot is available.
This is solved by avoiding the spill in the first place, using in-place frame index elimination for the scalar copy instruction.
In-place frame index elimination for the scalar copy instruction solves the issue, but this patch might not yet cover all the corner cases.
Co-authored by @<!-- -->arsenm
---
Patch is 39.87 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99556.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+52-18)
- (modified) llvm/test/CodeGen/AMDGPU/frame-index.mir (+324)
- (added) llvm/test/CodeGen/AMDGPU/reduce-frame-index.mir (+176)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8a315aa822786..05153383a3a21 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2449,7 +2449,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
? &AMDGPU::SReg_32RegClass
: &AMDGPU::VGPR_32RegClass;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
- MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
+ MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
+ MI->getOpcode() == AMDGPU::S_MOV_B32;
Register ResultReg =
IsCopy ? MI->getOperand(0).getReg()
: RS->scavengeRegisterBackwards(*RC, MI, false, 0);
@@ -2458,7 +2459,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
: AMDGPU::V_LSHRREV_B32_e64;
- auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+ Register TmpResultReg = ResultReg;
+ if (IsSALU && LiveSCC) {
+ TmpResultReg = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0);
+ }
+
+ auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
// For V_LSHRREV, the operands are reversed (the shift count goes
// first).
@@ -2468,11 +2475,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
- Register NewDest = RS->scavengeRegisterBackwards(
- AMDGPU::SReg_32RegClass, Shift, false, 0);
+ Register NewDest =
+ IsCopy ? ResultReg
+ : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
+ Shift, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
- .addReg(ResultReg);
+ .addReg(TmpResultReg);
ResultReg = NewDest;
}
} else {
@@ -2523,22 +2532,47 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// We may have 1 free scratch SGPR even though a carry out is
// unavailable. Only one additional mov is needed.
- Register TmpScaledReg = RS->scavengeRegisterBackwards(
- AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
- Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
- .addReg(FrameReg)
- .addImm(ST.getWavefrontSizeLog2());
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(Offset);
+ Register TmpScaledReg =
+ IsCopy && IsSALU
+ ? ResultReg
+ : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+ MI, false, 0, false);
+ Register ScaledReg =
+ TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
+ Register TmpResultReg = ScaledReg;
+
+ if (!LiveSCC) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
+ .addReg(FrameReg)
+ .addImm(ST.getWavefrontSizeLog2());
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
+ .addReg(TmpResultReg, RegState::Kill)
+ .addImm(Offset);
+ } else {
+ TmpResultReg = RS->scavengeRegisterBackwards(
+ AMDGPU::VGPR_32RegClass, MI, false, 0);
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHR_B32_e64),
+ TmpResultReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(FrameReg);
+ auto Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32),
+ TmpResultReg);
+ Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
+ Register NewDest =
+ IsCopy ? ResultReg
+ : RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32RegClass, Add, false, 0);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ NewDest)
+ .addReg(TmpResultReg);
+ ResultReg = NewDest;
+ }
if (!IsSALU)
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
- .addReg(ScaledReg, RegState::Kill);
+ .addReg(TmpResultReg, RegState::Kill);
else
- ResultReg = ScaledReg;
-
+ ResultReg = TmpResultReg;
// If there were truly no free SGPRs, we need to undo everything.
if (!TmpScaledReg.isValid()) {
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index d8736c5276a26..f4476a497f754 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -211,3 +211,327 @@ body: |
renamable $vgpr0 = V_ADD_CO_U32_e32 %stack.0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
...
+---
+name: materialize_fi_s_mov_b32_offset_0_dead_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_dead_scc
+ ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+ renamable $sgpr4 = S_MOV_B32 %stack.0
+ S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_0_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc
+ ; GCN: liveins: $sgpr4, $sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GCN-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr32, implicit $exec
+ ; GCN-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.0
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FI#0 is filler to get a non-0 offset for FI#1
+---
+name: materialize_fi_s_mov_b32_offset_64_dead_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_64_dead_scc
+ ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GCN-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 64, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_dead_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_68_dead_scc
+ ; GCN: $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
+ ; GCN-NEXT: $sgpr4 = S_ADD_I32 killed $sgpr4, 68, implicit-def $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_64_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_64_live_scc
+ ; GCN: liveins: $sgpr4, $sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GCN-NEXT: $vgpr0 = V_LSHR_B32_e64 6, $sgpr32, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+---
+name: materialize_fi_s_mov_b32_offset_68_live_scc
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 68, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_68_live_scc
+ ; GCN: liveins: $sgpr4, $sgpr5
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GCN-NEXT: $vgpr0 = V_LSHR_B32_e64 6, $sgpr32, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 68, killed $vgpr0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ renamable $sgpr4 = S_MOV_B32 %stack.1
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FIXME: This is finding a VGPR
+---
+name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 4, alignment: 16, stack-id: default }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: '$sgpr32'
+ occupancy: 10
+
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset_0_live_scc__no_free_vgprs
+ ; GCN: liveins: $sgpr4, $sgpr5, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr56, $vgpr57, $vgpr58, $vgpr59, $vgpr60, $vgpr61, $vgpr62, $vgpr63
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr41, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr42, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr43, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr45, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr47, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr57, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr59, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr61, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr63, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5)
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GCN-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (store (s32) into %stack.17, addrspace 5)
+ ; GCN-NEXT: $vgpr0 = V_LSHR_B32_e64 6, $sgpr32, implicit $exec
+ ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 64, killed $vgpr0, implicit $exec
+ ; GCN-NEXT: $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 68, 0, 0, implicit $exec :: (load (s32) from %stack.17, addrspace 5)
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ ; GCN-NEXT: S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ ; GCN-NEXT: $vgpr63 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.16, addrspace 5)
+ ; GCN-NEXT: $vgpr62 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4, 0, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5)
+ ; GCN-NEXT: $vgpr61 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (load (s32) from %stack.14, addrspace 5)
+ ; GCN-NEXT: $vgpr60 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 12, 0, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5)
+ ; GCN-NEXT: $vgpr59 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 16, 0, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5)
+ ; GCN-NEXT: $vgpr58 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 20, 0, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5)
+ ; GCN-NEXT: $vgpr57 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 24, 0, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5)
+ ; GCN-NEXT: $vgpr56 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 28, 0, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5)
+ ; GCN-NEXT: $vgpr47 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 32, 0, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5)
+ ; GCN-NEXT: $vgpr46 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 36, 0, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5)
+ ; GCN-NEXT: $vgpr45 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 40, 0, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GCN-NEXT: $vgpr44 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 44, 0, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GCN-NEXT: $vgpr43 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 48, 0, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+ ; GCN-NEXT: $vgpr42 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GCN-NEXT: $vgpr41 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+ ; GCN-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5)
+ ; GCN-NEXT: S_ENDPGM 0, implicit $sgpr4, implicit $scc
+ S_NOP 0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit-def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit-def $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit-def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_CMP_EQ_I32 $sgpr4, $sgpr5, implicit-def $scc
+
+ renamable $sgpr4 = S_MOV_B32 %stack.0
+
+ S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+ S_NOP 0, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ S_NOP 0, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23
+ S_NOP 0, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+ S_NOP 0, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39
+ S_NOP 0, implicit $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
+ S_NOP 0, implicit $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55
+ S_NOP 0, implicit $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
+ S_ENDPGM 0, implicit $sgpr4, implicit $scc
+
+...
+
+# FIXME: This is clobbering scc
+---
+name: materialize_fi_s_mov_b32_offset_96_live_scc__no_free_vgprs
+tracksRegLiveness: true
+stack:
+ - { id: 0, type: default, size: 64, alignment: 16, stack-id: default }
+ - { id: 1, type: default, size: 4, alignment: 4, stack-id: default }
+machineFunctionInfo:
+ scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr4, $sgpr5
+
+ ; GCN-LABEL: name: materialize_fi_s_mov_b32_offset...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/99556
More information about the llvm-commits
mailing list