[llvm] [AMDGPU] In a tie, unpack packed instructions to avoid issue stalls (PR #165641)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 29 20:27:20 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Austin Kerbow (kerbowa)
<details>
<summary>Changes</summary>
If a packed instruction would still have to wait one more cycle before it
could be issued, it should be unpacked anyway to avoid issue stalls.
---
Full diff: https://github.com/llvm/llvm-project/pull/165641.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+13-3)
- (modified) llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir (+66)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 7431e111ec862..2f8f93c80ee92 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -631,11 +631,21 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
// latency, add latency of two unpacked instructions (currently estimated
// as 2 cycles).
TotalCyclesBetweenCandidates -= Latency;
+ // Once we've removed the packed latency, if we're already past the MFMA
+ // overlap window, later instructions can only increase the distance. Stop
+ // scanning for more candidates for this MFMA. Subtract 1 to account for
+ // MFMA issue latency. If the packed instruction cannot be immediately
+ // issued in the last cycle of the MFMA's execution we still want to
+ // unpack.
+ //
+ // FIXME: We shouldn't need to subtract 1 here, this should be reflected in
+ // the SchedModel.
+ if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
+ return;
+
// TODO: improve latency handling based on instruction modeling.
TotalCyclesBetweenCandidates += 2;
- // Subtract 1 to account for MFMA issue latency.
- if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
- InstrsToUnpack.insert(&Instr);
+ InstrsToUnpack.insert(&Instr);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 75ae76fdee19b..802745ce68780 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -1167,3 +1167,69 @@ body: |
$vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
+...
+---
+name: test_tie_unpack_minimal
+tracksRegLiveness: true
+
+liveins:
+ - { reg: '$vgpr0_vgpr1_vgpr2_vgpr3' }
+ - { reg: '$vgpr4_vgpr5_vgpr6_vgpr7' }
+ - { reg: '$vgpr8' }
+ - { reg: '$vgpr9' }
+ - { reg: '$vgpr10_vgpr11' }
+ - { reg: '$vgpr12_vgpr13' }
+ - { reg: '$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15' }
+
+body: |
+ bb.0.entry:
+ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+
+ ; GFX950-LABEL: name: test_tie_unpack_minimal
+ ; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX950-NEXT: {{ $}}
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ ; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX950-NEXT: S_ENDPGM 0
+ ;
+ ; GFX942-LABEL: name: test_tie_unpack_minimal
+ ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX942-NEXT: {{ $}}
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ ; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX942-NEXT: S_ENDPGM 0
+ ;
+ ; GFX90A-LABEL: name: test_tie_unpack_minimal
+ ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+ ; GFX90A-NEXT: {{ $}}
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX90A-NEXT: S_ENDPGM 0
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+ renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+ renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
``````````
</details>
https://github.com/llvm/llvm-project/pull/165641
More information about the llvm-commits mailing list