[llvm] [AMDGPU] In a tie, unpack packed instructions to avoid issue stalls (PR #165641)

via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 29 20:27:20 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

<details>
<summary>Changes</summary>

When exactly one cycle remains before a packed instruction could be issued
(the tie case), it should still be unpacked to avoid an issue stall.

---
Full diff: https://github.com/llvm/llvm-project/pull/165641.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (+13-3) 
- (modified) llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir (+66) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 7431e111ec862..2f8f93c80ee92 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -631,11 +631,21 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
     // latency, add latency of two unpacked instructions (currently estimated
     // as 2 cycles).
     TotalCyclesBetweenCandidates -= Latency;
+    // Once we've removed the packed latency, if we're already past the MFMA
+    // overlap window, later instructions can only increase the distance.  Stop
+    // scanning for more candidates for this MFMA.  Subtract 1 to account for
+    // MFMA issue latency. If the packed instruction cannot be immediately
+    // issued in the last cycle of the MFMA's execution we still want to
+    // unpack.
+    //
+    // FIXME: We shouldn't need to subtract 1 here, this should be reflected in
+    // the SchedModel.
+    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
+      return;
+
     // TODO: improve latency handling based on instruction modeling.
     TotalCyclesBetweenCandidates += 2;
-    // Subtract 1 to account for MFMA issue latency.
-    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
-      InstrsToUnpack.insert(&Instr);
+    InstrsToUnpack.insert(&Instr);
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
index 75ae76fdee19b..802745ce68780 100644
--- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
+++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir
@@ -1167,3 +1167,69 @@ body:             |
     $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec
     renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
     S_ENDPGM 0
+...
+---
+name:            test_tie_unpack_minimal
+tracksRegLiveness: true
+
+liveins:
+  - { reg: '$vgpr0_vgpr1_vgpr2_vgpr3' }
+  - { reg: '$vgpr4_vgpr5_vgpr6_vgpr7' }
+  - { reg: '$vgpr8' }
+  - { reg: '$vgpr9' }
+  - { reg: '$vgpr10_vgpr11' }
+  - { reg: '$vgpr12_vgpr13' }
+  - { reg: '$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15' }
+
+body:             |
+  bb.0.entry:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+
+    ; GFX950-LABEL: name: test_tie_unpack_minimal
+    ; GFX950: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX950-NEXT: {{  $}}
+    ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    ; GFX950-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+    ; GFX950-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+    ; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    ; GFX950-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+    ; GFX950-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+    ; GFX950-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+    ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX950-NEXT: S_ENDPGM 0
+    ;
+    ; GFX942-LABEL: name: test_tie_unpack_minimal
+    ; GFX942: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX942-NEXT: {{  $}}
+    ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    ; GFX942-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+    ; GFX942-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+    ; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    ; GFX942-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+    ; GFX942-NEXT: $vgpr12 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr10, 0, 0, implicit $mode, implicit $exec
+    ; GFX942-NEXT: $vgpr13 = nofpexcept V_MUL_F32_e64 0, $vgpr10, 0, $vgpr11, 0, 0, implicit $mode, implicit $exec
+    ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942-NEXT: S_ENDPGM 0
+    ;
+    ; GFX90A-LABEL: name: test_tie_unpack_minimal
+    ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $vgpr9, $vgpr10_vgpr11, $vgpr12_vgpr13, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX90A-NEXT: S_ENDPGM 0
+    renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, $vgpr9, implicit $mode, implicit $exec
+    renamable $vgpr8 = nofpexcept V_ADD_F32_e32 killed $vgpr8, killed $vgpr9, implicit $mode, implicit $exec
+    renamable $vgpr12_vgpr13 = nofpexcept V_PK_MUL_F32 0, $vgpr10_vgpr11, 8, $vgpr10_vgpr11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+    S_ENDPGM 0
+...

``````````

</details>


https://github.com/llvm/llvm-project/pull/165641


More information about the llvm-commits mailing list