[llvm] [AMDGPU] A SCHED_BARRIER in a bundle should not prevent other SCHED_BARRIERs to be considered (PR #152627)

Yoonseo Choi via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 8 15:33:35 PDT 2025


https://github.com/yoonseoch updated https://github.com/llvm/llvm-project/pull/152627

>From f06e57fe0e160e31db35e68d7ea320e5bc68e206 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi at amd.com>
Date: Thu, 7 Aug 2025 19:08:57 -0500
Subject: [PATCH 1/2] A SCHED_BARRIER in a bundle should not prevent other
 SCHED_BARRIER to be considered

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp     |   4 +-
 .../CodeGen/AMDGPU/sched-barrier-post-RA.mir  | 286 ++++++++++++++++++
 2 files changed, 289 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1b08f8c..8c92060128a12 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2508,7 +2508,9 @@ bool SchedGroup::canAddSU(SUnit &SU) const {
     ++E;
 
   // Return true if all of the bundled MIs can be added to this group.
-  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
+  return std::all_of(B, E, [this](MachineInstr &MI) {
+    return (MI.isMetaInstruction()) || canAddMI(MI);
+  });
 }
 
 void SchedGroup::initSchedGroup() {
diff --git a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
index 7bdb8f5b35ec5..a49d2188ce704 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-barrier-post-RA.mir
@@ -5,6 +5,10 @@
   define amdgpu_kernel void @no_sched_barrier(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
   define amdgpu_kernel void @sched_barrier_mask_1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
+  define amdgpu_kernel void @no_sched_barrier_many_loads(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
+  define amdgpu_kernel void @sched_barrier_mask_1924(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
+  define amdgpu_kernel void @extra_sched_barrier_in_bundle(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
+  define amdgpu_kernel void @extra_sched_barrier_in_bundle_2(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) { ret void }
 
   !0 = distinct !{!0}
   !1 = !{!1, !0}
@@ -120,3 +124,285 @@ body: |
     }
     S_ENDPGM 0
 ...
+
+
+# No SCHED_BARRIER - The global_loads of the first two bundles get clustered and the VALU instructions are scheduled later.
+
+---
+name: no_sched_barrier_many_loads
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_sched_barrier_many_loads
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit killed $sgpr0_sgpr1, implicit killed $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    renamable $vgpr10 = IMPLICIT_DEF
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+      renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...
+
+# MASK = 0b 0111 1000 0100  SALU, MFMA/WMMA, All DS, All DS Read, All DS Write, All Trans may be
+#                     scheduled across SCHED_BARRIER. VALU and all VMEM, VMEM Read/Write cannot be
+#                     scheduled across SCHED_BARRIER. V_MULs are not scheduled after the second bundle of loads.
+
+---
+name: sched_barrier_mask_1924
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: sched_barrier_mask_1924
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit killed $sgpr0_sgpr1, implicit killed $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 1924
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    renamable $vgpr10 = IMPLICIT_DEF
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+
+    SCHED_BARRIER 1924
+
+    BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+      renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...
+
+# MASK = 0b 0111 1000 0100  SALU, MFMA/WMMA, All DS, All DS Read, All DS Write, All Trans may be
+#                     scheduled across SCHED_BARRIER. VALU and all VMEM, VMEM Read/Write cannot be
+#                     scheduled across SCHED_BARRIER. V_MULs are not scheduled after the second bundle of loads.
+#
+#                     A SCHED_BARRIER before the second bundle is honored.
+
+
+---
+name: extra_sched_barrier_in_bundle
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: extra_sched_barrier_in_bundle
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit killed $sgpr0_sgpr1, implicit killed $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: SCHED_BARRIER 1924
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   SCHED_BARRIER 1924
+    ; CHECK-NEXT:   renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    renamable $vgpr10 = IMPLICIT_DEF
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+
+    SCHED_BARRIER 1924
+
+    BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+      renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      SCHED_BARRIER 1924
+      renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...
+
+# MASK = 0b 0111 1000 0100  SALU, MFMA/WMMA, All DS, All DS Read, All DS Write, All Trans may be
+#                     scheduled across SCHED_BARRIER. VALU and all VMEM, VMEM Read/Write cannot be
+#                     scheduled across SCHED_BARRIER. V_MULs are not scheduled after the second bundle of loads.
+#
+#                     A SCHED_BARRIER in the second bundle is ignored.
+#                     TODO: The SCHED_BARRIER may be honored.
+
+---
+name: extra_sched_barrier_in_bundle_2
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: extra_sched_barrier_in_bundle_2
+    ; CHECK: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit killed $sgpr0_sgpr1, implicit killed $vgpr0, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR killed renamable $sgpr0_sgpr1, killed renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr10 = IMPLICIT_DEF
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+    ; CHECK-NEXT:   renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT:   SCHED_BARRIER 1924
+    ; CHECK-NEXT:   renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: S_ENDPGM 0
+    renamable $sgpr0_sgpr1 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    BUNDLE implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit $sgpr0_sgpr1, implicit $vgpr0, implicit $exec {
+      renamable $vgpr1 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      renamable $vgpr2 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr0_sgpr1, renamable $vgpr0, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr2_sgpr3 = IMPLICIT_DEF
+    renamable $vgpr10 = IMPLICIT_DEF
+    renamable $vgpr1 = nsw V_MUL_LO_U32_e64 killed $vgpr1, $vgpr1, implicit $exec
+    renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr2, $vgpr2, implicit $exec
+
+    BUNDLE implicit-def $vgpr11, implicit-def $vgpr11_lo16, implicit-def $vgpr11_hi16, implicit-def $vgpr12, implicit-def $vgpr12_lo16, implicit-def $vgpr12_hi16, implicit $sgpr2_sgpr3, implicit $vgpr10, implicit $exec {
+      renamable $vgpr11 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 0, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+      SCHED_BARRIER 1924
+      renamable $vgpr12 = GLOBAL_LOAD_DWORD_SADDR renamable $sgpr2_sgpr3, renamable $vgpr10, 512, 0, implicit $exec :: (load (s32) from %ir.in, !alias.scope !0, addrspace 1)
+    }
+    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
+    renamable $vgpr0 = IMPLICIT_DEF
+    renamable $vgpr11 = nsw V_MUL_LO_U32_e64 killed $vgpr11, $vgpr11, implicit $exec
+    renamable $vgpr12 = nsw V_MUL_LO_U32_e64 killed $vgpr12, $vgpr12, implicit $exec
+    BUNDLE implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $sgpr2_sgpr3, implicit $exec, implicit killed $vgpr12 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr10, killed renamable $vgpr11, renamable $sgpr2_sgpr3, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr10, killed renamable $vgpr12, killed renamable $sgpr2_sgpr3, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    BUNDLE implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $sgpr4_sgpr5, implicit $exec, implicit killed $vgpr2 {
+      GLOBAL_STORE_DWORD_SADDR renamable $vgpr0, killed renamable $vgpr1, renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+      GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5, 512, 0, implicit $exec :: (store (s32) into %ir.out, !noalias !0, addrspace 1)
+    }
+    S_ENDPGM 0
+...

>From c1b4e379d6d9f6485da506a0199a2eb4a7ec7e41 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <Yoonseo.Choi at amd.com>
Date: Fri, 8 Aug 2025 17:29:24 -0500
Subject: [PATCH 2/2] Simplifying scanning inside a bundle

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 8c92060128a12..c47ca42007f82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2502,15 +2502,18 @@ bool SchedGroup::canAddSU(SUnit &SU) const {
     return canAddMI(MI);
 
   // Special case for bundled MIs.
+  // Return true if all of the bundled MIs can be added to this group.
+  // A meta instruction in a bundle is an exception.
   const MachineBasicBlock *MBB = MI.getParent();
-  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
-  while (E != MBB->end() && E->isBundledWithPred())
-    ++E;
+  // MI is the bundle header, so start from the instruction that follows it.
+  MachineBasicBlock::instr_iterator B = std::next(MI.getIterator());
+  while (B != MBB->end() && B->isBundledWithPred()) {
+    if (!B->isMetaInstruction() && !canAddMI(*B))
+      return false;
+    ++B;
+  }
 
-  // Return true if all of the bundled MIs can be added to this group.
-  return std::all_of(B, E, [this](MachineInstr &MI) {
-    return (MI.isMetaInstruction()) || canAddMI(MI);
-  });
+  return true;
 }
 
 void SchedGroup::initSchedGroup() {



More information about the llvm-commits mailing list