[llvm] [AMDGPU] Enable overwrite ALU bit in sched.barrier mask (PR #160782)

Jan Patrick Lehr via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 20 04:36:45 PDT 2025


https://github.com/jplehr updated https://github.com/llvm/llvm-project/pull/160782

>From bba92f154219dee17d2337622c99d7040d3e0356 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Thu, 25 Sep 2025 15:25:08 -0500
Subject: [PATCH 1/5] [AMDGPU] Enable overwrite ALU bit in sched.barrier mask

The sched.barrier intrinsic takes a bit mask that determines which
instruction categories are allowed to cross the inserted sched.barrier
during the igrouplp scheduling pass.

Currently, a set ALU bit allows all ALU instructions to move across the
barrier, regardless of whether more specific bits have also been set.
The documentation is silent about the semantics in that case.

This PR changes that handling: when a mask sets both the ALU bit and a
more specific bit, the more specific bit is respected and the ALU bit
does *not* imply all other bits.

Current:
0x00000005 -- 0101 ALU and SALU bits set. The ALU bit currently implies
                   that SALU, VALU, and MFMA are treated as set.

New:
0x00000005 -- 0101 ALU and SALU bits set. The SALU bit is set, so the
                   ALU bit is ignored and only the SALU bit is considered.
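
As a minimal sketch of the effect (assuming the mask bit values
documented in AMDGPUUsage: ALU=0x0001, VALU=0x0002, SALU=0x0004,
MFMA=0x0008, TRANS=0x0400; the kernel name is made up for illustration,
and the inverted values match the sched_barrier_m1/_m5 checks in the
test added below):

  declare void @llvm.amdgcn.sched.barrier(i32)

  define amdgpu_kernel void @only_salu_may_cross() {
    ; mask        = 0x005 (ALU | SALU)
    ; old inverse = 0x3F0 (1008): the ALU bit also implied VALU, MFMA and
    ;               TRANS, so all of them could still cross the barrier
    ; new inverse = 0x7FA (2042): only ALU/SALU instructions may cross;
    ;               VALU, MFMA and TRANS stay in the pinned SchedGroup
    call void @llvm.amdgcn.sched.barrier(i32 5)
    ret void
  }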
---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468e2420e..7e9503a36f603 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2619,8 +2619,14 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;
 
+  // When given, specific bits overrule the more general ALU type.
+  bool HasConcreteClassSpecified =
+      (Mask & (SchedGroupMask::SALU | SchedGroupMask::VALU |
+               SchedGroupMask::MFMA)) != SchedGroupMask::NONE;
+
   // ALU implies VALU, SALU, MFMA, TRANS.
-  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
+  if (!HasConcreteClassSpecified &&
+      (InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
   // VALU, SALU, MFMA, TRANS implies ALU.

>From f16792cf5dc4b8137bbbf73da708e4e2bb7f0275 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Sat, 18 Oct 2025 13:26:32 -0500
Subject: [PATCH 2/5] Add MEM handling

---
 llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 7e9503a36f603..f7a4353705fc1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2619,13 +2619,13 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;
 
-  // When given, specific bits overrule the more general ALU type.
-  bool HasConcreteClassSpecified =
+  // When given specific bits overrule the more general ALU type.
+  bool HasConcreteALUClassSpecified =
       (Mask & (SchedGroupMask::SALU | SchedGroupMask::VALU |
                SchedGroupMask::MFMA)) != SchedGroupMask::NONE;
 
   // ALU implies VALU, SALU, MFMA, TRANS.
-  if (!HasConcreteClassSpecified &&
+  if (!HasConcreteALUClassSpecified &&
       (InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                     ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
@@ -2636,8 +2636,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
            (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::ALU;
 
+  // When given specific bits overrule the more general MEM type.
+  bool HasConcreteMemClassSpecified =
+      (Mask & (SchedGroupMask::VMEM_READ | SchedGroupMask::VMEM_WRITE |
+               SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE)) !=
+      SchedGroupMask::NONE;
+
   // VMEM implies VMEM_READ, VMEM_WRITE.
-  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
+  if (!HasConcreteMemClassSpecified &&
+      (InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
   // VMEM_READ, VMEM_WRITE implies VMEM.
   else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
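
The memory classes get the analogous treatment. As a hedged sketch
(assuming the documented mask bits VMEM=0x0010 and VMEM read=0x0020;
the kernel name is illustrative only), a mask that sets both VMEM and
VMEM_READ now behaves like VMEM_READ alone:

  declare void @llvm.amdgcn.sched.barrier(i32)

  define amdgpu_kernel void @only_vmem_reads_may_cross() {
    ; mask = 0x030 (VMEM | VMEM_READ)
    ; Previously the VMEM bit let both reads and writes cross; with the
    ; specific VMEM_READ bit set, the general VMEM bit is now ignored,
    ; so only VMEM reads may cross and VMEM writes stay pinned.
    call void @llvm.amdgcn.sched.barrier(i32 48)
    ret void
  }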

>From d513ccbf58a400521b8a69d98d16b6a0e3b5907e Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 20 Oct 2025 06:28:00 -0500
Subject: [PATCH 3/5] Adds test

Various sched.barrier masks are tested for their inverse. The inverted
mask is used to create a SchedGroup that contains the instructions
which are not allowed to move across the barrier.
---
 .../llvm.amdgcn.sched.barrier.alu-bit.ll      | 185 ++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
new file mode 100644
index 0000000000000..48073a3856caa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
@@ -0,0 +1,185 @@
+; RUN: llc -mtriple=amdgcn -debug-only=igrouplp < %s 2>&1| FileCheck -check-prefix=GCN %s
+
+define protected amdgpu_kernel void @sched_barrier_m0(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 0 (no bits set)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Applying IGroupLPDAGMutation...
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 0) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m1(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 1 (ALU Bit, implies all *-ALU bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1
+; GCN-NEXT: After Inverting, SchedGroup Mask: 1008
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 1) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m2(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 2 (VALU Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 2
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 2) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m4(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 4 (SALU Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 4
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 4) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m8(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 8 (MFMA Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 8
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2038
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 8) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m1024(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 1024 (TRANS Bit)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1024
+; GCN-NEXT: After Inverting, SchedGroup Mask: 1022
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 1024) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m3(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 3 (ALU + VALU Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 3
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 3) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m5(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 5 (ALU + SALU Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 5
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 5) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m7(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 7 (ALU + VALU + SALU Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 7
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2040
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 7) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+define protected amdgpu_kernel void @sched_barrier_m15(ptr addrspace(3) noalias %ind, ptr addrspace(3) noalias %outd) #0 {
+;
+; Set mask to 15 (ALU + VALU + SALU + MFMA Bits)
+;
+; GCN: Applying IGroupLPDAGMutation...
+; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 15
+; GCN-NEXT: After Inverting, SchedGroup Mask: 2032
+entry:
+  ; we need salu, valu, mfma, trans instructions here.
+  %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
+  %1 = load float, ptr addrspace(3) %arrayidx, align 4
+  call void @llvm.amdgcn.sched.barrier(i32 15) #1
+  %add = fadd contract float %1, 1.000000e+00
+  %arrayidx3 = getelementptr inbounds float, ptr addrspace(3) %outd, i64 0
+  store float %add, ptr addrspace(3) %arrayidx3, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.sched.barrier(i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }

>From 42e25e21506282006362af666249e395daf1d89d Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 20 Oct 2025 06:34:11 -0500
Subject: [PATCH 4/5] Edit documentation

---
 llvm/docs/AMDGPUUsage.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index e062032313058..2406f0b8f8b1f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1385,7 +1385,8 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
   llvm.amdgcn.sched.barrier                        Controls the types of instructions that may be allowed to cross the intrinsic
                                                    during instruction scheduling. The parameter is a mask for the instruction types
-                                                   that can cross the intrinsic.
+                                                   that can cross the intrinsic. When bits for specific instructions are set, their
+                                                   more general version (all ALU or all VMEM) is ignored.
 
                                                    - 0x0000: No instructions may be scheduled across sched_barrier.
                                                    - 0x0001: All, non-memory, non-side-effect producing instructions may be
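
For contrast, the behaviour for a plain ALU mask without any specific
bit set is unchanged (a sketch matching the sched_barrier_m1 check in
the new test; the kernel name is illustrative):

  declare void @llvm.amdgcn.sched.barrier(i32)

  define amdgpu_kernel void @alu_only_mask() {
    ; mask = 0x001 (ALU only, no specific bit set)
    ; The ALU bit still implies VALU, SALU, MFMA and TRANS, so all of
    ; them may cross; the inverse remains 0x3F0 (1008).
    call void @llvm.amdgcn.sched.barrier(i32 1)
    ret void
  }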

>From d2dc639268f89543a92afb8499bc943cede62a74 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr at amd.com>
Date: Mon, 20 Oct 2025 06:36:14 -0500
Subject: [PATCH 5/5] Clean tests a bit

---
 .../AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll        | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
index 48073a3856caa..346755eb60fd4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.alu-bit.ll
@@ -7,7 +7,6 @@ define protected amdgpu_kernel void @sched_barrier_m0(ptr addrspace(3) noalias %
 ; GCN: Applying IGroupLPDAGMutation...
 ; GCN-NEXT: Applying IGroupLPDAGMutation...
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 0) #1
@@ -25,7 +24,6 @@ define protected amdgpu_kernel void @sched_barrier_m1(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 1008
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 1) #1
@@ -43,7 +41,6 @@ define protected amdgpu_kernel void @sched_barrier_m2(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 2
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 2) #1
@@ -61,7 +58,6 @@ define protected amdgpu_kernel void @sched_barrier_m4(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 4
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 4) #1
@@ -79,7 +75,6 @@ define protected amdgpu_kernel void @sched_barrier_m8(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 8
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2038
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 8) #1
@@ -97,7 +92,6 @@ define protected amdgpu_kernel void @sched_barrier_m1024(ptr addrspace(3) noalia
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 1024
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 1022
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 1024) #1
@@ -115,7 +109,6 @@ define protected amdgpu_kernel void @sched_barrier_m3(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 3
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2044
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 3) #1
@@ -133,7 +126,6 @@ define protected amdgpu_kernel void @sched_barrier_m5(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 5
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2042
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 5) #1
@@ -151,7 +143,6 @@ define protected amdgpu_kernel void @sched_barrier_m7(ptr addrspace(3) noalias %
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 7
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2040
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 7) #1
@@ -169,7 +160,6 @@ define protected amdgpu_kernel void @sched_barrier_m15(ptr addrspace(3) noalias
 ; GCN-NEXT: Building SchedGroup for SchedBarrier with Mask: 15
 ; GCN-NEXT: After Inverting, SchedGroup Mask: 2032
 entry:
-  ; we need salu, valu, mfma, trans instructions here.
   %arrayidx = getelementptr inbounds float, ptr addrspace(3) %ind, i64 0
   %1 = load float, ptr addrspace(3) %arrayidx, align 4
   call void @llvm.amdgcn.sched.barrier(i32 15) #1


