[llvm] [AMDGCN] Allow unscheduling of bundled insns (PR #129769)

Julian Brown via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 6 05:47:33 PST 2025


https://github.com/jtb20 updated https://github.com/llvm/llvm-project/pull/129769

>From dcdd0bbb22920201445fe47c320377afa7399687 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 4 Mar 2025 10:34:11 -0600
Subject: [PATCH] [AMDGCN] Allow unscheduling of bundled insns

This is a patch arising from AMD's fuzzing project.

In the test case, the scheduler decides to revert an attempted
schedule (GCNSchedStage::revertScheduling), but that path is not
prepared to handle bundled instructions -- and those can arise via
the expansion of intrinsics earlier in compilation.  The fix is to
move the instruction with the splice method instead of remove/insert,
since splice handles bundles properly.
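
For reference, a minimal sketch (not part of the patch; the helper
name moveToRegionEnd and its context are illustrative assumptions) of
why splice is safe here where remove/insert is not:

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"

    using namespace llvm;

    // Move MI (or the bundle it heads) to just before RegionEnd.
    static void moveToRegionEnd(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator RegionEnd,
                                MachineInstr *MI) {
      // Previous approach -- both calls assert that MI carries no bundle
      // flags, so a BUNDLE header trips an assertion when the schedule is
      // reverted:
      //
      //   MBB.remove(MI);             // assert(!MI->isBundled())
      //   MBB.insert(RegionEnd, MI);  // likewise rejects bundle flags
      //
      // splice() takes bundle iterators: if MI heads a bundle, the whole
      // bundle is moved as a unit and its bundle flags stay intact.
      MBB.splice(RegionEnd, &MBB, MI);
    }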
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |   3 +-
 .../AMDGPU/sema-v-unsched-bundle-2.mir        | 141 ++++++++++++++++++
 .../CodeGen/AMDGPU/sema-v-unsched-bundle.ll   |  24 +++
 3 files changed, 166 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c277223de13ac..5dcf523430fd2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1567,8 +1567,7 @@ void GCNSchedStage::revertScheduling() {
     }
 
     if (MI->getIterator() != DAG.RegionEnd) {
-      DAG.BB->remove(MI);
-      DAG.BB->insert(DAG.RegionEnd, MI);
+      DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
       if (!MI->isDebugInstr())
         DAG.LIS->handleMove(*MI, true);
     }
diff --git a/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir
new file mode 100644
index 0000000000000..494631ecd3fe7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir
@@ -0,0 +1,141 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched=gcn-max-occupancy -run-pass=machine-scheduler %s -o - | FileCheck %s
+
+--- |
+  @G = global <32 x i8> splat (i8 1)
+  @G.1 = global <32 x i8> splat (i8 127)
+  define amdgpu_kernel void @gws_sema_v_offset0(i32 %val) #0 {
+    ret void
+  }
+name:            gws_sema_v_offset0
+body:             |
+  bb.0 (%ir-block.0):
+    ; CHECK:      BUNDLE implicit $m0, implicit $exec {
+    ; CHECK-NEXT:   DS_GWS_SEMA_V 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource")
+    ; CHECK-NEXT:   S_WAITCNT 0
+    ; CHECK-NEXT: }
+    %9:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @G.1, target-flags(amdgpu-gotprel32-hi) @G.1, implicit-def dead $scc
+    %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %9, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+    %12:vreg_64_align2 = COPY %10
+    %11:vreg_128_align2 = FLAT_LOAD_DWORDX4 %12, 16, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G.1 + 16)
+    %16:vgpr_32 = V_AND_B32_e32 255, %11.sub0, implicit $exec
+    %18:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub0, implicit $exec
+    %20:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub0, implicit $exec
+    %22:vgpr_32 = V_AND_B32_e32 255, %20, implicit $exec
+    %24:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub0, implicit $exec
+    %28:vgpr_32 = V_AND_B32_e32 255, %11.sub1, implicit $exec
+    %29:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub1, implicit $exec
+    %30:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub1, implicit $exec
+    %32:vgpr_32 = V_AND_B32_e32 255, %30, implicit $exec
+    %33:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub1, implicit $exec
+    %37:vgpr_32 = V_AND_B32_e32 255, %11.sub2, implicit $exec
+    %38:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub2, implicit $exec
+    %39:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub2, implicit $exec
+    %41:vgpr_32 = V_AND_B32_e32 255, %39, implicit $exec
+    %42:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub2, implicit $exec
+    %46:vgpr_32 = V_AND_B32_e32 255, %11.sub3, implicit $exec
+    %47:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub3, implicit $exec
+    %48:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub3, implicit $exec
+    %50:vgpr_32 = V_AND_B32_e32 255, %48, implicit $exec
+    %51:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub3, implicit $exec
+    %53:vreg_128_align2 = FLAT_LOAD_DWORDX4 %12, 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G.1, align 32)
+    %57:vgpr_32 = V_AND_B32_e32 255, %53.sub0, implicit $exec
+    %58:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub0, implicit $exec
+    %59:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub0, implicit $exec
+    %61:vgpr_32 = V_AND_B32_e32 255, %59, implicit $exec
+    %62:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub0, implicit $exec
+    %66:vgpr_32 = V_AND_B32_e32 255, %53.sub1, implicit $exec
+    %67:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub1, implicit $exec
+    %68:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub1, implicit $exec
+    %70:vgpr_32 = V_AND_B32_e32 255, %68, implicit $exec
+    %71:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub1, implicit $exec
+    %75:vgpr_32 = V_AND_B32_e32 255, %53.sub2, implicit $exec
+    %76:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub2, implicit $exec
+    %77:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub2, implicit $exec
+    %79:vgpr_32 = V_AND_B32_e32 255, %77, implicit $exec
+    %80:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub2, implicit $exec
+    %84:vgpr_32 = V_AND_B32_e32 255, %53.sub3, implicit $exec
+    %85:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub3, implicit $exec
+    %86:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub3, implicit $exec
+    %88:vgpr_32 = V_AND_B32_e32 255, %86, implicit $exec
+    %89:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub3, implicit $exec
+    %91:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @G, target-flags(amdgpu-gotprel32-hi) @G, implicit-def dead $scc
+    %92:sreg_64_xexec = S_LOAD_DWORDX2_IMM %91, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+    %94:vreg_64_align2 = COPY %92
+    %93:vreg_128_align2 = FLAT_LOAD_DWORDX4 %94, 16, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G + 16)
+    %97:vgpr_32 = V_AND_B32_e32 255, %93.sub0, implicit $exec
+    %98:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub0, implicit $exec
+    %99:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub0, implicit $exec
+    %101:vgpr_32 = V_AND_B32_e32 255, %99, implicit $exec
+    %102:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub0, implicit $exec
+    %106:vgpr_32 = V_AND_B32_e32 255, %93.sub1, implicit $exec
+    %107:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub1, implicit $exec
+    %108:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub1, implicit $exec
+    %110:vgpr_32 = V_AND_B32_e32 255, %108, implicit $exec
+    %111:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub1, implicit $exec
+    %115:vgpr_32 = V_AND_B32_e32 255, %93.sub2, implicit $exec
+    %116:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub2, implicit $exec
+    %117:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub2, implicit $exec
+    %119:vgpr_32 = V_AND_B32_e32 255, %117, implicit $exec
+    %120:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub2, implicit $exec
+    %124:vgpr_32 = V_AND_B32_e32 255, %93.sub3, implicit $exec
+    %125:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub3, implicit $exec
+    %126:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub3, implicit $exec
+    %128:vgpr_32 = V_AND_B32_e32 255, %126, implicit $exec
+    %129:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub3, implicit $exec
+    %131:vreg_128_align2 = FLAT_LOAD_DWORDX4 %94, 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G, align 32)
+    %135:vgpr_32 = V_AND_B32_e32 255, %131.sub0, implicit $exec
+    %136:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub0, implicit $exec
+    %137:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub0, implicit $exec
+    %139:vgpr_32 = V_AND_B32_e32 255, %137, implicit $exec
+    %140:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub0, implicit $exec
+    %144:vgpr_32 = V_AND_B32_e32 255, %131.sub1, implicit $exec
+    %145:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub1, implicit $exec
+    %146:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub1, implicit $exec
+    %148:vgpr_32 = V_AND_B32_e32 255, %146, implicit $exec
+    %149:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub1, implicit $exec
+    %153:vgpr_32 = V_AND_B32_e32 255, %131.sub2, implicit $exec
+    %154:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub2, implicit $exec
+    %155:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub2, implicit $exec
+    %157:vgpr_32 = V_AND_B32_e32 255, %155, implicit $exec
+    %158:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub2, implicit $exec
+    %162:vgpr_32 = V_AND_B32_e32 255, %131.sub3, implicit $exec
+    %163:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub3, implicit $exec
+    %164:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub3, implicit $exec
+    %166:vgpr_32 = V_AND_B32_e32 255, %164, implicit $exec
+    %167:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub3, implicit $exec
+    BUNDLE implicit $m0, implicit $exec {
+      DS_GWS_SEMA_V 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource")
+      S_WAITCNT 0
+    }
+    %169:sreg_64_xexec = V_CMP_NE_U16_e64 %167, %89, implicit $exec
+    %173:sreg_64_xexec = V_CMP_NE_U16_e64 %166, %88, implicit $exec
+    %178:sreg_64_xexec = V_CMP_NE_U16_e64 %163, %85, implicit $exec
+    %182:sreg_64_xexec = V_CMP_NE_U16_e64 %162, %84, implicit $exec
+    %189:sreg_64_xexec = V_CMP_NE_U16_e64 %158, %80, implicit $exec
+    %192:sreg_64_xexec = V_CMP_NE_U16_e64 %157, %79, implicit $exec
+    %196:sreg_64_xexec = V_CMP_NE_U16_e64 %154, %76, implicit $exec
+    %199:sreg_64_xexec = V_CMP_NE_U16_e64 %153, %75, implicit $exec
+    %208:sreg_64_xexec = V_CMP_NE_U16_e64 %149, %71, implicit $exec
+    %211:sreg_64_xexec = V_CMP_NE_U16_e64 %148, %70, implicit $exec
+    %215:sreg_64_xexec = V_CMP_NE_U16_e64 %145, %67, implicit $exec
+    %218:sreg_64_xexec = V_CMP_NE_U16_e64 %144, %66, implicit $exec
+    %225:sreg_64_xexec = V_CMP_NE_U16_e64 %140, %62, implicit $exec
+    %228:sreg_64_xexec = V_CMP_NE_U16_e64 %139, %61, implicit $exec
+    %232:sreg_64_xexec = V_CMP_NE_U16_e64 %136, %58, implicit $exec
+    %235:sreg_64_xexec = V_CMP_NE_U16_e64 %135, %57, implicit $exec
+    %246:sreg_64_xexec = V_CMP_NE_U16_e64 %129, %51, implicit $exec
+    %249:sreg_64_xexec = V_CMP_NE_U16_e64 %128, %50, implicit $exec
+    %253:sreg_64_xexec = V_CMP_NE_U16_e64 %125, %47, implicit $exec
+    %256:sreg_64_xexec = V_CMP_NE_U16_e64 %124, %46, implicit $exec
+    %262:sreg_64_xexec = V_CMP_NE_U16_e64 %120, %42, implicit $exec
+    %265:sreg_64_xexec = V_CMP_NE_U16_e64 %119, %41, implicit $exec
+    %269:sreg_64_xexec = V_CMP_NE_U16_e64 %116, %38, implicit $exec
+    %272:sreg_64_xexec = V_CMP_NE_U16_e64 %115, %37, implicit $exec
+    %280:sreg_64_xexec = V_CMP_NE_U16_e64 %111, %33, implicit $exec
+    %287:sreg_64_xexec = V_CMP_NE_U16_e64 %107, %29, implicit $exec
+    %290:sreg_64_xexec = V_CMP_NE_U16_e64 %106, %28, implicit $exec
+    %296:sreg_64_xexec = V_CMP_NE_U16_e64 %102, %24, implicit $exec
+    %299:sreg_64_xexec = V_CMP_NE_U16_e64 %101, %22, implicit $exec
+    %303:sreg_64_xexec = V_CMP_NE_U16_e64 %98, %18, implicit $exec
+    %306:sreg_64_xexec = V_CMP_NE_U16_e64 %97, %16, implicit $exec
+
diff --git a/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll
new file mode 100644
index 0000000000000..e2b5b85257dac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn -O1 -mcpu=gfx90a -stop-after=machine-scheduler < %s | FileCheck %s
+
+; CHECK:      BUNDLE implicit $m0, implicit $exec {
+; CHECK-NEXT:   DS_GWS_SEMA_V 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource")
+; CHECK-NEXT:   S_WAITCNT 0
+; CHECK-NEXT: }
+
+@G = global <32 x i8> splat (i8 1)
+@G.1 = global <32 x i8> splat (i8 127)
+
+define amdgpu_kernel void @gws_sema_v_offset0(i32 %val) #0 {
+  %LGV1 = load <32 x i8>, ptr @G.1, align 32
+  %LGV = load <32 x i8>, ptr @G, align 32
+  call void @llvm.amdgcn.ds.gws.sema.v(i32 0)
+  %C = icmp ne <32 x i8> %LGV, %LGV1
+  store <32 x i1> %C, ptr poison, align 4
+  ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.sema.v(i32) #1
+
+attributes #0 = { convergent nounwind memory(inaccessiblemem: readwrite) }
+attributes #1 = { convergent nocallback nofree nounwind willreturn memory(inaccessiblemem: readwrite) }
+


