[llvm] [AMDGCN] Allow unscheduling of bundled insns (PR #129769)
Julian Brown via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 05:47:33 PST 2025
https://github.com/jtb20 updated https://github.com/llvm/llvm-project/pull/129769
From dcdd0bbb22920201445fe47c320377afa7399687 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 4 Mar 2025 10:34:11 -0600
Subject: [PATCH] [AMDGCN] Allow unscheduling of bundled insns
This is a patch arising from AMD's fuzzing project.
In the test case, the scheduling algorithm decides to revert an attempted
schedule, but the revert code is not prepared to handle bundled instructions
at that point; such bundles can arise via the expansion of intrinsics earlier
in compilation. The fix is to use the splice method instead of remove/insert,
since splice handles bundles properly.
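For illustration only (not part of the patch), here is a minimal sketch of the
distinction, using a hypothetical helper named moveToRegionEnd in place of the
scheduler's revert loop; the RegionEnd parameter stands in for DAG.RegionEnd.
MachineBasicBlock::remove()/insert() operate on a single, unbundled
MachineInstr, so they are not suitable for a BUNDLE header, while splice()
goes through the bundle-aware MachineBasicBlock::iterator and moves the header
together with the instructions bundled with it.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Hypothetical helper, sketching the patched behaviour: move MI (possibly a
// BUNDLE header) to the end of the current scheduling region.
static void moveToRegionEnd(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator RegionEnd,
                            MachineInstr *MI) {
  // The bundle-aware iterator treats a BUNDLE header and the instructions
  // bundled with it as one step, so splice() moves the whole bundle as a
  // unit; the old remove()/insert() pair only handled a single, unbundled
  // MachineInstr.
  if (MachineBasicBlock::iterator(MI) != RegionEnd)
    MBB.splice(RegionEnd, &MBB, MI);
}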
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 +-
.../AMDGPU/sema-v-unsched-bundle-2.mir | 141 ++++++++++++++++++
.../CodeGen/AMDGPU/sema-v-unsched-bundle.ll | 24 +++
3 files changed, 166 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index c277223de13ac..5dcf523430fd2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1567,8 +1567,7 @@ void GCNSchedStage::revertScheduling() {
}
if (MI->getIterator() != DAG.RegionEnd) {
- DAG.BB->remove(MI);
- DAG.BB->insert(DAG.RegionEnd, MI);
+ DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
if (!MI->isDebugInstr())
DAG.LIS->handleMove(*MI, true);
}
diff --git a/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir
new file mode 100644
index 0000000000000..494631ecd3fe7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle-2.mir
@@ -0,0 +1,141 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched=gcn-max-occupancy -run-pass=machine-scheduler %s -o - | FileCheck %s
+
+--- |
+ @G = global <32 x i8> splat (i8 1)
+ @G.1 = global <32 x i8> splat (i8 127)
+ define amdgpu_kernel void @gws_sema_v_offset0(i32 %val) #0 {
+ ret void
+ }
+name: gws_sema_v_offset0
+body: |
+ bb.0 (%ir-block.0):
+ ; CHECK: BUNDLE implicit $m0, implicit $exec {
+ ; CHECK-NEXT: DS_GWS_SEMA_V 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource")
+ ; CHECK-NEXT: S_WAITCNT 0
+ ; CHECK-NEXT: }
+ %9:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @G.1, target-flags(amdgpu-gotprel32-hi) @G.1, implicit-def dead $scc
+ %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %9, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+ %12:vreg_64_align2 = COPY %10
+ %11:vreg_128_align2 = FLAT_LOAD_DWORDX4 %12, 16, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G.1 + 16)
+ %16:vgpr_32 = V_AND_B32_e32 255, %11.sub0, implicit $exec
+ %18:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub0, implicit $exec
+ %20:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub0, implicit $exec
+ %22:vgpr_32 = V_AND_B32_e32 255, %20, implicit $exec
+ %24:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub0, implicit $exec
+ %28:vgpr_32 = V_AND_B32_e32 255, %11.sub1, implicit $exec
+ %29:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub1, implicit $exec
+ %30:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub1, implicit $exec
+ %32:vgpr_32 = V_AND_B32_e32 255, %30, implicit $exec
+ %33:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub1, implicit $exec
+ %37:vgpr_32 = V_AND_B32_e32 255, %11.sub2, implicit $exec
+ %38:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub2, implicit $exec
+ %39:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub2, implicit $exec
+ %41:vgpr_32 = V_AND_B32_e32 255, %39, implicit $exec
+ %42:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub2, implicit $exec
+ %46:vgpr_32 = V_AND_B32_e32 255, %11.sub3, implicit $exec
+ %47:vgpr_32 = V_LSHRREV_B16_e32 8, %11.sub3, implicit $exec
+ %48:vgpr_32 = V_LSHRREV_B32_e32 16, %11.sub3, implicit $exec
+ %50:vgpr_32 = V_AND_B32_e32 255, %48, implicit $exec
+ %51:vgpr_32 = V_LSHRREV_B32_e32 24, %11.sub3, implicit $exec
+ %53:vreg_128_align2 = FLAT_LOAD_DWORDX4 %12, 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G.1, align 32)
+ %57:vgpr_32 = V_AND_B32_e32 255, %53.sub0, implicit $exec
+ %58:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub0, implicit $exec
+ %59:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub0, implicit $exec
+ %61:vgpr_32 = V_AND_B32_e32 255, %59, implicit $exec
+ %62:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub0, implicit $exec
+ %66:vgpr_32 = V_AND_B32_e32 255, %53.sub1, implicit $exec
+ %67:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub1, implicit $exec
+ %68:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub1, implicit $exec
+ %70:vgpr_32 = V_AND_B32_e32 255, %68, implicit $exec
+ %71:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub1, implicit $exec
+ %75:vgpr_32 = V_AND_B32_e32 255, %53.sub2, implicit $exec
+ %76:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub2, implicit $exec
+ %77:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub2, implicit $exec
+ %79:vgpr_32 = V_AND_B32_e32 255, %77, implicit $exec
+ %80:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub2, implicit $exec
+ %84:vgpr_32 = V_AND_B32_e32 255, %53.sub3, implicit $exec
+ %85:vgpr_32 = V_LSHRREV_B16_e32 8, %53.sub3, implicit $exec
+ %86:vgpr_32 = V_LSHRREV_B32_e32 16, %53.sub3, implicit $exec
+ %88:vgpr_32 = V_AND_B32_e32 255, %86, implicit $exec
+ %89:vgpr_32 = V_LSHRREV_B32_e32 24, %53.sub3, implicit $exec
+ %91:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @G, target-flags(amdgpu-gotprel32-hi) @G, implicit-def dead $scc
+ %92:sreg_64_xexec = S_LOAD_DWORDX2_IMM %91, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
+ %94:vreg_64_align2 = COPY %92
+ %93:vreg_128_align2 = FLAT_LOAD_DWORDX4 %94, 16, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G + 16)
+ %97:vgpr_32 = V_AND_B32_e32 255, %93.sub0, implicit $exec
+ %98:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub0, implicit $exec
+ %99:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub0, implicit $exec
+ %101:vgpr_32 = V_AND_B32_e32 255, %99, implicit $exec
+ %102:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub0, implicit $exec
+ %106:vgpr_32 = V_AND_B32_e32 255, %93.sub1, implicit $exec
+ %107:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub1, implicit $exec
+ %108:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub1, implicit $exec
+ %110:vgpr_32 = V_AND_B32_e32 255, %108, implicit $exec
+ %111:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub1, implicit $exec
+ %115:vgpr_32 = V_AND_B32_e32 255, %93.sub2, implicit $exec
+ %116:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub2, implicit $exec
+ %117:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub2, implicit $exec
+ %119:vgpr_32 = V_AND_B32_e32 255, %117, implicit $exec
+ %120:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub2, implicit $exec
+ %124:vgpr_32 = V_AND_B32_e32 255, %93.sub3, implicit $exec
+ %125:vgpr_32 = V_LSHRREV_B16_e32 8, %93.sub3, implicit $exec
+ %126:vgpr_32 = V_LSHRREV_B32_e32 16, %93.sub3, implicit $exec
+ %128:vgpr_32 = V_AND_B32_e32 255, %126, implicit $exec
+ %129:vgpr_32 = V_LSHRREV_B32_e32 24, %93.sub3, implicit $exec
+ %131:vreg_128_align2 = FLAT_LOAD_DWORDX4 %94, 0, 0, implicit $exec, implicit $flat_scr :: (dereferenceable load (s128) from @G, align 32)
+ %135:vgpr_32 = V_AND_B32_e32 255, %131.sub0, implicit $exec
+ %136:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub0, implicit $exec
+ %137:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub0, implicit $exec
+ %139:vgpr_32 = V_AND_B32_e32 255, %137, implicit $exec
+ %140:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub0, implicit $exec
+ %144:vgpr_32 = V_AND_B32_e32 255, %131.sub1, implicit $exec
+ %145:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub1, implicit $exec
+ %146:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub1, implicit $exec
+ %148:vgpr_32 = V_AND_B32_e32 255, %146, implicit $exec
+ %149:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub1, implicit $exec
+ %153:vgpr_32 = V_AND_B32_e32 255, %131.sub2, implicit $exec
+ %154:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub2, implicit $exec
+ %155:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub2, implicit $exec
+ %157:vgpr_32 = V_AND_B32_e32 255, %155, implicit $exec
+ %158:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub2, implicit $exec
+ %162:vgpr_32 = V_AND_B32_e32 255, %131.sub3, implicit $exec
+ %163:vgpr_32 = V_LSHRREV_B16_e32 8, %131.sub3, implicit $exec
+ %164:vgpr_32 = V_LSHRREV_B32_e32 16, %131.sub3, implicit $exec
+ %166:vgpr_32 = V_AND_B32_e32 255, %164, implicit $exec
+ %167:vgpr_32 = V_LSHRREV_B32_e32 24, %131.sub3, implicit $exec
+ BUNDLE implicit $m0, implicit $exec {
+ DS_GWS_SEMA_V 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource")
+ S_WAITCNT 0
+ }
+ %169:sreg_64_xexec = V_CMP_NE_U16_e64 %167, %89, implicit $exec
+ %173:sreg_64_xexec = V_CMP_NE_U16_e64 %166, %88, implicit $exec
+ %178:sreg_64_xexec = V_CMP_NE_U16_e64 %163, %85, implicit $exec
+ %182:sreg_64_xexec = V_CMP_NE_U16_e64 %162, %84, implicit $exec
+ %189:sreg_64_xexec = V_CMP_NE_U16_e64 %158, %80, implicit $exec
+ %192:sreg_64_xexec = V_CMP_NE_U16_e64 %157, %79, implicit $exec
+ %196:sreg_64_xexec = V_CMP_NE_U16_e64 %154, %76, implicit $exec
+ %199:sreg_64_xexec = V_CMP_NE_U16_e64 %153, %75, implicit $exec
+ %208:sreg_64_xexec = V_CMP_NE_U16_e64 %149, %71, implicit $exec
+ %211:sreg_64_xexec = V_CMP_NE_U16_e64 %148, %70, implicit $exec
+ %215:sreg_64_xexec = V_CMP_NE_U16_e64 %145, %67, implicit $exec
+ %218:sreg_64_xexec = V_CMP_NE_U16_e64 %144, %66, implicit $exec
+ %225:sreg_64_xexec = V_CMP_NE_U16_e64 %140, %62, implicit $exec
+ %228:sreg_64_xexec = V_CMP_NE_U16_e64 %139, %61, implicit $exec
+ %232:sreg_64_xexec = V_CMP_NE_U16_e64 %136, %58, implicit $exec
+ %235:sreg_64_xexec = V_CMP_NE_U16_e64 %135, %57, implicit $exec
+ %246:sreg_64_xexec = V_CMP_NE_U16_e64 %129, %51, implicit $exec
+ %249:sreg_64_xexec = V_CMP_NE_U16_e64 %128, %50, implicit $exec
+ %253:sreg_64_xexec = V_CMP_NE_U16_e64 %125, %47, implicit $exec
+ %256:sreg_64_xexec = V_CMP_NE_U16_e64 %124, %46, implicit $exec
+ %262:sreg_64_xexec = V_CMP_NE_U16_e64 %120, %42, implicit $exec
+ %265:sreg_64_xexec = V_CMP_NE_U16_e64 %119, %41, implicit $exec
+ %269:sreg_64_xexec = V_CMP_NE_U16_e64 %116, %38, implicit $exec
+ %272:sreg_64_xexec = V_CMP_NE_U16_e64 %115, %37, implicit $exec
+ %280:sreg_64_xexec = V_CMP_NE_U16_e64 %111, %33, implicit $exec
+ %287:sreg_64_xexec = V_CMP_NE_U16_e64 %107, %29, implicit $exec
+ %290:sreg_64_xexec = V_CMP_NE_U16_e64 %106, %28, implicit $exec
+ %296:sreg_64_xexec = V_CMP_NE_U16_e64 %102, %24, implicit $exec
+ %299:sreg_64_xexec = V_CMP_NE_U16_e64 %101, %22, implicit $exec
+ %303:sreg_64_xexec = V_CMP_NE_U16_e64 %98, %18, implicit $exec
+ %306:sreg_64_xexec = V_CMP_NE_U16_e64 %97, %16, implicit $exec
+
diff --git a/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll
new file mode 100644
index 0000000000000..e2b5b85257dac
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sema-v-unsched-bundle.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn -O1 -mcpu=gfx90a -stop-after=machine-scheduler < %s | FileCheck %s
+
+; CHECK: BUNDLE implicit $m0, implicit $exec {
+; CHECK-NEXT: DS_GWS_SEMA_V 0, implicit $m0, implicit $exec :: (store (s32) into custom "GWSResource")
+; CHECK-NEXT: S_WAITCNT 0
+; CHECK-NEXT: }
+
+@G = global <32 x i8> splat (i8 1)
+@G.1 = global <32 x i8> splat (i8 127)
+
+define amdgpu_kernel void @gws_sema_v_offset0(i32 %val) #0 {
+ %LGV1 = load <32 x i8>, ptr @G.1, align 32
+ %LGV = load <32 x i8>, ptr @G, align 32
+ call void @llvm.amdgcn.ds.gws.sema.v(i32 0)
+ %C = icmp ne <32 x i8> %LGV, %LGV1
+ store <32 x i1> %C, ptr poison, align 4
+ ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.sema.v(i32) #1
+
+attributes #0 = { convergent nounwind memory(inaccessiblemem: readwrite) }
+attributes #1 = { convergent nocallback nofree nounwind willreturn memory(inaccessiblemem: readwrite) }
+