[llvm] [AMDGPU] Introduce conditional barrier pseudo instruction (PR #171604)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 10 05:21:34 PST 2025


https://github.com/tyb0807 updated https://github.com/llvm/llvm-project/pull/171604

>From c1e03555a69b0de969785b170461f79e28c74207 Mon Sep 17 00:00:00 2001
From: tyb0807 <sontuan.vu at amd.com>
Date: Wed, 10 Dec 2025 06:00:10 -0600
Subject: [PATCH 1/2] [AMDGPU] Introduce conditional barrier pseudo instruction

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   3 +
 .../Target/AMDGPU/AMDGPUExpandCondBarrier.cpp | 147 ++++++++++++++++++
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   5 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |   6 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   8 +
 .../AMDGPU/si-cond-barrier-expansion-only.mir | 117 ++++++++++++++
 7 files changed, 287 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUExpandCondBarrier.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..d5068e0ed0dd6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -53,6 +53,9 @@ FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
 FunctionPass *createSIFormMemoryClausesLegacyPass();
 
 FunctionPass *createSIPostRABundlerPass();
+FunctionPass *createAMDGPUExpandCondBarrierPass();
+extern char &AMDGPUExpandCondBarrierID;
+void initializeAMDGPUExpandCondBarrierPass(PassRegistry &);
 FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
 ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
 FunctionPass *createAMDGPUCodeGenPreparePass();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExpandCondBarrier.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExpandCondBarrier.cpp
new file mode 100644
index 0000000000000..7432d93d06176
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExpandCondBarrier.cpp
@@ -0,0 +1,148 @@
+//===-- AMDGPUExpandCondBarrier.cpp - Expand conditional barriers ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass expands SI_COND_BARRIER pseudo instructions into conditional
+// control flow with actual barrier instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-expand-cond-barrier"
+
+class AMDGPUExpandCondBarrier : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUExpandCondBarrier() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Expand Conditional Barriers";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // We modify the CFG, so don't call setPreservesCFG().
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool expandCondBarrier(MachineBasicBlock &MBB, MachineInstr &MI);
+};
+
+char AMDGPUExpandCondBarrier::ID = 0;
+
+char &llvm::AMDGPUExpandCondBarrierID = AMDGPUExpandCondBarrier::ID;
+
+INITIALIZE_PASS(AMDGPUExpandCondBarrier, DEBUG_TYPE,
+                "Expand conditional barrier pseudo instructions", false, false)
+
+bool AMDGPUExpandCondBarrier::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+
+  // Collect all SI_COND_BARRIER instructions first to avoid iterator
+  // invalidation.
+  SmallVector<MachineInstr *, 4> CondBarriers;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::SI_COND_BARRIER) {
+        CondBarriers.push_back(&MI);
+      }
+    }
+  }
+
+  // Process collected instructions.
+  for (MachineInstr *MI : CondBarriers) {
+    MachineBasicBlock *MBB = MI->getParent();
+    Changed |= expandCondBarrier(*MBB, *MI);
+  }
+
+  return Changed;
+}
+
+bool AMDGPUExpandCondBarrier::expandCondBarrier(MachineBasicBlock &MBB,
+                                                MachineInstr &MI) {
+  MachineFunction *MF = MBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Get the variant that determines barrier execution condition.
+  // This allows complementary thread groups to synchronize on opposite
+  // conditions.
+  unsigned Variant = MI.getOperand(0).getImm();
+
+  // Split current block only if there are instructions after MI.
+  MachineBasicBlock *ContinueMBB = nullptr;
+  if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
+    ContinueMBB = MBB.splitAt(MI, false /*UpdateLiveIns*/);
+  }
+
+  // Build simple linear expansion with proper basic block structure:
+  // Split current block if needed to create continuation block.
+  if (!ContinueMBB) {
+    ContinueMBB = MF->CreateMachineBasicBlock();
+    MF->push_back(ContinueMBB);
+  }
+
+  // Create barrier basic block - insert it immediately after current block
+  // to ensure proper layout for fallthrough.
+  MachineBasicBlock *BarrierMBB = MF->CreateMachineBasicBlock();
+
+  // Insert BarrierMBB right after MBB for proper fallthrough layout.
+  MachineFunction::iterator MBBI = MBB.getIterator();
+  ++MBBI;
+  MF->insert(MBBI, BarrierMBB);
+
+  // 1. Conditional branch to skip barrier based on variant:
+  //    Variant 0: Execute barrier when SCC=1, skip when SCC=0 (use
+  //    S_CBRANCH_SCC0). Variant 1: Execute barrier when SCC=0, skip when SCC=1
+  //    (use S_CBRANCH_SCC1).
+  unsigned BranchOpcode =
+      (Variant == 0) ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1;
+  BuildMI(MBB, &MI, DL, TII->get(BranchOpcode)).addMBB(ContinueMBB);
+  LLVM_DEBUG(dbgs() << "ExpandCondBarrier: Variant " << Variant
+                    << " expansion\n");
+
+  // 2. Insert barrier in fallthrough block.
+  BuildMI(*BarrierMBB, BarrierMBB->end(), DL, TII->get(AMDGPU::S_BARRIER));
+
+  // 3. Add explicit unconditional branch from barrier block to continuation.
+  BuildMI(*BarrierMBB, BarrierMBB->end(), DL, TII->get(AMDGPU::S_BRANCH))
+      .addMBB(ContinueMBB);
+
+  // 4. Set up CFG with both paths: the conditional branch jumps to
+  // ContinueMBB (barrier skipped); otherwise execution falls through to
+  // BarrierMBB (barrier executed). This holds for both variants.
+  MBB.addSuccessor(
+      BarrierMBB); // Barrier path (implicit fallthrough when SCC=1)
+  MBB.addSuccessor(
+      ContinueMBB); // Skip barrier path (explicit branch target when SCC=0)
+  BarrierMBB->addSuccessor(ContinueMBB); // Barrier to continue
+
+  // Remove the pseudo-instruction.
+  MI.eraseFromParent();
+
+  return true;
+}
+
+FunctionPass *llvm::createAMDGPUExpandCondBarrierPass() {
+  return new AMDGPUExpandCondBarrier();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8a831f7915882..872218a714a4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -596,6 +596,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
+  initializeAMDGPUExpandCondBarrierPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
   initializeAMDGPUInsertDelayAluLegacyPass(*PR);
   initializeAMDGPULowerVGPREncodingLegacyPass(*PR);
@@ -1761,6 +1762,10 @@ void GCNPassConfig::addPostRegAlloc() {
   addPass(&SIFixVGPRCopiesID);
   if (getOptLevel() > CodeGenOptLevel::None)
     addPass(&SIOptimizeExecMaskingLegacyID);
+
+  // Add ExpandCondBarrier pass before post-RA pseudo expansion
+  addPass(&AMDGPUExpandCondBarrierID);
+
   TargetPassConfig::addPostRegAlloc();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 782cbfa76e6e9..8eb0cf5b4ea4e 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_target(AMDGPUCodeGen
  AMDGPUCtorDtorLowering.cpp
+  AMDGPUExpandCondBarrier.cpp
  AMDGPUExportClustering.cpp
  AMDGPUExportKernelRuntimeHandles.cpp
  AMDGPUFrameLowering.cpp
   AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6d2110957002a..7d3f47b96958f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2027,6 +2027,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
   switch (MI.getOpcode()) {
   default: return TargetInstrInfo::expandPostRAPseudo(MI);
+
+  case AMDGPU::SI_COND_BARRIER:
+    // SI_COND_BARRIER is handled by the dedicated ExpandCondBarrier pass
+    return false;
+
   case AMDGPU::S_MOV_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
@@ -2104,6 +2109,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
     MI.setDesc(get(AMDGPU::V_READLANE_B32));
     break;
+
   case AMDGPU::AV_MOV_B32_IMM_PSEUDO: {
     Register Dst = MI.getOperand(0).getReg();
     bool IsAGPR = SIRegisterInfo::isAGPRClass(RI.getPhysRegBaseClass(Dst));
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 984d1a4db4cd6..d73f264263a04 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -117,6 +117,14 @@ def ATOMIC_FENCE : SPseudoInstSI<
   let hasSideEffects = 1;
 }
 
+// Conditional barrier pseudo-instruction, abstracting complex control flow.
+def SI_COND_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$variant), [],
+                                    "SI_COND_BARRIER $variant"> {
+  let Uses = [SCC];
+  let hasSideEffects = 1;
+  let Size = 8; // Expands to ~4 instructions
+}
+
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
 
 // For use in patterns
diff --git a/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir b/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir
new file mode 100644
index 0000000000000..98ae7e5b69550
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir
@@ -0,0 +1,117 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-expand-cond-barrier %s -o - | FileCheck %s
+
+--- |
+  define amdgpu_kernel void @test_single_barrier() {
+  entry:
+    ret void
+  }
+
+  define amdgpu_kernel void @test_multiple_barriers() {
+  entry:
+    ret void
+  }
+
+  define amdgpu_kernel void @test_barrier_with_instructions_after() {
+  entry:
+    ret void
+  }
+
+  define amdgpu_kernel void @test_barrier_in_middle() {
+  entry:
+    ret void
+  }
+...
+
+---
+# CHECK-LABEL: name: test_single_barrier
+# CHECK: bb.0:
+# CHECK: S_CMP_LG_U32
+# CHECK: S_CBRANCH_SCC0 %[[CONTINUE_BB:bb\.[0-9]+]]
+# CHECK: [[BARRIER_BB:bb\.[0-9]+]]:
+# CHECK: S_BARRIER
+# CHECK: S_BRANCH %[[CONTINUE_BB]]
+# CHECK: [[CONTINUE_BB]]:
+# CHECK: S_ENDPGM
+name: test_single_barrier
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %thread_id:sreg_32 = S_MOV_B32 256
+    %0:sreg_32 = S_LSHR_B32 %thread_id, 8, implicit-def $scc
+    S_CMP_LG_U32 %0, 0, implicit-def $scc
+    SI_COND_BARRIER 0, implicit $scc
+    S_ENDPGM 0
+
+---
+# CHECK-LABEL: name: test_multiple_barriers
+# CHECK: bb.0:
+# CHECK: S_CMP_LG_U32
+# CHECK: S_CBRANCH_SCC0 %[[CONTINUE1_BB:bb\.[0-9]+]]
+# CHECK: [[BARRIER1_BB:bb\.[0-9]+]]:
+# CHECK: S_BARRIER
+# CHECK: S_BRANCH %[[CONTINUE1_BB]]
+# CHECK: [[CONTINUE1_BB]]:
+# CHECK: S_CMP_LG_U32
+# CHECK: S_CBRANCH_SCC1 %[[CONTINUE2_BB:bb\.[0-9]+]]
+# CHECK: [[BARRIER2_BB:bb\.[0-9]+]]:
+# CHECK: S_BARRIER
+# CHECK: S_BRANCH %[[CONTINUE2_BB]]
+# CHECK: [[CONTINUE2_BB]]:
+# CHECK: S_ENDPGM
+name: test_multiple_barriers
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 1
+    S_CMP_LG_U32 %0, 0, implicit-def $scc
+    SI_COND_BARRIER 0, implicit $scc
+    %1:sreg_32 = S_MOV_B32 1
+    S_CMP_LG_U32 %1, 0, implicit-def $scc
+    SI_COND_BARRIER 1, implicit $scc
+    S_ENDPGM 0
+
+---
+# CHECK-LABEL: name: test_barrier_with_instructions_after
+# CHECK: bb.0:
+# CHECK: S_CMP_LG_U32
+# CHECK: S_CBRANCH_SCC0 %[[CONTINUE_BB:bb\.[0-9]+]]
+# CHECK: [[BARRIER_BB:bb\.[0-9]+]]:
+# CHECK: S_BARRIER
+# CHECK: S_BRANCH %[[CONTINUE_BB]]
+# CHECK: [[CONTINUE_BB]]:
+# CHECK: S_MOV_B32
+# CHECK: S_ADD_U32
+# CHECK: S_ENDPGM
+name: test_barrier_with_instructions_after
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 1
+    S_CMP_LG_U32 %0, 0, implicit-def $scc
+    SI_COND_BARRIER 0, implicit $scc
+    %1:sreg_32 = S_MOV_B32 42
+    %2:sreg_32 = S_ADD_U32 %1, %1, implicit-def $scc
+    S_ENDPGM 0
+
+---
+# CHECK-LABEL: name: test_barrier_in_middle
+# CHECK: bb.0:
+# CHECK: S_MOV_B32
+# CHECK: S_CMP_LG_U32
+# CHECK: S_CBRANCH_SCC0 %[[CONTINUE_BB:bb\.[0-9]+]]
+# CHECK: [[BARRIER_BB:bb\.[0-9]+]]:
+# CHECK: S_BARRIER
+# CHECK: S_BRANCH %[[CONTINUE_BB]]
+# CHECK: [[CONTINUE_BB]]:
+# CHECK: S_ADD_U32
+# CHECK: S_ENDPGM
+name: test_barrier_in_middle
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:sreg_32 = S_MOV_B32 100
+    %1:sreg_32 = S_MOV_B32 1
+    S_CMP_LG_U32 %1, 0, implicit-def $scc
+    SI_COND_BARRIER 0, implicit $scc
+    %2:sreg_32 = S_ADD_U32 %0, %1, implicit-def $scc
+    S_ENDPGM 0

>From 4ac4e1896b0fed48b03c87db8d990bcd265a83ef Mon Sep 17 00:00:00 2001
From: tyb0807 <sontuan.vu at amd.com>
Date: Wed, 10 Dec 2025 13:58:03 +0100
Subject: [PATCH 2/2] Guard the expansion pass with a flag

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp       | 12 +++++++++---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp               |  6 +++++-
 .../AMDGPU/si-cond-barrier-expansion-only.mir        |  2 +-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 872218a714a4b..1cf1c3a813c87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -239,6 +239,10 @@ static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for WWM registers"));
 
+static cl::opt<bool> EnableCondBarrier(
+    "amdgpu-enable-cond-barrier", cl::Hidden, cl::init(false),
+    cl::desc("Enable SI_COND_BARRIER pseudo instruction expansion"));
+
 static void initializeDefaultSGPRRegisterAllocatorOnce() {
   RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
 
@@ -1763,10 +1767,12 @@ void GCNPassConfig::addPostRegAlloc() {
   if (getOptLevel() > CodeGenOptLevel::None)
     addPass(&SIOptimizeExecMaskingLegacyID);
 
-  // Add ExpandCondBarrier pass before post-RA pseudo expansion
-  addPass(&AMDGPUExpandCondBarrierID);
-
   TargetPassConfig::addPostRegAlloc();
+
+  // Run ExpandCondBarrier after the standard post-RA pseudo expansion;
+  // SI_COND_BARRIER is skipped there (expandPostRAPseudo returns false).
+  if (EnableCondBarrier)
+    addPass(&AMDGPUExpandCondBarrierID);
 }
 
 void GCNPassConfig::addPreSched2() {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7d3f47b96958f..07d3438c7f82b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2029,7 +2029,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   default: return TargetInstrInfo::expandPostRAPseudo(MI);
 
   case AMDGPU::SI_COND_BARRIER:
-    // SI_COND_BARRIER is handled by the dedicated ExpandCondBarrier pass
+    // SI_COND_BARRIER requires a separate pass because its expansion
+    // modifies the CFG (splits basic blocks, creates new blocks, adds
+    // successors), which is not safe to do from within expandPostRAPseudo():
+    // the ExpandPostRAPseudos driver iterates over the existing blocks and
+    // instructions and would be invalidated by CFG modifications.
     return false;
 
   case AMDGPU::S_MOV_B64_term:
diff --git a/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir b/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir
index 98ae7e5b69550..d6860560d1043 100644
--- a/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-cond-barrier-expansion-only.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-expand-cond-barrier %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-enable-cond-barrier -run-pass=amdgpu-expand-cond-barrier %s -o - | FileCheck %s
 
 --- |
   define amdgpu_kernel void @test_single_barrier() {



More information about the llvm-commits mailing list