[llvm] AMDGPU: Refactor lowering of s_barrier to split barriers (PR #154648)
Nicolai Hähnle via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 22 09:12:46 PDT 2025
https://github.com/nhaehnle updated https://github.com/llvm/llvm-project/pull/154648
>From 672744325d25717b52ee97b59cb9b361d5788ccc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle at amd.com>
Date: Thu, 7 Aug 2025 16:44:33 -0700
Subject: [PATCH] AMDGPU: Refactor lowering of s_barrier to split barriers
Move the lowering of non-split barriers into split barriers, and the
downgrading of barriers based on the workgroup size, into a new IR pass,
AMDGPULowerIntrinsics. That way, there is no code duplication between
SelectionDAG and GlobalISel. This simplifies some upcoming extensions to
the code.
v2:
- turn into a Module pass
- also handle the downgrading of barriers for single-wave workgroups in
the IR pass
- add tests for the new pass
(cherry picked from commit e246f42fbdad5667d5a395ce65f4900d67610e72)
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 ++
.../AMDGPU/AMDGPUInstructionSelector.cpp | 37 ----
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 1 -
.../Target/AMDGPU/AMDGPULowerIntrinsics.cpp | 163 ++++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 ----
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 +
.../AMDGPU/lower-intrinsics-barriers.ll | 84 +++++++++
.../AMDGPU/lower-intrinsics-split-barriers.ll | 80 +++++++++
12 files changed, 352 insertions(+), 77 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 0059a862ba9b2..ebe38de1636be 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -62,6 +62,7 @@ FunctionPass *createAMDGPURewriteOutArgumentsPass();
ModulePass *
createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
ModulePass *createAMDGPULowerBufferFatPointersPass();
+ModulePass *createAMDGPULowerIntrinsicsLegacyPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsLegacyPass();
FunctionPass *createAMDGPUPreloadKernArgPrologLegacyPass();
@@ -153,6 +154,16 @@ struct AMDGPULowerBufferFatPointersPass
const TargetMachine &TM;
};
+void initializeAMDGPULowerIntrinsicsLegacyPass(PassRegistry &);
+
+struct AMDGPULowerIntrinsicsPass : PassInfoMixin<AMDGPULowerIntrinsicsPass> {
+ AMDGPULowerIntrinsicsPass(const AMDGPUTargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM);
+
+private:
+ const AMDGPUTargetMachine &TM;
+};
+
void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &);
extern char &AMDGPUPrepareAGPRAllocLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 5d31eed8fe7d7..fac365d015d95 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1989,39 +1989,6 @@ bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
return selectImpl(MI, *CoverageInfo);
}
-bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
- Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
- if (WGSize <= STI.getWavefrontSize()) {
- // If the workgroup fits in a wave, remove s_barrier_signal and lower
- // s_barrier/s_barrier_wait to wave_barrier.
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
- IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
- MachineBasicBlock *MBB = MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
- }
- MI.eraseFromParent();
- return true;
- }
- }
-
- if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- MachineBasicBlock *MBB = MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
- .addImm(AMDGPU::Barrier::WORKGROUP);
- BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
- .addImm(AMDGPU::Barrier::WORKGROUP);
- MI.eraseFromParent();
- return true;
- }
-
- return selectImpl(MI, *CoverageInfo);
-}
-
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
bool &IsTexFail) {
if (TexFailCtrl)
@@ -2338,10 +2305,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
- case Intrinsic::amdgcn_s_barrier:
- case Intrinsic::amdgcn_s_barrier_signal:
- case Intrinsic::amdgcn_s_barrier_wait:
- return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 092439693f399..4db46064999c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -124,7 +124,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectInitWholeWave(MachineInstr &MI) const;
- bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 0000000000000..a14f2e3f31550
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,163 @@
+//===-- AMDGPULowerIntrinsics.cpp -------------------------------------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower intrinsics that would otherwise require separate handling in both
+// SelectionDAG and GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerIntrinsicsImpl {
+public:
+ Module &M;
+ const AMDGPUTargetMachine &TM;
+
+ AMDGPULowerIntrinsicsImpl(Module &M, const AMDGPUTargetMachine &TM)
+ : M(M), TM(TM) {}
+
+ bool run();
+
+private:
+ bool visitBarrier(IntrinsicInst &I);
+};
+
+class AMDGPULowerIntrinsicsLegacy : public ModulePass {
+public:
+ static char ID;
+
+ AMDGPULowerIntrinsicsLegacy() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ }
+};
+
+template <class T> static void forEachCall(Function &Intrin, T Callback) {
+ for (User *U : make_early_inc_range(Intrin.users())) {
+ if (auto *CI = dyn_cast<IntrinsicInst>(U))
+ Callback(CI);
+ }
+}
+
+} // anonymous namespace
+
+bool AMDGPULowerIntrinsicsImpl::run() {
+ bool Changed = false;
+
+ for (Function &F : M) {
+ switch (F.getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_wait:
+ forEachCall(F, [&](IntrinsicInst *II) {
+ if (visitBarrier(*II))
+ Changed = true;
+ });
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+// Downgrade single-wave workgroup barriers; lower s_barrier to split barriers.
+bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
+ assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
+
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
+ bool IsSingleWaveWG = false;
+
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ unsigned WGMaxSize = ST.getFlatWorkGroupSizes(*I.getFunction()).second;
+ IsSingleWaveWG = WGMaxSize <= ST.getWavefrontSize();
+ }
+
+ IRBuilder<> B(&I);
+
+ if (IsSingleWaveWG) {
+ // Down-grade waits, remove split signals.
+ if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
+ I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
+ } else if (I.getIntrinsicID() ==
+ Intrinsic::amdgcn_s_barrier_signal_isfirst) {
+ // If we're the only wave of the workgroup, we're always first.
+ I.replaceAllUsesWith(B.getInt1(true));
+ }
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
+ ST.hasSplitBarriers()) {
+ // Lower to split barriers.
+ Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
+ Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
+ {BarrierID_32});
+ B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
+ {BarrierID_16});
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ AMDGPULowerIntrinsicsImpl Impl(M, TM);
+ if (!Impl.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ const AMDGPUTargetMachine &TM = TPC.getTM<AMDGPUTargetMachine>();
+
+ AMDGPULowerIntrinsicsImpl Impl(M, TM);
+ return Impl.run();
+}
+
+#define PASS_DESC "AMDGPU lower intrinsics"
+INITIALIZE_PASS_BEGIN(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerIntrinsicsLegacy, DEBUG_TYPE, PASS_DESC, false,
+ false)
+
+char AMDGPULowerIntrinsicsLegacy::ID = 0;
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsLegacyPass() {
+ return new AMDGPULowerIntrinsicsLegacy;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 6ddfa386e8ac9..48448833721bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -20,6 +20,7 @@ MODULE_PASS("amdgpu-always-inline", AMDGPUAlwaysInlinePass())
MODULE_PASS("amdgpu-export-kernel-runtime-handles", AMDGPUExportKernelRuntimeHandlesPass())
MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
AMDGPULowerBufferFatPointersPass(*this))
+MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
MODULE_PASS("amdgpu-perf-hint",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e969f9ec88899..4a2f0a13b1325 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -577,6 +577,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
+ initializeAMDGPULowerIntrinsicsLegacyPass(*PR);
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
initializeAMDGPURewriteAGPRCopyMFMALegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
@@ -1418,6 +1419,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
+ addPass(createAMDGPULowerIntrinsicsLegacyPass());
// In accordance with the above FIXME, manually force all the
// function-level passes into a CGSCCPassManager.
addPass(new DummyCGSCCPass());
@@ -2155,9 +2157,10 @@ void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
-
addPass.requireCGSCCOrder();
+ addPass(AMDGPULowerIntrinsicsPass(TM));
+
Base::addCodeGenPrepare(addPass);
if (isPassEnabled(EnableLoadStoreVectorizer))
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dc9dd220130ea..619ff4e5c73c4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp
AMDGPULowerBufferFatPointers.cpp
+ AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f5..e568b2d14d7ef 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10421,41 +10421,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
- case Intrinsic::amdgcn_s_barrier:
- case Intrinsic::amdgcn_s_barrier_signal:
- case Intrinsic::amdgcn_s_barrier_wait: {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
- unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
- if (WGSize <= ST.getWavefrontSize()) {
- // If the workgroup fits in a wave, remove s_barrier_signal and lower
- // s_barrier/s_barrier_wait to wave_barrier.
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier_signal)
- return Op.getOperand(0);
- else
- return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL,
- MVT::Other, Op.getOperand(0)),
- 0);
- }
- }
-
- if (ST.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
- // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
- SDValue K =
- DAG.getSignedTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
- SDValue BarSignal =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
- MVT::Other, K, Op.getOperand(0)),
- 0);
- SDValue BarWait =
- SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
- BarSignal.getValue(0)),
- 0);
- return BarWait;
- }
-
- return SDValue();
- };
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index ceed41f3ed7c5..6df3d255244d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -8,11 +8,11 @@
; RUN: | FileCheck -check-prefix=GCN-O3 %s
-; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O0: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require<uniformity>,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,proc
ess-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O2: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-eli
mination,init-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
-; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs
,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
+; GCN-O3: require<MachineModuleAnalysis>,require<profile-summary>,require<collector-metadata>,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,loop-mssa(licm<allowspeculation>),verify,loop-mssa(canon-freeze,loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument<post-inline>,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,amdgpu-lower-intrinsics,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require<uniformity>,objc-arc-contract,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require<reg-usage>,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,ini
t-undef,process-imp-defs,unreachable-mbb-elimination,require<live-vars>,si-opt-vgpr-liverange,require<machine-loops>,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy<sgpr>,virt-reg-rewriter<no-clear-vregs>,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy<wwm>,si-lower-wwm-copies,virt-reg-rewriter<no-clear-vregs>,amdgpu-reserve-wwm-regs,greedy<vgpr>,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function))
define void @empty() {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 3e17be6b34a57..36231abda87db 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -49,6 +49,7 @@
; GCN-O0-NEXT: Expand reduction intrinsics
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O0-NEXT: AMDGPU lower intrinsics
; GCN-O0-NEXT: CallGraph Construction
; GCN-O0-NEXT: Call Graph SCC Pass Manager
; GCN-O0-NEXT: DummyCGSCCPass
@@ -231,6 +232,7 @@
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O1-NEXT: AMDGPU lower intrinsics
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
; GCN-O1-NEXT: DummyCGSCCPass
@@ -530,6 +532,7 @@
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O1-OPTS-NEXT: AMDGPU lower intrinsics
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
; GCN-O1-OPTS-NEXT: DummyCGSCCPass
@@ -847,6 +850,7 @@
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O2-NEXT: AMDGPU lower intrinsics
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
; GCN-O2-NEXT: DummyCGSCCPass
@@ -1179,6 +1183,7 @@
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
+; GCN-O3-NEXT: AMDGPU lower intrinsics
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
; GCN-O3-NEXT: DummyCGSCCPass
diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll
new file mode 100644
index 0000000000000..bc70c3b36d45a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-barriers.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=0 | FileCheck --check-prefixes=GFX11,GFX11-NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=GFX11,OPT-WAVE32,GFX11-OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=GFX11,OPT-WAVE64,GFX11-OPT-WAVE64 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=0 | FileCheck --check-prefixes=GFX12,GFX12-NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=GFX12,OPT-WAVE32,GFX12-OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=GFX12,OPT-WAVE64,GFX12-OPT-WAVE64 %s
+
+define amdgpu_kernel void @barrier() {
+; GFX11-LABEL: define amdgpu_kernel void @barrier(
+; GFX11-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX11-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-NEXT: ret void
+;
+; GFX12-LABEL: define amdgpu_kernel void @barrier(
+; GFX12-SAME: ) #[[ATTR0:[0-9]+]] {
+; GFX12-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+
+define amdgpu_kernel void @barrier_32threads() "amdgpu-flat-work-group-size"="32,32" {
+; GFX11-NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads(
+; GFX11-NOOPT-SAME: ) #[[ATTR1:[0-9]+]] {
+; GFX11-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-NOOPT-NEXT: ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE32-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE64-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: ret void
+;
+; GFX12-NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads(
+; GFX12-NOOPT-SAME: ) #[[ATTR1:[0-9]+]] {
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-NOOPT-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+
+define amdgpu_kernel void @barrier_64threads() "amdgpu-flat-work-group-size"="64,64" {
+; GFX11-NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX11-NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX11-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-NOOPT-NEXT: ret void
+;
+; GFX11-OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX11-OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX11-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier()
+; GFX11-OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_64threads(
+; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: ret void
+;
+; GFX12-NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX12-NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-NOOPT-NEXT: ret void
+;
+; GFX12-OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads(
+; GFX12-OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; GFX12-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; GFX12-OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; GFX12-OPT-WAVE32-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier()
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-OPT-WAVE64: {{.*}}
+; GFX12-OPT-WAVE64: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll
new file mode 100644
index 0000000000000..69ad4b6793c1d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-intrinsics-split-barriers.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s
+; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s
+
+declare void @foo(i1)
+
+define amdgpu_kernel void @barrier() {
+; CHECK-LABEL: define amdgpu_kernel void @barrier(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; CHECK-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; CHECK-NEXT: call void @foo(i1 [[ISFIRST]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @foo(i1 %isfirst)
+ ret void
+}
+
+define amdgpu_kernel void @barrier_32threads() "amdgpu-flat-work-group-size"="32,32" {
+; NOOPT-LABEL: define amdgpu_kernel void @barrier_32threads(
+; NOOPT-SAME: ) #[[ATTR1:[0-9]+]] {
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; NOOPT-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; NOOPT-NEXT: call void @foo(i1 [[ISFIRST]])
+; NOOPT-NEXT: ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE32-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE32-NEXT: call void @foo(i1 true)
+; OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_32threads(
+; OPT-WAVE64-SAME: ) #[[ATTR1:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: call void @foo(i1 true)
+; OPT-WAVE64-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @foo(i1 %isfirst)
+ ret void
+}
+
+define amdgpu_kernel void @barrier_64threads() "amdgpu-flat-work-group-size"="64,64" {
+; NOOPT-LABEL: define amdgpu_kernel void @barrier_64threads(
+; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; NOOPT-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; NOOPT-NEXT: call void @foo(i1 [[ISFIRST]])
+; NOOPT-NEXT: ret void
+;
+; OPT-WAVE32-LABEL: define amdgpu_kernel void @barrier_64threads(
+; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+; OPT-WAVE32-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+; OPT-WAVE32-NEXT: call void @foo(i1 [[ISFIRST]])
+; OPT-WAVE32-NEXT: ret void
+;
+; OPT-WAVE64-LABEL: define amdgpu_kernel void @barrier_64threads(
+; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
+; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
+; OPT-WAVE64-NEXT: call void @foo(i1 true)
+; OPT-WAVE64-NEXT: ret void
+;
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @foo(i1 %isfirst)
+ ret void
+}
More information about the llvm-commits
mailing list