[clang] d0ee820 - [AMDGPU] Add s_barrier_init|join|leave instructions (#153296)

via cfe-commits cfe-commits at lists.llvm.org
Tue Aug 12 15:07:10 PDT 2025


Author: Stanislav Mekhanoshin
Date: 2025-08-12T15:07:07-07:00
New Revision: d0ee82040cb22ae38c92eb83b0c9ae71ca51a517

URL: https://github.com/llvm/llvm-project/commit/d0ee82040cb22ae38c92eb83b0c9ae71ca51a517
DIFF: https://github.com/llvm/llvm-project/commit/d0ee82040cb22ae38c92eb83b0c9ae71ca51a517.diff

LOG: [AMDGPU] Add s_barrier_init|join|leave instructions (#153296)

Added: 
    llvm/test/CodeGen/AMDGPU/s-barrier.ll

Modified: 
    clang/include/clang/Basic/BuiltinsAMDGPU.def
    clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
    clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/SOPInstructions.td
    llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
    llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
    llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
    llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
    llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
    llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index b16d4a22207a7..f8f55772db8fe 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -503,6 +503,9 @@ TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vv*i", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vv*i", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vv*", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "vIs", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
index 5d86a9b369429..1a5043328895a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
@@ -23,6 +23,13 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global
   *out = *in;
 }
 
+kernel void builtins_amdgcn_s_barrier_leave_err(global int* in, global int* out, int barrier) {
+
+  __builtin_amdgcn_s_barrier_signal(-1);
+  __builtin_amdgcn_s_barrier_leave(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_leave' must be a constant integer}}
+  *out = *in;
+}
+
 void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off)
 {
   __builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}}

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index f7641280715c8..8c02616780182 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -139,6 +139,50 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
   __builtin_amdgcn_s_barrier_wait(1);
 }
 
+// CHECK-LABEL: @test_s_barrier_init(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
+// CHECK-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_init(void *bar, int a)
+{
+  __builtin_amdgcn_s_barrier_init(bar, a);
+}
+
+// CHECK-LABEL: @test_s_barrier_join(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
+// CHECK-NEXT:    store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_join(void *bar)
+{
+  __builtin_amdgcn_s_barrier_join(bar);
+}
+
+// CHECK-LABEL: @test_s_barrier_leave(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.barrier.leave(i16 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_barrier_leave()
+{
+  __builtin_amdgcn_s_barrier_leave(1);
+}
+
 // CHECK-LABEL: @test_s_get_barrier_state(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index be6df257f668b..cf82f7f06a693 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -290,11 +290,29 @@ def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barri
   Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
+// The %barrier and %memberCnt argument must be uniform, otherwise behavior is undefined.
+def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
+  Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
+def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
+  Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+                                IntrNoCallback, IntrNoFree]>;
+
 // void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
 def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
   Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                 IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+
+// void @llvm.amdgcn.s.barrier.leave(i16 %barrierType)
+def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
+  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 // uint32_t @llvm.amdgcn.s.get.barrier.state(i32 %barrierId)
 // The %barrierType argument must be uniform, otherwise behavior is undefined.
 def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 402147abd8891..5d31eed8fe7d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2368,8 +2368,10 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
   case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
     return selectDSBvhStackIntrinsic(I);
+  case Intrinsic::amdgcn_s_barrier_init:
   case Intrinsic::amdgcn_s_barrier_signal_var:
     return selectNamedBarrierInit(I, IntrinsicID);
+  case Intrinsic::amdgcn_s_barrier_join:
   case Intrinsic::amdgcn_s_get_named_barrier_state:
     return selectNamedBarrierInst(I, IntrinsicID);
   case Intrinsic::amdgcn_s_get_barrier_state:
@@ -6772,6 +6774,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
     switch (IntrID) {
     default:
       llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_IMM;
     case Intrinsic::amdgcn_s_get_named_barrier_state:
       return AMDGPU::S_GET_BARRIER_STATE_IMM;
     };
@@ -6779,6 +6783,8 @@ unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
     switch (IntrID) {
     default:
       llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_M0;
     case Intrinsic::amdgcn_s_get_named_barrier_state:
       return AMDGPU::S_GET_BARRIER_STATE_M0;
     };
@@ -6829,8 +6835,11 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit(
       BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
   constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
 
+  unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
+                     ? AMDGPU::S_BARRIER_INIT_M0
+                     : AMDGPU::S_BARRIER_SIGNAL_M0;
   MachineInstrBuilder MIB;
-  MIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_M0));
+  MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
 
   I.eraseFromParent();
   return true;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 31f5ba1dd5040..092439693f399 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -156,6 +156,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
   bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
+  bool selectSBarrierLeave(MachineInstr &I) const;
 
   std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
                                                    bool IsCanonicalizing = true,

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index aa72c3e61f680..e65dd1b04cc48 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -352,7 +352,10 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
     case Intrinsic::amdgcn_s_barrier_signal:
     case Intrinsic::amdgcn_s_barrier_signal_var:
     case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+    case Intrinsic::amdgcn_s_barrier_init:
+    case Intrinsic::amdgcn_s_barrier_join:
     case Intrinsic::amdgcn_s_barrier_wait:
+    case Intrinsic::amdgcn_s_barrier_leave:
     case Intrinsic::amdgcn_s_get_barrier_state:
     case Intrinsic::amdgcn_wave_barrier:
     case Intrinsic::amdgcn_sched_barrier:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 868b1a21e3cd5..237929699dd9d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3342,6 +3342,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
       assert(OpdMapper.getVRegs(1).empty());
       constrainOpWithReadfirstlane(B, MI, 1);
       return;
+    case Intrinsic::amdgcn_s_barrier_join:
+      constrainOpWithReadfirstlane(B, MI, 1);
+      return;
+    case Intrinsic::amdgcn_s_barrier_init:
     case Intrinsic::amdgcn_s_barrier_signal_var:
       constrainOpWithReadfirstlane(B, MI, 1);
       constrainOpWithReadfirstlane(B, MI, 2);
@@ -5515,6 +5519,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_s_sleep_var:
       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
       break;
+    case Intrinsic::amdgcn_s_barrier_join:
+      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+      break;
+    case Intrinsic::amdgcn_s_barrier_init:
     case Intrinsic::amdgcn_s_barrier_signal_var:
       OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ebd79d9e8f5cf..2e76225bbc542 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10825,6 +10825,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain),
                    0);
+  case Intrinsic::amdgcn_s_barrier_init:
   case Intrinsic::amdgcn_s_barrier_signal_var: {
     // these two intrinsics have two operands: barrier pointer and member count
     SDValue Chain = Op->getOperand(0);
@@ -10832,6 +10833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     SDValue BarOp = Op->getOperand(2);
     SDValue CntOp = Op->getOperand(3);
     SDValue M0Val;
+    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
+                       ? AMDGPU::S_BARRIER_INIT_M0
+                       : AMDGPU::S_BARRIER_SIGNAL_M0;
     // extract the BarrierID from bits 4-9 of BarOp
     SDValue BarID;
     BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
@@ -10855,8 +10859,40 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 
     Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
 
-    auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_M0, DL,
-                                     Op->getVTList(), Ops);
+    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+    return SDValue(NewMI, 0);
+  }
+  case Intrinsic::amdgcn_s_barrier_join: {
+    // these three intrinsics have one operand: barrier pointer
+    SDValue Chain = Op->getOperand(0);
+    SmallVector<SDValue, 2> Ops;
+    SDValue BarOp = Op->getOperand(2);
+    unsigned Opc;
+
+    if (isa<ConstantSDNode>(BarOp)) {
+      uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
+      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+
+      // extract the BarrierID from bits 4-9 of the immediate
+      unsigned BarID = (BarVal >> 4) & 0x3F;
+      SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+      Ops.push_back(K);
+      Ops.push_back(Chain);
+    } else {
+      Opc = AMDGPU::S_BARRIER_JOIN_M0;
+
+      // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
+      SDValue M0Val;
+      M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+                          DAG.getShiftAmountConstant(4, MVT::i32, DL));
+      M0Val =
+          SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+                                     DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+                  0);
+      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+    }
+
+    auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
     return SDValue(NewMI, 0);
   }
   case Intrinsic::amdgcn_s_prefetch_data: {

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 4b48fc4ab8824..343e4550e1d93 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2341,6 +2341,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     case AMDGPU::S_MEMREALTIME:
     case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
     case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+    case AMDGPU::S_BARRIER_LEAVE:
     case AMDGPU::S_GET_BARRIER_STATE_M0:
     case AMDGPU::S_GET_BARRIER_STATE_IMM:
       ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 6b9403f9c7a21..18f0e5b9b56bc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -996,6 +996,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isBarrier(unsigned Opcode) const {
     return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
+           Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
+           Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
+           Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
+           Opcode == AMDGPU::S_BARRIER_LEAVE ||
+           Opcode == AMDGPU::S_BARRIER_LEAVE_IMM ||
            Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
   }
 

diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 431d73b9a95b5..c2f4dbfa247d1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -484,6 +484,24 @@ def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (o
   let isConvergent = 1;
 }
 
+def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins),
+  "", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs),
+  (ins SplitBarrier:$src0), "$src0", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
+  "", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
 } // End Uses = [M0]
 
 def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
@@ -501,6 +519,12 @@ def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (out
   let isConvergent = 1;
 }
 
+def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
+  (ins SplitBarrier:$src0), "$src0", []>{
+  let SchedRW = [WriteBarrier];
+  let isConvergent = 1;
+}
+
 } // End has_sdst = 0
 
 def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
@@ -1588,6 +1612,17 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
   let isConvergent = 1;
 }
 
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
+  let SchedRW = [WriteBarrier];
+  let simm16 = 0;
+  let fixed_imm = 1;
+  let isConvergent = 1;
+  let Defs = [SCC];
+}
+
+def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
+    (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
+
 def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
   let SubtargetPredicate = isGFX8Plus;
   let simm16 = 0;
@@ -2144,9 +2179,13 @@ defm S_SENDMSG_RTN_B64            : SOP1_Real_gfx11_gfx12<0x04d>;
 defm S_BARRIER_SIGNAL_M0          : SOP1_M0_Real_gfx12<0x04e>;
 defm S_BARRIER_SIGNAL_ISFIRST_M0  : SOP1_M0_Real_gfx12<0x04f>;
 defm S_GET_BARRIER_STATE_M0       : SOP1_M0_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_M0            : SOP1_M0_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_M0            : SOP1_M0_Real_gfx12<0x052>;
 defm S_BARRIER_SIGNAL_IMM         : SOP1_IMM_Real_gfx12<0x04e>;
 defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_IMM_Real_gfx12<0x04f>;
 defm S_GET_BARRIER_STATE_IMM      : SOP1_IMM_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_IMM           : SOP1_IMM_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_IMM           : SOP1_IMM_Real_gfx12<0x052>;
 defm S_ALLOC_VGPR                 : SOP1_Real_gfx12<0x053>;
 defm S_SLEEP_VAR                  : SOP1_IMM_Real_gfx12<0x058>;
 
@@ -2639,6 +2678,7 @@ multiclass SOPP_Real_32_gfx12<bits<7> op, string name = !tolower(NAME)> {
 }
 
 defm S_BARRIER_WAIT         : SOPP_Real_32_gfx12<0x014>;
+defm S_BARRIER_LEAVE        : SOPP_Real_32_gfx12<0x015>;
 defm S_WAIT_LOADCNT         : SOPP_Real_32_gfx12<0x040>;
 defm S_WAIT_STORECNT        : SOPP_Real_32_gfx12<0x041>;
 defm S_WAIT_SAMPLECNT       : SOPP_Real_32_gfx12<0x042>;

diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
index f437dee253d00..d33a809a2ee88 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
@@ -489,3 +489,126 @@ body: |
     S_ENDPGM 0
 ...
 
+---
+name: skip_barrier_init_imm
+body: |
+  ; CHECK-LABEL: name: skip_barrier_init_imm
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_BARRIER_INIT_IMM -1, implicit $m0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $m0 = S_MOV_B32 -1
+    S_BARRIER_INIT_IMM -1, implicit $m0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_init_m0
+body: |
+  ; CHECK-LABEL: name: skip_barrier_init_m0
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   $m0 = S_MOV_B32 -1
+  ; CHECK-NEXT:   S_BARRIER_INIT_M0 implicit $m0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    $m0 = S_MOV_B32 -1
+    S_BARRIER_INIT_M0 implicit $m0
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_join_imm
+body: |
+  ; CHECK-LABEL: name: skip_barrier_join_imm
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_JOIN_IMM -1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_JOIN_IMM -1
+
+  bb.2:
+    S_ENDPGM 0
+...
+
+---
+name: skip_barrier_leave
+body: |
+  ; CHECK-LABEL: name: skip_barrier_leave
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.2, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   V_NOP_e32 implicit $exec
+  ; CHECK-NEXT:   S_BARRIER_LEAVE implicit-def $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_CBRANCH_EXECZ %bb.2, implicit $exec
+
+  bb.1:
+    successors: %bb.2
+    V_NOP_e32 implicit $exec
+    S_BARRIER_LEAVE implicit-def $scc
+
+  bb.2:
+    S_ENDPGM 0
+...

diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
index 1f2b3e2c31892..7cf8883082458 100644
--- a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -11,12 +11,14 @@
 
 define void @func1() {
     call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     ret void
 }
 
 define void @func2() {
     call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     ret void
 }
@@ -24,6 +26,7 @@ define void @func2() {
 define amdgpu_kernel void @kernel1() #0 {
 ; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
     call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
     call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1)
     %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
@@ -36,6 +39,7 @@ define amdgpu_kernel void @kernel1() #0 {
 define amdgpu_kernel void @kernel2() #0 {
 ; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
     call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
     call void @llvm.amdgcn.s.barrier.wait(i16 1)
 
     call void @func2()
@@ -47,6 +51,9 @@ declare void @llvm.amdgcn.s.barrier.wait(i16) #1
 declare void @llvm.amdgcn.s.barrier.signal(i32) #1
 declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
 declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
 declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
 declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
 

diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
new file mode 100644
index 0000000000000..1821bd45dc1cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -0,0 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+
+define void @func1() {
+; GFX12-SDAG-LABEL: func1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70003
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 3
+; GFX12-SDAG-NEXT:    s_barrier_join m0
+; GFX12-SDAG-NEXT:    s_barrier_wait 1
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70003
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_barrier_join 3
+; GFX12-GISEL-NEXT:    s_barrier_wait 1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    ret void
+}
+
+define void @func2() {
+; GFX12-SDAG-LABEL: func2:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70001
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 1
+; GFX12-SDAG-NEXT:    s_barrier_join m0
+; GFX12-SDAG-NEXT:    s_barrier_wait 1
+; GFX12-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func2:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70001
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_barrier_join 1
+; GFX12-GISEL-NEXT:    s_barrier_wait 1
+; GFX12-GISEL-NEXT:    s_setpc_b64 s[30:31]
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    ret void
+}
+
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel1:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX12-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX12-SDAG-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT:    s_barrier_init m0
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[8:9], s[4:5], 48
+; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_lshr_b32 s2, s2, 4
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_and_b32 s2, s2, 63
+; GFX12-SDAG-NEXT:    s_or_b32 s3, 0x90000, s2
+; GFX12-SDAG-NEXT:    s_cmp_eq_u32 0, 0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX12-SDAG-NEXT:    s_barrier_init m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX12-SDAG-NEXT:    s_barrier_signal -1
+; GFX12-SDAG-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT:    s_barrier_join m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT:    s_barrier_wait 1
+; GFX12-SDAG-NEXT:    s_barrier_leave
+; GFX12-SDAG-NEXT:    s_get_barrier_state s3, m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX12-SDAG-NEXT:    s_get_barrier_state s2, m0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-SDAG-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-SDAG-NEXT:    s_add_co_u32 s2, s2, func1@gotpcrel32@lo+8
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s3, s3, func1@gotpcrel32@hi+16
+; GFX12-SDAG-NEXT:    s_barrier_signal -1
+; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-SDAG-NEXT:    s_barrier_wait -1
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-SDAG-NEXT:    s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT:    s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-SDAG-NEXT:    s_get_barrier_state s0, -1
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel1:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT:    s_load_b32 s0, s[12:13], 0x2c
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT:    s_barrier_init m0
+; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX12-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_lshr_b32 s0, s0, 4
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT:    s_or_b32 s1, s0, 0x90000
+; GFX12-GISEL-NEXT:    s_cmp_eq_u32 0, 0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT:    s_barrier_init m0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0xc0002
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_barrier_signal -1
+; GFX12-GISEL-NEXT:    s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, s0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s12, 48
+; GFX12-GISEL-NEXT:    s_barrier_join m0
+; GFX12-GISEL-NEXT:    s_barrier_wait 1
+; GFX12-GISEL-NEXT:    s_barrier_leave
+; GFX12-GISEL-NEXT:    s_get_barrier_state s0, 2
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_get_barrier_state s0, m0
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s13, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, func1@gotpcrel32@lo+8
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+16
+; GFX12-GISEL-NEXT:    s_barrier_signal -1
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_barrier_wait -1
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s12, 48
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s13, 0
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT:    s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT:    s_get_barrier_state s0, -1
+; GFX12-GISEL-NEXT:    s_endpgm
+    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
+    call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+    call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+    %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+    call void @llvm.amdgcn.s.barrier.leave(i16 1)
+    %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
+    %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+    call void @llvm.amdgcn.s.barrier()
+    call void @func1()
+    call void @func2()
+    %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
+    ret void
+}
+
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel2:
+; GFX12-SDAG:       ; %bb.0:
+; GFX12-SDAG-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX12-SDAG-NEXT:    s_getpc_b64 s[6:7]
+; GFX12-SDAG-NEXT:    s_sext_i32_i16 s7, s7
+; GFX12-SDAG-NEXT:    s_add_co_u32 s6, s6, func2@gotpcrel32@lo+8
+; GFX12-SDAG-NEXT:    s_add_co_ci_u32 s7, s7, func2@gotpcrel32@hi+16
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT:    s_load_b64 s[12:13], s[6:7], 0x0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 0x70002
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[8:9], s[4:5], 48
+; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT:    s_barrier_signal m0
+; GFX12-SDAG-NEXT:    s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX12-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT:    s_barrier_join m0
+; GFX12-SDAG-NEXT:    s_barrier_wait 1
+; GFX12-SDAG-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX12-SDAG-NEXT:    s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel2:
+; GFX12-GISEL:       ; %bb.0:
+; GFX12-GISEL-NEXT:    s_add_co_u32 s8, s4, 48
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s9, s5, 0
+; GFX12-GISEL-NEXT:    s_getpc_b64 s[4:5]
+; GFX12-GISEL-NEXT:    s_sext_i32_i16 s5, s5
+; GFX12-GISEL-NEXT:    s_add_co_u32 s4, s4, func2@gotpcrel32@lo+8
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+16
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT:    s_load_b64 s[12:13], s[4:5], 0x0
+; GFX12-GISEL-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX12-GISEL-NEXT:    s_mov_b32 m0, 0x70002
+; GFX12-GISEL-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX12-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT:    s_barrier_signal m0
+; GFX12-GISEL-NEXT:    s_barrier_join 2
+; GFX12-GISEL-NEXT:    s_barrier_wait 1
+; GFX12-GISEL-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; GFX12-GISEL-NEXT:    s_endpgm
+    call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+    call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+    call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+    call void @func2()
+    ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }

diff  --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index b7d4764335a53..949847e2dec3b 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -725,6 +725,24 @@ s_barrier_signal_isfirst -1
 s_barrier_signal_isfirst m0
 // GFX12: s_barrier_signal_isfirst m0             ; encoding: [0x7d,0x4f,0x80,0xbe]
 
+s_barrier_init -1
+// GFX12: s_barrier_init -1                       ; encoding: [0xc1,0x51,0x80,0xbe]
+
+s_barrier_init -2
+// GFX12: s_barrier_init -2                       ; encoding: [0xc2,0x51,0x80,0xbe]
+
+s_barrier_init m0
+// GFX12: s_barrier_init m0                       ; encoding: [0x7d,0x51,0x80,0xbe]
+
+s_barrier_join -1
+// GFX12: s_barrier_join -1                       ; encoding: [0xc1,0x52,0x80,0xbe]
+
+s_barrier_join -2
+// GFX12: s_barrier_join -2                       ; encoding: [0xc2,0x52,0x80,0xbe]
+
+s_barrier_join m0
+// GFX12: s_barrier_join m0                       ; encoding: [0x7d,0x52,0x80,0xbe]
+
 s_get_barrier_state s3, -1
 // GFX12: s_get_barrier_state s3, -1              ; encoding: [0xc1,0x50,0x83,0xbe]
 

diff  --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
index a58d68cb30cb1..75c3d6e6368df 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
@@ -76,6 +76,9 @@ s_barrier_wait 0xffff
 s_barrier_wait 1
 // GFX12: encoding: [0x01,0x00,0x94,0xbf]
 
+s_barrier_leave
+// GFX12: encoding: [0x00,0x00,0x95,0xbf]
+
 //===----------------------------------------------------------------------===//
 // s_waitcnt
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index 1016d07df4995..c88fbc2fc6c4f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -719,6 +719,24 @@
 0x7d,0x4f,0x80,0xbe
 # GFX12: s_barrier_signal_isfirst m0             ; encoding: [0x7d,0x4f,0x80,0xbe]
 
+0xc1,0x51,0x80,0xbe
+# GFX12: s_barrier_init -1                       ; encoding: [0xc1,0x51,0x80,0xbe]
+
+0xc2,0x51,0x80,0xbe
+# GFX12: s_barrier_init -2                       ; encoding: [0xc2,0x51,0x80,0xbe]
+
+0x7d,0x51,0x80,0xbe
+# GFX12: s_barrier_init m0                       ; encoding: [0x7d,0x51,0x80,0xbe]
+
+0xc1,0x52,0x80,0xbe
+# GFX12: s_barrier_join -1                       ; encoding: [0xc1,0x52,0x80,0xbe]
+
+0xc2,0x52,0x80,0xbe
+# GFX12: s_barrier_join -2                       ; encoding: [0xc2,0x52,0x80,0xbe]
+
+0x7d,0x52,0x80,0xbe
+# GFX12: s_barrier_join m0                       ; encoding: [0x7d,0x52,0x80,0xbe]
+
 0xc1,0x50,0x83,0xbe
 # GFX12: s_get_barrier_state s3, -1              ; encoding: [0xc1,0x50,0x83,0xbe]
 

diff  --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
index f8aef72678623..1e40ea27c47d3 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
@@ -68,6 +68,9 @@
 # GFX12: s_barrier_wait 1                        ; encoding: [0x01,0x00,0x94,0xbf]
 0x01,0x00,0x94,0xbf
 
+# GFX12: s_barrier_leave                         ; encoding: [0x00,0x00,0x95,0xbf]
+0x00,0x00,0x95,0xbf
+
 # GFX12: s_branch 0                              ; encoding: [0x00,0x00,0xa0,0xbf]
 0x00,0x00,0xa0,0xbf
 


        


More information about the cfe-commits mailing list