[libcxx-commits] [clang] [compiler-rt] [llvm] [mlir] [flang] [lldb] [clang-tools-extra] [libcxx] [lld] [libc] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
Mariusz Sikora via libcxx-commits
libcxx-commits at lists.llvm.org
Wed Dec 13 03:52:18 PST 2023
https://github.com/mariusz-sikora-at-amd updated https://github.com/llvm/llvm-project/pull/74836
>From f8bf834c64aed1f0b08fdf58bf6f1f43a41c7dc6 Mon Sep 17 00:00:00 2001
From: Vang Thao <Vang.Thao at amd.com>
Date: Thu, 7 Dec 2023 16:19:55 +0100
Subject: [PATCH 1/4] [AMDGPU] GFX12: Add Split Workgroup Barrier
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 16 +
.../builtins-amdgcn-gfx12-err.cl | 24 +
.../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 174 +++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 39 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 152 ++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 45 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 12 +-
.../Disassembler/AMDGPUDisassembler.cpp | 11 +
.../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 1 +
.../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 1 +
llvm/lib/Target/AMDGPU/SIDefines.h | 9 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 112 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 5 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 9 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 7 +
llvm/lib/Target/AMDGPU/SOPInstructions.td | 112 ++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 1 +
.../Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp | 10 +
.../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 77 +
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 1366 +++++++++++++++++
llvm/test/MC/AMDGPU/gfx12_asm_sop1.s | 45 +
llvm/test/MC/AMDGPU/gfx12_asm_sopp.s | 9 +
.../Disassembler/AMDGPU/gfx12_dasm_sop1.txt | 53 +
.../Disassembler/AMDGPU/gfx12_dasm_sopp.txt | 9 +
27 files changed, 2302 insertions(+), 4 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8b59b3790d7bc..7465f13d552d6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+//===----------------------------------------------------------------------===//
+// GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
new file mode 100644
index 0000000000000..5e0153c42825e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+
+kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}}
+ __builtin_amdgcn_s_barrier_wait(-1);
+ *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}}
+ *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}}
+ __builtin_amdgcn_s_barrier_wait(-1);
+ *out = *in;
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
new file mode 100644
index 0000000000000..b8d281531e218
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -0,0 +1,174 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_s_barrier_signal(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal()
+{
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_wait(-1);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_var(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal_var(int a)
+{
+ __builtin_amdgcn_s_barrier_signal_var(a);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_isfirst(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
+{
+ if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
+ a = b;
+ else
+ a = c;
+
+ __builtin_amdgcn_s_barrier_wait(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_isfirst_var(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
+// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
+{
+ if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d))
+ a = b;
+ else
+ a = c;
+
+ __builtin_amdgcn_s_barrier_wait(1);
+
+}
+
+// CHECK-LABEL: @test_s_barrier_init(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_init(int a)
+{
+ __builtin_amdgcn_s_barrier_init(1, a);
+}
+
+// CHECK-LABEL: @test_s_barrier_join(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_join()
+{
+ __builtin_amdgcn_s_barrier_join(1);
+}
+
+// CHECK-LABEL: @test_s_wakeup_barrier(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
+// CHECK-NEXT: ret void
+//
+void test_s_wakeup_barrier()
+{
+  __builtin_amdgcn_s_wakeup_barrier(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_leave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
+// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_leave(int* a, int* b, int *c)
+{
+ if (__builtin_amdgcn_s_barrier_leave())
+ a = b;
+ else
+ a = c;
+}
+
+// CHECK-LABEL: @test_s_get_barrier_state(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
+// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+unsigned test_s_get_barrier_state(int a)
+{
+ unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
+ return State;
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bc9f99783d98f..09e88152e65d2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
+ Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
+ Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
+ Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d24c7da964ce8..75fac09d0b99f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1791,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
return true;
}
}
+
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+ if (STI.hasSplitBarriers()) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ MI.eraseFromParent();
+ return true;
+ }
+
return selectImpl(MI, *CoverageInfo);
}
@@ -2137,6 +2150,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return selectNamedBarrierInst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ return selectSBarrierSignalIsfirst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_leave:
+ return selectSBarrierLeave(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -5239,6 +5262,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
}};
}
+bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  // Emit the signal instruction, then copy SCC into the intrinsic's def.
+  const DebugLoc &Loc = I.getDebugLoc();
+  MachineBasicBlock &Parent = *I.getParent();
+  Register ResultReg = I.getOperand(0).getReg();
+
+  if (IntrID != Intrinsic::amdgcn_s_barrier_signal_isfirst_var) {
+    // Immediate form: the barrier id is an operand of the instruction.
+    BuildMI(Parent, &I, Loc, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+        .addImm(I.getOperand(2).getImm());
+  } else {
+    // Variable form: the barrier id is copied into M0 first.
+    auto M0Copy = BuildMI(Parent, &I, Loc, TII.get(AMDGPU::COPY), AMDGPU::M0)
+                      .addReg(I.getOperand(2).getReg());
+    BuildMI(Parent, &I, Loc, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
+    if (!constrainSelectedInstRegOperands(*M0Copy, TII, TRI, RBI))
+      return false;
+  }
+  BuildMI(Parent, &I, Loc, TII.get(AMDGPU::COPY), ResultReg)
+      .addReg(AMDGPU::SCC);
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(ResultReg,
+                                      AMDGPU::SReg_32_XM0_XEXECRegClass, *MRI);
+}
+
+/// Map a named-barrier intrinsic to its _IMM or _M0 machine opcode.
+static unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
+  if (HasInlineConst) {
+    switch (IntrID) {
+    default:
+      llvm_unreachable("not a named barrier op");
+    case Intrinsic::amdgcn_s_barrier_init:
+      return AMDGPU::S_BARRIER_INIT_IMM;
+    case Intrinsic::amdgcn_s_barrier_join:
+      return AMDGPU::S_BARRIER_JOIN_IMM;
+    case Intrinsic::amdgcn_s_wakeup_barrier:
+      return AMDGPU::S_WAKEUP_BARRIER_IMM;
+    case Intrinsic::amdgcn_s_get_barrier_state:
+      return AMDGPU::S_GET_BARRIER_STATE_IMM;
+    }
+  }
+  switch (IntrID) {
+  default:
+    llvm_unreachable("not a named barrier op");
+  case Intrinsic::amdgcn_s_barrier_init:
+    return AMDGPU::S_BARRIER_INIT_M0;
+  case Intrinsic::amdgcn_s_barrier_join:
+    return AMDGPU::S_BARRIER_JOIN_M0;
+  case Intrinsic::amdgcn_s_wakeup_barrier:
+    return AMDGPU::S_WAKEUP_BARRIER_M0;
+  case Intrinsic::amdgcn_s_get_barrier_state:
+    return AMDGPU::S_GET_BARRIER_STATE_M0;
+  }
+}
+
+bool AMDGPUInstructionSelector::selectNamedBarrierInst(
+    MachineInstr &I, Intrinsic::ID IntrID) const {
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+                             ? I.getOperand(2)
+                             : I.getOperand(1);
+  std::optional<int64_t> BarValImm =
+      getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+  Register M0Val;
+  Register TmpReg0;
+
+  // For S_BARRIER_INIT, member count will always be read from M0[16:22]
+  if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+    Register MemberCount = I.getOperand(2).getReg();
+    TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    // S_LSHL_B32 computes src0 << src1, so the member count must come first.
+    // TODO: Expand during legalization so S_LSHL and S_OR can be constant-folded
+    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+        .addReg(MemberCount)
+        .addImm(16);
+    M0Val = TmpReg0;
+  }
+
+  // If not inlinable, get reference to barrier depending on the instruction
+  if (!BarValImm) {
+    if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+      // If reference to barrier id is not an inlinable constant then it must be
+      // referenced with M0[4:0]. Perform an OR with the member count to include
+      // it in M0 for S_BARRIER_INIT.
+      Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
+          .addReg(BarOp.getReg())
+          .addReg(TmpReg0);
+      M0Val = TmpReg1;
+    } else {
+      M0Val = BarOp.getReg();
+    }
+  }
+
+  // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
+  if (M0Val) {
+    auto CopyMIB =
+        BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+    if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
+      return false;
+  }
+
+  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
+  MachineInstrBuilder MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+  if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
+    MIB.addDef(I.getOperand(0).getReg());
+
+  if (BarValImm)
+    MIB.addImm(*BarValImm);
+
+  I.eraseFromParent();
+  return true;
+}
+bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
+  // Emit S_BARRIER_LEAVE, then copy SCC into the intrinsic's def.
+  const DebugLoc &Loc = I.getDebugLoc();
+  MachineBasicBlock &Parent = *I.getParent();
+  Register ResultReg = I.getOperand(0).getReg();
+  BuildMI(Parent, &I, Loc, TII.get(AMDGPU::S_BARRIER_LEAVE));
+  BuildMI(Parent, &I, Loc, TII.get(AMDGPU::COPY), ResultReg)
+      .addReg(AMDGPU::SCC);
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(ResultReg,
+                                      AMDGPU::SReg_32_XM0_XEXECRegClass, *MRI);
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c93e3de66d405..00ff1747ce57a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -149,6 +149,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
bool selectStackRestore(MachineInstr &MI) const;
+ bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSBarrierLeave(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
bool IsCanonicalizing = true,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79..a722f841cb7d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3101,6 +3101,22 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ constrainOpWithReadfirstlane(B, MI, 1);
+ return;
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ case Intrinsic::amdgcn_s_barrier_init:
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4830,7 +4846,34 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
break;
}
-
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_init:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
+ const unsigned ResultSize = 1;
+ OpdsMapping[0] =
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_leave: {
+ const unsigned ResultSize = 1;
+ OpdsMapping[0] =
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
+ break;
+ }
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 092845d391a3b..03a974d609e58 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -893,6 +893,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
bool isSDelayALU() const;
bool isHwreg() const;
bool isSendMsg() const;
+ bool isSplitBarrier() const;
bool isSwizzle() const;
bool isSMRDOffset8() const;
bool isSMEMOffset() const;
@@ -1856,6 +1857,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -2185,7 +2187,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
- case AMDGPU::OPERAND_KIMM16: {
+ case AMDGPU::OPERAND_KIMM16:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -2226,6 +2229,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -9129,3 +9133,9 @@ bool AMDGPUOperand::isWaitVDST() const {
bool AMDGPUOperand::isWaitEXP() const {
return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
}
+
+//===----------------------------------------------------------------------===//
+// Split Barrier
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSplitBarrier() const { return isInlinableImm(MVT::i32); }
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 1f11beb71101b..9ee92a262d227 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -107,6 +107,13 @@ static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
return addOperand(Inst, DAsm->decodeBoolReg(Val));
}
+static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
+}
+
#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \
uint64_t /*Addr*/, \
@@ -1715,6 +1722,10 @@ MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
: decodeSrcOp(OPW32, Val);
}
+MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 7e233dcb54ea1..233581949d712 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -251,6 +251,7 @@ class AMDGPUDisassembler : public MCDisassembler {
MCOperand decodeSDWAVopcDst(unsigned Val) const;
MCOperand decodeBoolReg(unsigned Val) const;
+ MCOperand decodeSplitBarrier(unsigned Val) const;
int getTTmpIdx(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6..7b3d7a9024dbf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1204,6 +1204,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return hasKernargPreload() && !hasGFX940Insts();
}
+ // \returns true if the target has split barriers feature
+ bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
+
// \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
bool hasCvtFP8VOP1Bug() const { return true; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 7ba015cdea241..8e9a631854118 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -699,6 +699,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case MCOI::OPERAND_IMMEDIATE:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
printImmediate32(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_IMM_INT64:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 80e7ca2b39d1b..b403d69d9ff13 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -262,6 +262,7 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT64:
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 47dc59e77dc4e..e0f3d0dfd1f25 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -213,6 +213,9 @@ enum OperandType : unsigned {
OPERAND_REG_INLINE_C_V2INT32,
OPERAND_REG_INLINE_C_V2FP32,
+ // Operand for split barrier inline constant
+ OPERAND_INLINE_SPLIT_BARRIER_INT32,
+
/// Operand with 32-bit immediate that uses the constant bus.
OPERAND_KIMM32,
OPERAND_KIMM16,
@@ -1025,6 +1028,12 @@ enum Register_Flag : uint8_t {
} // namespace AMDGPU
+namespace AMDGPU {
+namespace Barrier {
+enum Type { TRAP = -2, WORKGROUP = -1 };
+} // namespace Barrier
+} // namespace AMDGPU
+
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a7f4d63229b7e..81604b7a9fe27 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8612,6 +8612,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ unsigned Opc;
+ bool IsInlinableBarID = false;
+ int64_t BarID;
+
+ if (isa<ConstantSDNode>(Op->getOperand(2))) {
+ BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
+ }
+
+ if (IsInlinableBarID) {
+ Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
+ SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+ Ops.push_back(K);
+ } else {
+ Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
+ Ops.push_back(M0Val.getValue(0));
+ }
+
+ auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -8789,13 +8814,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
Op.getOperand(0)), 0);
}
+
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+ if (ST.hasSplitBarriers()) {
+ SDValue K =
+ DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
+ SDValue BarSignal =
+ SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
+ MVT::Other, K, Op.getOperand(0)),
+ 0);
+ SDValue BarWait =
+ SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
+ BarSignal.getValue(0)),
+ 0);
+ return BarWait;
+ }
+
return SDValue();
};
case Intrinsic::amdgcn_tbuffer_store: {
@@ -9181,7 +9222,85 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
+  case Intrinsic::amdgcn_s_barrier_init:
+  case Intrinsic::amdgcn_s_barrier_join:
+  case Intrinsic::amdgcn_s_wakeup_barrier: {
+    // Pick the *_IMM opcode when the barrier ID is an inlinable integer
+    // constant and the *_M0 opcode otherwise. s_barrier_init additionally
+    // passes its member count through M0 in both forms.
+    SDValue Chain = Op->getOperand(0);
+    SmallVector<SDValue, 2> Ops;
+    SDValue BarOp = Op->getOperand(2);
+    unsigned Opc;
+    bool IsInlinableBarID = false;
+    int64_t BarVal = 0;
+
+    if (isa<ConstantSDNode>(BarOp)) {
+      BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
+      IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
+    }
+
+    if (IsInlinableBarID) {
+      switch (IntrinsicID) {
+      default:
+        return SDValue();
+      case Intrinsic::amdgcn_s_barrier_init:
+        Opc = AMDGPU::S_BARRIER_INIT_IMM;
+        break;
+      case Intrinsic::amdgcn_s_barrier_join:
+        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+        break;
+      case Intrinsic::amdgcn_s_wakeup_barrier:
+        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
+        break;
+      }
+
+      SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
+      Ops.push_back(K);
+    } else {
+      switch (IntrinsicID) {
+      default:
+        return SDValue();
+      case Intrinsic::amdgcn_s_barrier_init:
+        Opc = AMDGPU::S_BARRIER_INIT_M0;
+        break;
+      case Intrinsic::amdgcn_s_barrier_join:
+        Opc = AMDGPU::S_BARRIER_JOIN_M0;
+        break;
+      case Intrinsic::amdgcn_s_wakeup_barrier:
+        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
+        break;
+      }
+    }
+
+    if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
+      // Member count will be read from M0[22:16].
+      SDValue M0Val =
+          DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
+                      DAG.getShiftAmountConstant(16, MVT::i32, DL));
+
+      if (!IsInlinableBarID) {
+        // If reference to barrier id is not an inline constant then it must be
+        // referenced with M0[4:0]. Perform an OR with the member count to
+        // include it in M0.
+        M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
+                                           BarOp, M0Val),
+                        0);
+      }
+      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+    } else if (!IsInlinableBarID) {
+      Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
+    } else {
+      // The immediate join/wakeup forms need no M0 copy, so nothing above
+      // has consumed the incoming chain. Pass it directly; the node returns
+      // a chain result and must stay ordered after earlier side effects.
+      Ops.push_back(Chain);
+    }
+
+    auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+    return SDValue(NewMI, 0);
+  }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ede4841b8a5fd..13426e7321b67 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1504,6 +1504,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+ case AMDGPU::S_BARRIER_LEAVE:
+ case AMDGPU::S_GET_BARRIER_STATE_M0:
+ case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0a06fa88b6b10..9ce7a855a3521 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4118,7 +4118,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
@@ -4559,6 +4560,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
break;
}
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
+ if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
+ ErrInfo = "Expected inline constant for operand.";
+ return false;
+ }
+ break;
case MCOI::OPERAND_IMMEDIATE:
case AMDGPU::OPERAND_KIMM32:
// Check if this operand is an immediate.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 091b40eefa55a..ea346f25fa72c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -919,6 +919,13 @@ def InterpAttr : CustomOperand<i32>;
def InterpAttrChan : ImmOperand<i32>;
+def SplitBarrier : ImmOperand<i32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_INLINE_SPLIT_BARRIER_INT32";
+ let DecoderMethod = "decodeSplitBarrier";
+ let PrintMethod = "printOperand";
+}
+
def VReg32OrOffClass : AsmOperandClass {
let Name = "VReg32OrOff";
let ParserMethod = "parseVReg32OrOff";
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9ff64968ef01b..6db1259e28889 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -438,6 +438,89 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
// SchedRW = [WriteSFPU], isReMaterializable = 1
+let hasSideEffects = 1 in {
+let has_sdst = 0 in {
+let Uses = [M0] in {
+def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
+ "", [(int_amdgcn_s_barrier_signal_var M0)]>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
+ "", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst_var M0))]>{
+ let Defs = [SCC];
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+} // End Uses = [M0]
+
+def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
+ (ins SplitBarrier:$src0), "$src0", [(int_amdgcn_s_barrier_signal timm:$src0)]>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
+ (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
+ let Defs = [SCC];
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+
+
+}
+} // End has_sdst = 0
+
+def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
+ (ins SplitBarrier:$src0), "$sdst, $src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_GET_BARRIER_STATE_M0 : SOP1_Pseudo <"s_get_barrier_state $sdst, m0", (outs SSrc_b32:$sdst),
+ (ins), "", []>{
+ let Uses = [M0];
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+} // End hasSideEffects = 1
+
//===----------------------------------------------------------------------===//
// SOP2 Instructions
//===----------------------------------------------------------------------===//
@@ -1473,6 +1556,21 @@ def S_BARRIER : SOPP_Pseudo <"s_barrier", (ins), "",
let isConvergent = 1;
}
+def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_barrier_wait timm:$simm16)]> {
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
+ [(set SCC, (int_amdgcn_s_barrier_leave))]> {
+ let SchedRW = [WriteBarrier];
+ let simm16 = 0;
+ let fixed_imm = 1;
+ let isConvergent = 1;
+ let Defs = [SCC];
+}
+
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
@@ -1878,6 +1976,18 @@ defm S_SWAPPC_B64 : SOP1_Real_gfx11_gfx12<0x049>;
defm S_RFE_B64 : SOP1_Real_gfx11_gfx12<0x04a>;
defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>;
defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>;
+defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>;
+defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>;
+defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>;
+defm S_BARRIER_SIGNAL_IMM : SOP1_Real_gfx12<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Real_gfx12<0x04f>;
+defm S_GET_BARRIER_STATE_IMM : SOP1_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_IMM : SOP1_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_IMM : SOP1_Real_gfx12<0x052>;
+defm S_WAKEUP_BARRIER_IMM : SOP1_Real_gfx12<0x057>;
//===----------------------------------------------------------------------===//
// SOP1 - GFX1150, GFX12
@@ -2365,6 +2475,8 @@ multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, st
}
defm S_WAIT_ALU : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">;
+defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
+defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
//===----------------------------------------------------------------------===//
// SOPP - GFX11, GFX12.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 436a0ff5ce268..f5044abaeacf4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1229,6 +1229,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16: // mandatory literal is always size 4
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index cbdbf1c16f9f0..25e628e5cbc55 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -74,6 +74,16 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_barrier_wait:
+ case Intrinsic::amdgcn_s_barrier_leave:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
case Intrinsic::amdgcn_sched_group_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 48c4e0276edda..f3ee7f3819b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -3,6 +3,9 @@
; RUN: llc -march=amdgcn -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT1 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT2 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT3 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT4 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+auto-waitcnt-before-barrier -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT5 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=VARIANT6 %s
define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT0-LABEL: test_barrier:
@@ -85,6 +88,80 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3]
; VARIANT3-NEXT: s_endpgm
+;
+; VARIANT4-LABEL: test_barrier:
+; VARIANT4: ; %bb.0: ; %entry
+; VARIANT4-NEXT: s_clause 0x1
+; VARIANT4-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; VARIANT4-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT4-NEXT: s_waitcnt lgkmcnt(0)
+; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2
+; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT: s_barrier_signal -1
+; VARIANT4-NEXT: s_barrier_wait -1
+; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; VARIANT4-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1
+; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
+; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off
+; VARIANT4-NEXT: s_waitcnt vmcnt(0)
+; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1]
+; VARIANT4-NEXT: s_nop 0
+; VARIANT4-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; VARIANT4-NEXT: s_endpgm
+;
+; VARIANT5-LABEL: test_barrier:
+; VARIANT5: ; %bb.0: ; %entry
+; VARIANT5-NEXT: s_clause 0x1
+; VARIANT5-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; VARIANT5-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; VARIANT5-NEXT: s_waitcnt lgkmcnt(0)
+; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2
+; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1]
+; VARIANT5-NEXT: s_barrier_signal -1
+; VARIANT5-NEXT: s_barrier_wait -1
+; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; VARIANT5-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1
+; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
+; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off
+; VARIANT5-NEXT: s_waitcnt vmcnt(0)
+; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1]
+; VARIANT5-NEXT: s_nop 0
+; VARIANT5-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; VARIANT5-NEXT: s_endpgm
+;
+; VARIANT6-LABEL: test_barrier:
+; VARIANT6: ; %bb.0: ; %entry
+; VARIANT6-NEXT: s_clause 0x1
+; VARIANT6-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; VARIANT6-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0
+; VARIANT6-NEXT: s_waitcnt lgkmcnt(0)
+; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1
+; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0
+; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0
+; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT: s_barrier_signal -1
+; VARIANT6-NEXT: s_barrier_wait -1
+; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; VARIANT6-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1
+; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
+; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off
+; VARIANT6-NEXT: s_waitcnt vmcnt(0)
+; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1]
+; VARIANT6-NEXT: s_nop 0
+; VARIANT6-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; VARIANT6-NEXT: s_endpgm
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
new file mode 100644
index 0000000000000..1ad3e58ce7fc3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -0,0 +1,1366 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GLOBAL-ISEL %s
+
+define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_barrier_signal -1
+; GCN-NEXT: s_barrier_wait -1
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_signal -1
+; GLOBAL-ISEL-NEXT: s_barrier_wait -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_barrier_signal:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_barrier_signal 1
+; GCN-NEXT: s_barrier_wait 1
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_signal:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_signal 1
+; GLOBAL-ISEL-NEXT: s_barrier_wait 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.signal(i32 1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test3_s_barrier_signal:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_barrier_signal 0
+; GCN-NEXT: s_barrier_wait 0
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_barrier_signal:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_signal 0
+; GLOBAL-ISEL-NEXT: s_barrier_wait 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.signal(i32 0)
+ call void @llvm.amdgcn.s.barrier.wait(i16 0)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal_var:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT: s_mov_b32 m0, 1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT: s_barrier_signal m0
+; GCN-NEXT: s_barrier_wait 1
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_signal m0
+; GLOBAL-ISEL-NEXT: s_barrier_wait 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.signal.var(i32 1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define void @test2_s_barrier_signal_var(i32 %arg) {
+; GCN-LABEL: test2_s_barrier_signal_var:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_barrier_signal m0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var:
+; GLOBAL-ISEL: ; %bb.0:
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT: s_barrier_signal m0
+; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
+ ret void
+}
+
+define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal_isfirst:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_barrier_signal_isfirst -1
+; GCN-NEXT: s_cselect_b32 s3, s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ %0 = load i32, ptr addrspace(1) %a, align 4
+ %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
+ %1 = load i32, ptr addrspace(1) %b.c, align 4
+ %mul1 = mul nsw i32 %1, %0
+ store i32 %mul1, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_barrier_signal_isfirst:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_barrier_signal_isfirst 1
+; GCN-NEXT: s_cselect_b32 s3, s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+ %0 = load i32, ptr addrspace(1) %a, align 4
+ %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
+ %1 = load i32, ptr addrspace(1) %b.c, align 4
+ %mul1 = mul nsw i32 %1, %0
+ store i32 %mul1, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test3_s_barrier_signal_isfirst:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_barrier_signal_isfirst 0
+; GCN-NEXT: s_cselect_b32 s3, s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 0)
+ %0 = load i32, ptr addrspace(1) %a, align 4
+ %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
+ %1 = load i32, ptr addrspace(1) %b.c, align 4
+ %mul1 = mul nsw i32 %1, %0
+ store i32 %mul1, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; `.var` form of the intrinsic called with the constant 1. The checks expect
+; the operand to be placed in m0 (`s_mov_b32 m0, 1`) and the instruction to be
+; emitted as `s_barrier_signal_isfirst m0`; the returned i1 again drives the
+; select between %b and %c.
+define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_signal_isfirst_var:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: s_mov_b32 m0, 1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_barrier_signal_isfirst m0
+; GCN-NEXT: s_cselect_b32 s3, s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 1)
+ %0 = load i32, ptr addrspace(1) %a, align 4
+ %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
+ %1 = load i32, ptr addrspace(1) %b.c, align 4
+ %mul1 = mul nsw i32 %1, %0
+ store i32 %mul1, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; `.var` form called from a non-kernel function with a runtime operand (%arg).
+; The checks expect the operand (v6) to reach m0 through v_readfirstlane_b32
+; (via s0 in the GCN checks, directly in the GLOBAL-ISEL checks) and the i1
+; result to be turned into a vcc_lo mask that drives v_cndmask selects.
+define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) {
+; GCN-LABEL: test2_s_barrier_signal_isfirst_var:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31
+; GCN-NEXT: v_readfirstlane_b32 s0, v6
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 2, v9
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9
+; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GCN-NEXT: global_store_b32 v[7:8], v10, off
+; GCN-NEXT: s_barrier_signal_isfirst m0
+; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: global_load_b32 v1, v[2:3], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v0, v1, v0
+; GCN-NEXT: global_store_b32 v[7:8], v0, off
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var:
+; GLOBAL-ISEL: ; %bb.0:
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v6
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9
+; GLOBAL-ISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9
+; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off
+; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0
+; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
+; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v[2:3], off
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 %arg)
+ %1 = load i32, ptr addrspace(1) %a, align 4
+ %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
+ %2 = load i32, ptr addrspace(1) %b.c, align 4
+ %mul1 = mul nsw i32 %2, %1
+ store i32 %mul1, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.init with the constant first operand -1 and a runtime
+; second operand (%mbrCnt), placed between two stores to the same address.
+; The checks expect %mbrCnt to be combined with a shift amount of 16 into m0
+; and the constant to appear as the instruction operand (`s_barrier_init -1`).
+; NOTE(review): the GLOBAL-ISEL checks expect `s_lshl_b32 m0, 16, s2`, whose
+; source operands are in the opposite order from the GCN checks
+; (`s_lshl_b32 s2, s2, 16`). That reads as 16 << mbrCnt rather than
+; mbrCnt << 16 -- confirm against the ISA before treating it as correct.
+define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
+; GCN-LABEL: test1_s_barrier_init:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s2, s2, 16
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_barrier_init -1
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_init:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_init -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.init(i32 -1, i32 %mbrCnt)
+ %tmp2 = mul i32 %tmp, %tmp
+ %tmp3 = sub i32 %tmp2, %tmp
+ store i32 %tmp3, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.init with the constant first operand 1 and a runtime
+; second operand (%mbrCnt). The checks expect %mbrCnt to be combined with a
+; shift amount of 16 into m0, followed by `s_barrier_init 1`.
+; NOTE(review): the GLOBAL-ISEL checks expect `s_lshl_b32 m0, 16, s2`, whose
+; source operands are in the opposite order from the GCN checks
+; (`s_lshl_b32 s2, s2, 16`) -- confirm which order is correct.
+define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
+; GCN-LABEL: test2_s_barrier_init:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s2, s2, 16
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_barrier_init 1
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_init:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_init 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.init(i32 1, i32 %mbrCnt)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.init with the constant first operand 0 and a runtime
+; second operand (%mbrCnt). The checks expect %mbrCnt to be combined with a
+; shift amount of 16 into m0, followed by `s_barrier_init 0`.
+; NOTE(review): the GLOBAL-ISEL checks expect `s_lshl_b32 m0, 16, s2`, whose
+; source operands are in the opposite order from the GCN checks
+; (`s_lshl_b32 s2, s2, 16`) -- confirm which order is correct.
+define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
+; GCN-LABEL: test3_s_barrier_init:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s2, s2, 16
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_barrier_init 0
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_barrier_init:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_init 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.init(i32 0, i32 %mbrCnt)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.init with both operands (%bar, %mbrCnt) supplied as
+; kernel arguments. The checks expect the two values to be merged with a shift
+; and an `s_or_b32` into m0, followed by `s_barrier_init m0`.
+; NOTE(review): the GLOBAL-ISEL checks expect `s_lshl_b32 s3, 16, s3`, whose
+; source operands are in the opposite order from the GCN checks
+; (`s_lshl_b32 s3, s3, 16`) -- confirm which order is correct.
+define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 {
+; GCN-LABEL: test4_s_barrier_init:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s3, s3, 16
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_or_b32 s2, s2, s3
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_barrier_init m0
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test4_s_barrier_init:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_or_b32 m0, s2, s3
+; GLOBAL-ISEL-NEXT: s_barrier_init m0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.init(i32 %bar, i32 %mbrCnt)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.init called from a non-kernel function with both
+; operands arriving in v0/v1. The GCN checks combine them with VALU shift/or
+; and then v_readfirstlane_b32 the result into m0; the GLOBAL-ISEL checks read
+; each operand into an SGPR first and combine them with SALU instructions.
+; NOTE(review): the GLOBAL-ISEL checks expect `s_lshl_b32 s0, 16, s0`, while
+; the GCN checks shift the same argument left by 16
+; (`v_lshlrev_b32_e32 v1, 16, v1`) -- confirm the scalar operand order.
+define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
+; GCN-LABEL: test5_s_barrier_init_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_barrier_init m0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0:
+; GLOBAL-ISEL: ; %bb.0:
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0
+; GLOBAL-ISEL-NEXT: s_barrier_init m0
+; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2)
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.join with the constant operand -1, placed between a
+; store of 0 and a store of %tmp*%tmp - %tmp to the same address. Both check
+; prefixes expect `s_barrier_join -1`.
+; NOTE(review): the GCN checks contain only the final store, with the barrier
+; instruction ahead of it, whereas the GLOBAL-ISEL checks keep both stores
+; around the barrier. Confirm that dropping the first store is intended.
+define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_join:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GCN-NEXT: s_barrier_join -1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_join:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_join -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.join(i32 -1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.join with the constant operand 1, placed between a
+; store of 0 and a store of %tmp*%tmp - %tmp to the same address. Both check
+; prefixes expect `s_barrier_join 1`.
+; NOTE(review): the GCN checks contain only the final store, whereas the
+; GLOBAL-ISEL checks keep both stores around the barrier. Confirm that
+; dropping the first store is intended.
+define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_barrier_join:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GCN-NEXT: s_barrier_join 1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_barrier_join:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_join 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.join(i32 1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.join with the constant operand 0, placed between a
+; store of 0 and a store of %tmp*%tmp - %tmp to the same address. Both check
+; prefixes expect `s_barrier_join 0`.
+; NOTE(review): the GCN checks contain only the final store, whereas the
+; GLOBAL-ISEL checks keep both stores around the barrier. Confirm that
+; dropping the first store is intended.
+define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test3_s_barrier_join:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GCN-NEXT: s_barrier_join 0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_barrier_join:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_join 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.join(i32 0)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.join with a runtime operand (%bar) passed as a kernel
+; argument. Both check prefixes expect the loaded value to be copied into m0
+; (`s_mov_b32 m0, s0`) and the instruction emitted as `s_barrier_join m0`,
+; with the two stores kept on either side of it.
+define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 {
+; GCN-LABEL: test4_s_barrier_join_m0:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_barrier_join m0
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0
+; GLOBAL-ISEL-NEXT: s_barrier_join m0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier.join(i32 %bar)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.join called from a non-kernel function with its
+; operand arriving in v0. The checks expect a v_readfirstlane_b32 into m0
+; (via s0 in the GCN checks, directly in the GLOBAL-ISEL checks) followed by
+; `s_barrier_join m0`.
+define void @test5_s_barrier_join_m0(i32 %arg) {
+; GCN-LABEL: test5_s_barrier_join_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_barrier_join m0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0:
+; GLOBAL-ISEL: ; %bb.0:
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT: s_barrier_join m0
+; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.join(i32 %arg)
+ ret void
+}
+
+; llvm.amdgcn.s.barrier.leave (no operands) called after a store of 0; the
+; returned i1 selects between %b and %c for the second load. Both check
+; prefixes expect a bare `s_barrier_leave` followed by scalar selects; the
+; GLOBAL-ISEL checks additionally materialize the flag in s8 and re-compare it
+; before the 64-bit select.
+define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_barrier_leave:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_barrier_leave
+; GCN-NEXT: s_cselect_b32 s3, s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
+; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
+; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_barrier_leave:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_barrier_leave
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0)
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %0 = call i1 @llvm.amdgcn.s.barrier.leave()
+ %1 = load i32, ptr addrspace(1) %a, align 4
+ %b.c = select i1 %0, ptr addrspace(1) %b, ptr addrspace(1) %c
+ %2 = load i32, ptr addrspace(1) %b.c, align 4
+ %mul1 = mul nsw i32 %2, %1
+ store i32 %mul1, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.wakeup.barrier with the constant operand -1, placed between a
+; store of 0 and a store of %tmp*%tmp - %tmp to the same address. Both check
+; prefixes expect `s_wakeup_barrier -1`.
+; NOTE(review): the GCN checks contain only the final store, with the barrier
+; instruction ahead of it, whereas the GLOBAL-ISEL checks keep both stores
+; around the barrier. Confirm that dropping the first store is intended.
+define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_wakeup_barrier:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GCN-NEXT: s_wakeup_barrier -1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.wakeup.barrier(i32 -1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.wakeup.barrier with the constant operand 1, placed between a
+; store of 0 and a store of %tmp*%tmp - %tmp to the same address. Both check
+; prefixes expect `s_wakeup_barrier 1`.
+; NOTE(review): the GCN checks contain only the final store, whereas the
+; GLOBAL-ISEL checks keep both stores around the barrier. Confirm that
+; dropping the first store is intended.
+define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_wakeup_barrier:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GCN-NEXT: s_wakeup_barrier 1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.wakeup.barrier with the constant operand 0, placed between a
+; store of 0 and a store of %tmp*%tmp - %tmp to the same address. Both check
+; prefixes expect `s_wakeup_barrier 0`.
+; NOTE(review): the GCN checks contain only the final store, whereas the
+; GLOBAL-ISEL checks keep both stores around the barrier. Confirm that
+; dropping the first store is intended.
+define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test3_s_wakeup_barrier:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GCN-NEXT: s_wakeup_barrier 0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.wakeup.barrier(i32 0)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.wakeup.barrier with a runtime operand (%bar) passed as a kernel
+; argument. Both check prefixes expect the loaded value to be copied into m0
+; (`s_mov_b32 m0, s0`) and the instruction emitted as `s_wakeup_barrier m0`,
+; with the two stores kept on either side of it.
+define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 {
+; GCN-LABEL: test4_s_wakeup_barrier_m0:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_wakeup_barrier m0
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0
+; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.wakeup.barrier(i32 %bar)
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+
+; llvm.amdgcn.s.wakeup.barrier called from a non-kernel function with its
+; operand arriving in v0. The checks expect a v_readfirstlane_b32 into m0
+; (via s0 in the GCN checks, directly in the GLOBAL-ISEL checks) followed by
+; `s_wakeup_barrier m0`.
+define void @test5_s_wakeup_barrier_m0(i32 %arg) {
+; GCN-LABEL: test5_s_wakeup_barrier_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_wakeup_barrier m0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0:
+; GLOBAL-ISEL: ; %bb.0:
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0
+; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg)
+ ret void
+}
+
+; llvm.amdgcn.s.get.barrier.state with the constant operand -1; the returned
+; i32 is stored over a preceding store of 0 to the same address. Both check
+; prefixes expect `s_get_barrier_state s2, -1` and the result to be copied
+; from s2 into a VGPR for the store.
+; NOTE(review): the GCN checks place the instruction first and contain only
+; the final store, whereas the GLOBAL-ISEL checks keep the store of 0 ahead of
+; it. Confirm that this difference is intended.
+define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test1_s_get_barrier_state:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_get_barrier_state s2, -1
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
+ store i32 %state, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test2_s_get_barrier_state:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_get_barrier_state s2, 1
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 1)
+ store i32 %state, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test3_s_get_barrier_state:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_get_barrier_state s2, 0
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0)
+ store i32 %state, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 {
+; GCN-LABEL: test4_s_get_barrier_state_m0:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_clause 0x1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_get_barrier_state s0, m0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_clause 0x1
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b32 s0, s[0:1], 0x2c
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s0
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %bar)
+ store i32 %state, ptr addrspace(1) %tmp1
+ ret void
+}
+
+define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
+; GCN-LABEL: test5_s_get_barrier_state_m0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_2)
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_get_barrier_state s0, m0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0:
+; GLOBAL-ISEL: ; %bb.0:
+; GLOBAL-ISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0
+; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31]
+ %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg)
+ ret i32 %state
+}
+
+define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test_barrier_convert:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
+; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: s_barrier_signal -1
+; GCN-NEXT: s_barrier_wait -1
+; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GCN-NEXT: s_endpgm
+;
+; GLOBAL-ISEL-LABEL: test_barrier_convert:
+; GLOBAL-ISEL: ; %bb.0: ; %entry
+; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
+; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
+; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GLOBAL-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: s_barrier_signal -1
+; GLOBAL-ISEL-NEXT: s_barrier_wait -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: s_nop 0
+; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GLOBAL-ISEL-NEXT: s_endpgm
+entry:
+ %tmp = call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
+ store i32 0, ptr addrspace(1) %tmp1
+ call void @llvm.amdgcn.s.barrier()
+ %tmp3 = mul i32 %tmp, %tmp
+ %tmp4 = sub i32 %tmp3, %tmp
+ store i32 %tmp4, ptr addrspace(1) %tmp1
+ ret void
+}
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(i32, i32) #1
+declare void @llvm.amdgcn.s.barrier.join(i32) #1
+declare i1 @llvm.amdgcn.s.barrier.leave() #1
+declare void @llvm.amdgcn.s.wakeup.barrier(i32) #1
+declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
+declare i32 @llvm.amdgcn.s.get.barrier.state.var(i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
index 494b8399a26fb..e9d5ad0828c30 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sop1.s
@@ -684,6 +684,51 @@ s_rndne_f16 s5, 0xfe0b
s_rndne_f16 s5, 0x3456
// GFX12: encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00]
+s_barrier_signal -2
+// GFX12: encoding: [0xc2,0x4e,0x80,0xbe]
+
+s_barrier_signal -1
+// GFX12: encoding: [0xc1,0x4e,0x80,0xbe]
+
+s_barrier_signal m0
+// GFX12: encoding: [0x7d,0x4e,0x80,0xbe]
+
+s_barrier_signal_isfirst -2
+// GFX12: encoding: [0xc2,0x4f,0x80,0xbe]
+
+s_barrier_signal_isfirst -1
+// GFX12: encoding: [0xc1,0x4f,0x80,0xbe]
+
+s_barrier_signal_isfirst m0
+// GFX12: encoding: [0x7d,0x4f,0x80,0xbe]
+
+s_barrier_init -1
+// GFX12: encoding: [0xc1,0x51,0x80,0xbe]
+
+s_barrier_init -2
+// GFX12: encoding: [0xc2,0x51,0x80,0xbe]
+
+s_barrier_init m0
+// GFX12: encoding: [0x7d,0x51,0x80,0xbe]
+
+s_barrier_join -1
+// GFX12: encoding: [0xc1,0x52,0x80,0xbe]
+
+s_barrier_join -2
+// GFX12: encoding: [0xc2,0x52,0x80,0xbe]
+
+s_barrier_join m0
+// GFX12: encoding: [0x7d,0x52,0x80,0xbe]
+
+s_wakeup_barrier 1
+// GFX12: encoding: [0x81,0x57,0x80,0xbe]
+
+s_wakeup_barrier -1
+// GFX12: encoding: [0xc1,0x57,0x80,0xbe]
+
+s_wakeup_barrier m0
+// GFX12: encoding: [0x7d,0x57,0x80,0xbe]
+
s_mov_b32 s0, s1
// GFX12: encoding: [0x01,0x00,0x80,0xbe]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
index 2e9df11d6f5a4..cf78b87a47618 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s
@@ -33,6 +33,15 @@ s_singleuse_vdst 0xffff
s_singleuse_vdst 0x1234
// GFX12: encoding: [0x34,0x12,0x93,0xbf]
+s_barrier_wait 0xffff
+// GFX12: encoding: [0xff,0xff,0x94,0xbf]
+
+s_barrier_wait 1
+// GFX12: encoding: [0x01,0x00,0x94,0xbf]
+
+s_barrier_leave
+// GFX12: encoding: [0x00,0x00,0x95,0xbf]
+
//===----------------------------------------------------------------------===//
// s_waitcnt
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index 7029f090faa4b..883eccae6da81 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -684,6 +684,59 @@
# GFX12: s_rndne_f16 s5, 0x3456 ; encoding: [0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00]
0xff,0x6e,0x85,0xbe,0x56,0x34,0x00,0x00
+# GFX12: s_barrier_signal -2 ; encoding: [0xc2,0x4e,0x80,0xbe]
+0xc2,0x4e,0x80,0xbe
+
+# GFX12: s_barrier_signal -1 ; encoding: [0xc1,0x4e,0x80,0xbe]
+0xc1,0x4e,0x80,0xbe
+
+# GFX12: s_barrier_signal m0 ; encoding: [0x7d,0x4e,0x80,0xbe]
+0x7d,0x4e,0x80,0xbe
+
+# GFX12: s_barrier_signal_isfirst -2 ; encoding: [0xc2,0x4f,0x80,0xbe]
+0xc2,0x4f,0x80,0xbe
+
+# GFX12: s_barrier_signal_isfirst -1 ; encoding: [0xc1,0x4f,0x80,0xbe]
+0xc1,0x4f,0x80,0xbe
+
+# GFX12: s_barrier_signal_isfirst m0 ; encoding: [0x7d,0x4f,0x80,0xbe]
+0x7d,0x4f,0x80,0xbe
+
+# GFX12: s_barrier_init -1 ; encoding: [0xc1,0x51,0x80,0xbe]
+0xc1,0x51,0x80,0xbe
+
+# GFX12: s_barrier_init -2 ; encoding: [0xc2,0x51,0x80,0xbe]
+0xc2,0x51,0x80,0xbe
+
+# GFX12: s_barrier_init m0 ; encoding: [0x7d,0x51,0x80,0xbe]
+0x7d,0x51,0x80,0xbe
+
+# GFX12: s_barrier_join -1 ; encoding: [0xc1,0x52,0x80,0xbe]
+0xc1,0x52,0x80,0xbe
+
+# GFX12: s_barrier_join -2 ; encoding: [0xc2,0x52,0x80,0xbe]
+0xc2,0x52,0x80,0xbe
+
+# GFX12: s_barrier_join m0 ; encoding: [0x7d,0x52,0x80,0xbe]
+0x7d,0x52,0x80,0xbe
+
+# GFX12: s_wakeup_barrier 1 ; encoding: [0x81,0x57,0x80,0xbe]
+0x81,0x57,0x80,0xbe
+
+# GFX12: s_wakeup_barrier -1 ; encoding: [0xc1,0x57,0x80,0xbe]
+0xc1,0x57,0x80,0xbe
+
+# GFX12: s_wakeup_barrier m0 ; encoding: [0x7d,0x57,0x80,0xbe]
+0x7d,0x57,0x80,0xbe
+# GFX12: s_get_barrier_state s3, -1 ; encoding: [0xc1,0x50,0x83,0xbe]
+0xc1,0x50,0x83,0xbe
+
+# GFX12: s_get_barrier_state s3, -2 ; encoding: [0xc2,0x50,0x83,0xbe]
+0xc2,0x50,0x83,0xbe
+
+# GFX12: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe]
+0x7d,0x50,0x83,0xbe
+
# GFX12: s_abs_i32 exec_hi, s1 ; encoding: [0x01,0x15,0xff,0xbe]
0x01,0x15,0xff,0xbe
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
index fe74ff08a8e5c..9d1936b9e74cd 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
@@ -15,6 +15,15 @@
# GFX12: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf]
0x34,0x12,0x93,0xbf
+# GFX12: s_barrier_wait 0xffff ; encoding: [0xff,0xff,0x94,0xbf]
+0xff,0xff,0x94,0xbf
+
+# GFX12: s_barrier_wait 1 ; encoding: [0x01,0x00,0x94,0xbf]
+0x01,0x00,0x94,0xbf
+
+# GFX12: s_barrier_leave ; encoding: [0x00,0x00,0x95,0xbf]
+0x00,0x00,0x95,0xbf
+
# GFX12: s_branch 0 ; encoding: [0x00,0x00,0xa0,0xbf]
0x00,0x00,0xa0,0xbf
>From 53ed0c7f6a9178e97d510436e7f315122828432c Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Tue, 12 Dec 2023 10:12:09 +0100
Subject: [PATCH 2/4] Add missing blank line
---
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
index 883eccae6da81..7161bff990274 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt
@@ -728,6 +728,7 @@
# GFX12: s_wakeup_barrier m0 ; encoding: [0x7d,0x57,0x80,0xbe]
0x7d,0x57,0x80,0xbe
+
# GFX12: s_get_barrier_state s3, -1 ; encoding: [0xc1,0x50,0x83,0xbe]
0xc1,0x50,0x83,0xbe
>From 26e5f0458afc02170b102084ac059f252f85e4e6 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Tue, 12 Dec 2023 10:42:01 +0100
Subject: [PATCH 3/4] Add missing s_barrier test
---
llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
index 9d1936b9e74cd..13ded15998fb2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt
@@ -24,6 +24,9 @@
# GFX12: s_barrier_leave ; encoding: [0x00,0x00,0x95,0xbf]
0x00,0x00,0x95,0xbf
+# GFX12: s_barrier ; encoding: [0x00,0x00,0xbd,0xbf]
+0x00,0x00,0xbd,0xbf
+
# GFX12: s_branch 0 ; encoding: [0x00,0x00,0xa0,0xbf]
0x00,0x00,0xa0,0xbf
>From 35bb5730db764ecbc8a0431796dbf9ea76ae7b96 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora <mariusz.sikora at amd.com>
Date: Wed, 13 Dec 2023 12:39:00 +0100
Subject: [PATCH 4/4] Update s.barrier.ll test
---
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index f3ee7f3819b5d..4e65b37633949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -102,7 +102,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT4-NEXT: s_barrier_wait -1
; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; VARIANT4-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1
; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2)
; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
@@ -126,7 +126,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT5-NEXT: s_barrier_wait -1
; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; VARIANT5-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1
; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2)
; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo
@@ -152,7 +152,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT6-NEXT: s_barrier_wait -1
; VARIANT6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; VARIANT6-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
+; VARIANT6-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2]
; VARIANT6-NEXT: v_add_co_u32 v1, vcc_lo, v3, v1
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_2)
; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo
More information about the libcxx-commits
mailing list