[clang] [llvm] [AMDGPU] modify named barrier builtins and intrinsics (PR #114550)
Gang Chen via cfe-commits
cfe-commits at lists.llvm.org
Fri Nov 1 08:34:25 PDT 2024
https://github.com/cmc-rep created https://github.com/llvm/llvm-project/pull/114550
None
>From 1aca81c970e31ba8c87d886447ce5187a796ca0c Mon Sep 17 00:00:00 2001
From: gangc <gangc at amd.com>
Date: Wed, 30 Oct 2024 15:39:42 -0700
Subject: [PATCH] [AMDGPU] modify named barrier builtins and intrinsics
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 12 +-
.../builtins-amdgcn-gfx12-param-err.cl | 7 +
.../CodeGenOpenCL/builtins-amdgcn-gfx12.cl | 138 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 41 +-
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 195 ++-
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +-
.../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 124 ++
.../Target/AMDGPU/AMDGPUMachineFunction.cpp | 7 +
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp | 23 +-
llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h | 5 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 23 +-
llvm/lib/Target/AMDGPU/SIDefines.h | 6 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 107 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 10 +-
.../Assembler/target-type-param-errors.ll | 5 +
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 1373 -----------------
.../test/CodeGen/AMDGPU/s-barrier-lowering.ll | 66 +
llvm/test/CodeGen/AMDGPU/s-barrier.ll | 299 ++++
21 files changed, 848 insertions(+), 1607 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/s-barrier.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 29001e32085151..8f44afa4059386 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -439,15 +439,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_sleep_var, "vUi", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_permlane16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_permlanex16_var, "UiUiUiUiIbIb", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vv*i", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vv*i", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vv*", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vv*", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "vIs", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_get_named_barrier_state, "Uiv*", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_prefetch_data, "vvC*Ui", "nc", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_buffer_prefetch_data, "vQbIiUi", "nc", "gfx12-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
index 5d86a9b369429f..1a5043328895ac 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl
@@ -23,6 +23,13 @@ kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global
*out = *in;
}
+kernel void builtins_amdgcn_s_barrier_leave_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_leave(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_leave' must be a constant integer}}
+ *out = *in;
+}
+
void test_s_buffer_prefetch_data(__amdgpu_buffer_rsrc_t rsrc, unsigned int off)
{
__builtin_amdgcn_s_buffer_prefetch_data(rsrc, off, 31); // expected-error {{'__builtin_amdgcn_s_buffer_prefetch_data' must be a constant integer}}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
index 9bfedac0032965..b1866a8e492c84 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -87,16 +87,21 @@ void test_s_barrier_signal()
// CHECK-LABEL: @test_s_barrier_signal_var(
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
-void test_s_barrier_signal_var(int a)
+void test_s_barrier_signal_var(void *bar, int a)
{
- __builtin_amdgcn_s_barrier_signal_var(a);
+ __builtin_amdgcn_s_barrier_signal_var(bar, a);
}
// CHECK-LABEL: @test_s_barrier_signal_isfirst(
@@ -134,110 +139,63 @@ void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
__builtin_amdgcn_s_barrier_wait(1);
}
-// CHECK-LABEL: @test_s_barrier_isfirst_var(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr
-// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store i32 [[D:%.*]], ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[D_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
-// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-// CHECK: if.then:
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: br label [[IF_END:%.*]]
-// CHECK: if.else:
-// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[TMP3]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: br label [[IF_END]]
-// CHECK: if.end:
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
-// CHECK-NEXT: ret void
-//
-void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
-{
- if ( __builtin_amdgcn_s_barrier_signal_isfirst_var(d))
- a = b;
- else
- a = c;
-
- __builtin_amdgcn_s_barrier_wait(1);
-
-}
-
// CHECK-LABEL: @test_s_barrier_init(
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
-void test_s_barrier_init(int a)
+void test_s_barrier_init(void *bar, int a)
{
- __builtin_amdgcn_s_barrier_init(1, a);
+ __builtin_amdgcn_s_barrier_init(bar, a);
}
// CHECK-LABEL: @test_s_barrier_join(
// CHECK-NEXT: entry:
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) [[TMP1]])
// CHECK-NEXT: ret void
//
-void test_s_barrier_join()
+void test_s_barrier_join(void *bar)
{
- __builtin_amdgcn_s_barrier_join(1);
+ __builtin_amdgcn_s_barrier_join(bar);
}
// CHECK-LABEL: @test_s_wakeup_barrier(
// CHECK-NEXT: entry:
-// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
+// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) [[TMP1]])
// CHECK-NEXT: ret void
//
-void test_s_wakeup_barrier()
+void test_s_wakeup_barrier(void *bar)
{
- __builtin_amdgcn_s_barrier_join(1);
+ __builtin_amdgcn_s_wakeup_barrier(bar);
}
// CHECK-LABEL: @test_s_barrier_leave(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
-// CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
-// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-// CHECK: if.then:
-// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[TMP1]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: br label [[IF_END:%.*]]
-// CHECK: if.else:
-// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8
-// CHECK-NEXT: store ptr [[TMP2]], ptr [[A_ADDR_ASCAST]], align 8
-// CHECK-NEXT: br label [[IF_END]]
-// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.leave(i16 1)
// CHECK-NEXT: ret void
//
-void test_s_barrier_leave(int* a, int* b, int *c)
+void test_s_barrier_leave()
{
- if (__builtin_amdgcn_s_barrier_leave())
- a = b;
- else
- a = c;
+ __builtin_amdgcn_s_barrier_leave(1);
}
// CHECK-LABEL: @test_s_get_barrier_state(
@@ -261,6 +219,28 @@ unsigned test_s_get_barrier_state(int a)
return State;
}
+// CHECK-LABEL: @test_s_get_named_barrier_state(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[BAR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT: [[BAR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BAR_ADDR]] to ptr
+// CHECK-NEXT: [[STATE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[STATE]] to ptr
+// CHECK-NEXT: store ptr [[BAR:%.*]], ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BAR_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[TMP0]] to ptr addrspace(3)
+// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) [[TMP1]])
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[STATE_ASCAST]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[STATE_ASCAST]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+unsigned test_s_get_named_barrier_state(void *bar)
+{
+ unsigned State = __builtin_amdgcn_s_get_named_barrier_state(bar);
+ return State;
+}
+
// CHECK-LABEL: @test_s_ttracedata(
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @llvm.amdgcn.s.ttracedata(i32 1)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 143b538b361c9c..d6375ab77cfb32 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
def global_ptr_ty : LLVMQualPointerType<1>;
+def local_ptr_ty : LLVMQualPointerType<3>;
// The amdgpu-no-* attributes (ex amdgpu-no-workitem-id-z) typically inferred
// by the backend cause whole-program undefined behavior when violated, such as
@@ -247,48 +248,70 @@ def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+// Vanilla workgroup sync-barrier
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// Lower-level split-barrier intrinsics
+
+// void @llvm.amdgcn.s.barrier.signal(i32 %barrierType)
+// only for non-named barrier
def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
+// void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %barrier, i32 %memberCnt)
+// The %barrier and %memberCnt arguments must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
- Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
+// bool @llvm.amdgcn.s.barrier.signal.isfirst(i32 %barrierType)
+// only for non-named barrier
def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
-def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
- Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
- IntrNoCallback, IntrNoFree]>;
-
+// void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %barrier, i32 %memberCnt)
+// The %barrier and %memberCnt arguments must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ Intrinsic<[], [local_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
- Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
+// void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
- Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ Intrinsic<[], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
+// void @llvm.amdgcn.s.barrier.wait(i16 %barrierType)
def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// void @llvm.amdgcn.s.barrier.leave(i16 %barrierType)
def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
- Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+// uint32_t @llvm.amdgcn.s.get.barrier.state(i32 %barrierId)
+// The %barrierId argument must be uniform, otherwise behavior is undefined.
def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
IntrNoCallback, IntrNoFree]>;
+// uint32_t @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %barrier)
+// The %barrier argument must be uniform, otherwise behavior is undefined.
+def int_amdgcn_s_get_named_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_named_barrier_state">,
+ Intrinsic<[llvm_i32_ty], [local_ptr_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e4b54c7d72b083..8c640ec18e1a49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
+#include "AMDGPUMemoryUtils.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
@@ -1508,7 +1509,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds") {
+ GV->getName() != "llvm.amdgcn.module.lds" &&
+ !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 800bdbe04cf70d..1873251ea358b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2181,15 +2181,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
return selectDSBvhStackIntrinsic(I);
case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ return selectNamedBarrierInit(I, IntrinsicID);
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier:
- case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_get_named_barrier_state:
return selectNamedBarrierInst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return selectSGetBarrierState(I, IntrinsicID);
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
- case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
return selectSBarrierSignalIsfirst(I, IntrinsicID);
- case Intrinsic::amdgcn_s_barrier_leave:
- return selectSBarrierLeave(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -5437,18 +5438,8 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
const DebugLoc &DL = I.getDebugLoc();
Register CCReg = I.getOperand(0).getReg();
- bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
-
- if (HasM0) {
- auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
- .addReg(I.getOperand(2).getReg());
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
- if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
- return false;
- } else {
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
- .addImm(I.getOperand(2).getImm());
- }
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+ .addImm(I.getOperand(2).getImm());
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
@@ -5457,80 +5448,143 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
*MRI);
}
+bool AMDGPUInstructionSelector::selectSGetBarrierState(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineOperand BarOp = I.getOperand(2);
+ std::optional<int64_t> BarValImm =
+ getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+
+ if (!BarValImm) {
+ auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(BarOp.getReg());
+ constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+ }
+ MachineInstrBuilder MIB;
+ unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
+ : AMDGPU::S_GET_BARRIER_STATE_M0;
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+ auto DstReg = I.getOperand(0).getReg();
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+ if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
+ return false;
+ MIB.addDef(DstReg);
+ if (BarValImm) {
+ MIB.addImm(*BarValImm);
+ }
+ I.eraseFromParent();
+ return true;
+}
+
unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
if (HasInlineConst) {
switch (IntrID) {
default:
llvm_unreachable("not a named barrier op");
- case Intrinsic::amdgcn_s_barrier_init:
- return AMDGPU::S_BARRIER_INIT_IMM;
case Intrinsic::amdgcn_s_barrier_join:
return AMDGPU::S_BARRIER_JOIN_IMM;
case Intrinsic::amdgcn_s_wakeup_barrier:
return AMDGPU::S_WAKEUP_BARRIER_IMM;
- case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_IMM;
};
} else {
switch (IntrID) {
default:
llvm_unreachable("not a named barrier op");
- case Intrinsic::amdgcn_s_barrier_init:
- return AMDGPU::S_BARRIER_INIT_M0;
case Intrinsic::amdgcn_s_barrier_join:
return AMDGPU::S_BARRIER_JOIN_M0;
case Intrinsic::amdgcn_s_wakeup_barrier:
return AMDGPU::S_WAKEUP_BARRIER_M0;
- case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_get_named_barrier_state:
return AMDGPU::S_GET_BARRIER_STATE_M0;
};
}
}
+bool AMDGPUInstructionSelector::selectNamedBarrierInit(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineOperand BarOp = I.getOperand(1);
+ MachineOperand CntOp = I.getOperand(2);
+
+ // BarID = (BarOp >> 4) & 0x3F
+ Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
+ .add(BarOp)
+ .addImm(4u)
+ .setOperandDead(3); // Dead scc
+
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
+ .addReg(TmpReg0)
+ .addImm(0x3F)
+ .setOperandDead(3); // Dead scc
+
+ // MO = ((CntOp & 0x3F) << shAmt) | BarID
+ Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
+ .add(CntOp)
+ .addImm(0x3F)
+ .setOperandDead(3); // Dead scc
+
+ Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ constexpr unsigned ShAmt = 16;
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
+ .addReg(TmpReg2)
+ .addImm(ShAmt)
+ .setOperandDead(3); // Dead scc
+
+ Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
+ .addReg(TmpReg1)
+ .addReg(TmpReg3)
+ .setOperandDead(3); // Dead scc;
+
+ auto CopyMIB =
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
+ constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+
+ unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
+ ? AMDGPU::S_BARRIER_INIT_M0
+ : AMDGPU::S_BARRIER_SIGNAL_M0;
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+ I.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectNamedBarrierInst(
MachineInstr &I, Intrinsic::ID IntrID) const {
MachineBasicBlock *MBB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+ MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
? I.getOperand(2)
: I.getOperand(1);
std::optional<int64_t> BarValImm =
getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
- Register M0Val;
- Register TmpReg0;
-
- // For S_BARRIER_INIT, member count will always be read from M0[16:22]
- if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
- Register MemberCount = I.getOperand(2).getReg();
- TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- // TODO: This should be expanded during legalization so that the the S_LSHL
- // and S_OR can be constant-folded
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
- .addImm(16)
- .addReg(MemberCount);
- M0Val = TmpReg0;
- }
-
- // If not inlinable, get reference to barrier depending on the instruction
+
if (!BarValImm) {
- if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
- // If reference to barrier id is not an inlinable constant then it must be
- // referenced with M0[4:0]. Perform an OR with the member count to include
- // it in M0 for S_BARRIER_INIT.
- Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
- .addReg(BarOp.getReg())
- .addReg(TmpReg0);
- M0Val = TmpReg1;
- } else {
- M0Val = BarOp.getReg();
- }
- }
+ // BarID = (BarOp >> 4) & 0x3F
+ Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
+ .addReg(BarOp.getReg())
+ .addImm(4u)
+ .setOperandDead(3); // Dead scc;
+
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
+ .addReg(TmpReg0)
+ .addImm(0x3F)
+ .setOperandDead(3); // Dead scc;
- // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
- if (M0Val) {
- auto CopyMIB =
- BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+ auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(TmpReg1);
constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
}
@@ -5538,29 +5592,24 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
- if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
- MIB.addDef(I.getOperand(0).getReg());
+ if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
+ auto DstReg = I.getOperand(0).getReg();
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+ if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
+ return false;
+ MIB.addDef(DstReg);
+ }
- if (BarValImm)
- MIB.addImm(*BarValImm);
+ if (BarValImm) {
+ auto BarId = ((*BarValImm) >> 4) & 0x3F;
+ MIB.addImm(BarId);
+ }
I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- const DebugLoc &DL = I.getDebugLoc();
- Register CCReg = I.getOperand(0).getReg();
-
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
-
- I.eraseFromParent();
- return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
- *MRI);
-}
-
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce6..42343104812b66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -147,8 +147,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
bool selectStackRestore(MachineInstr &MI) const;
+ bool selectNamedBarrierInit(MachineInstr &I, Intrinsic::ID IID) const;
bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSGetBarrierState(MachineInstr &I, Intrinsic::ID IID) const;
bool selectSBarrierLeave(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index d7126132356d2c..ab6b09b0087144 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
@@ -2976,7 +2977,8 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
- GV->getName() != "llvm.amdgcn.module.lds") {
+ GV->getName() != "llvm.amdgcn.module.lds" &&
+ !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 5791daed00651f..a76d92ee91ee53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -920,6 +920,124 @@ class AMDGPULowerModuleLDS {
return KernelToCreatedDynamicLDS;
}
+ // Returns GV unchanged when no kernel other than KF uses it directly.
+ // Otherwise clones GV as "<GV>.<KF>", moves the uses located in KF and in
+ // non-kernel functions over to the clone, and returns the clone.
+ static GlobalVariable *uniquifyGVPerKernel(Module &M, GlobalVariable *GV,
+ Function *KF) {
+ bool NeedsReplacement = false;
+ for (Use &U : GV->uses()) {
+ auto *I = dyn_cast<Instruction>(U.getUser());
+ if (I && isKernelLDS(I->getFunction()) && I->getFunction() != KF) {
+ NeedsReplacement = true;
+ break;
+ }
+ }
+ if (!NeedsReplacement)
+ return GV;
+ // Create a new GV used only by this kernel and its functions
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getValueType(), GV->isConstant(), GV->getLinkage(),
+ GV->getInitializer(), GV->getName() + "." + KF->getName(), nullptr,
+ GV->getThreadLocalMode(), GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ auto *I = dyn_cast<Instruction>(U.getUser());
+ // Rewrite only this use: replaceUsesOfWith() could also retarget the use
+ // the early-inc iterator already advanced to and derail the walk.
+ if (I && (!isKernelLDS(I->getFunction()) || I->getFunction() == KF))
+ U.set(NewGV);
+ }
+ return NewGV;
+ }
+
+ bool lowerSpecialLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly) {
+ bool Changed = false; // set when a barrier GV had to be cloned for a kernel
+ // Assign absolute addresses to named barriers. 1st round: module-absolute
+ int NumAbsolutes = 0;
+ std::vector<GlobalVariable *> OrderedGVs;
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ GlobalVariable *GV = K.first;
+ if (!isNamedBarrier(*GV))
+ continue;
+ // Give a module-absolute assignment if it is indirectly accessed by
+ // multiple kernels. This is not precise, but we don't want to duplicate
+ // a function when it is called by multiple kernels.
+ if (LDSToKernelsThatNeedToAccessItIndirectly[GV].size() > 1) {
+ OrderedGVs.push_back(GV);
+ } else {
+ // Leave it to the 2nd round, which will give a kernel-relative
+ // assignment if it is only indirectly accessed by one kernel.
+ LDSUsesInfo.direct_access[*K.second.begin()].insert(GV);
+ }
+ LDSToKernelsThatNeedToAccessItIndirectly.erase(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ int BarId = ++NumAbsolutes; // module-absolute IDs start at 1
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ // 4 bits for alignment, 5 bits for the barrier num,
+ // 3 bits for the barrier scope
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, GV, Offset);
+ }
+ OrderedGVs.clear();
+
+ // The 2nd round: give a kernel-relative assignment to each GV that is
+ // either only indirectly accessed by a single kernel or only directly
+ // accessed by multiple kernels.
+ std::vector<Function *> OrderedKernels;
+ for (auto &K : LDSUsesInfo.direct_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ OrderedKernels.push_back(F);
+ }
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ llvm::DenseMap<Function *, uint32_t> Kernel2BarId;
+ for (Function *F : OrderedKernels) {
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[F]) {
+ if (!isNamedBarrier(*GV))
+ continue;
+
+ LDSUsesInfo.direct_access[F].erase(GV);
+ if (GV->isAbsoluteSymbolRef()) {
+ // Already assigned; nothing more to do for this GV.
+ continue;
+ }
+ OrderedGVs.push_back(GV);
+ }
+ OrderedGVs = sortByName(std::move(OrderedGVs));
+ for (GlobalVariable *GV : OrderedGVs) {
+ // GV could also be used directly by other kernels. If so, we need to
+ // create a new GV used only by this kernel and its functions.
+ auto NewGV = uniquifyGVPerKernel(M, GV, F);
+ Changed |= (NewGV != GV);
+ int BarId = (NumAbsolutes + 1); // first ID after the module-absolute ones
+ if (Kernel2BarId.find(F) != Kernel2BarId.end()) {
+ BarId = (Kernel2BarId[F] + 1); // next ID within this kernel
+ }
+ Kernel2BarId[F] = BarId;
+ unsigned BarrierScope = llvm::AMDGPU::Barrier::BARRIER_SCOPE_WORKGROUP;
+ unsigned Offset = 0x802000u | BarrierScope << 9 | BarId << 4;
+ recordLDSAbsoluteAddress(&M, NewGV, Offset);
+ }
+ OrderedGVs.clear();
+ }
+ // Also erase those special LDS variables from indirect_access.
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ if (isNamedBarrier(*GV))
+ K.second.erase(GV);
+ }
+ }
+ return Changed;
+ }
+
bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
@@ -942,6 +1060,12 @@ class AMDGPULowerModuleLDS {
}
}
+ if (LDSUsesInfo.HasSpecialGVs) {
+ // Special LDS variables need special address assignment
+ Changed |= lowerSpecialLDSVariables(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly);
+ }
+
// Partition variables accessed indirectly into the different strategies
DenseSet<GlobalVariable *> ModuleScopeVariables;
DenseSet<GlobalVariable *> TableLookupVariables;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 1d83d0c4c93372..f5812e45ace4b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -8,6 +8,7 @@
#include "AMDGPUMachineFunction.h"
#include "AMDGPU.h"
+#include "AMDGPUMemoryUtils.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -102,6 +103,12 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+ if (TargetExtType *TTy = AMDGPU::isNamedBarrier(GV)) {
+ auto Offset = getLDSAbsoluteAddress(GV);
+ assert(Offset && "named barrier should have an absolute address");
+ Entry.first->second = Offset.value();
+ return Offset.value();
+ }
std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
if (MaybeAbs) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
index c82b8d83704b76..0bc0f03e0942ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp
@@ -32,6 +32,28 @@ Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
GV->getValueType());
}
+TargetExtType *isNamedBarrier(const GlobalVariable &GV) {
+ // Descend through the first member of any (nested) struct; GV is a named
+ // barrier when that walk ends at target("amdgcn.named.barrier").
+ // TODO: Allow arrays and structs, if all members are barriers
+ // in the same scope.
+ // TODO: Disallow other uses of target("amdgcn.named.barrier") including:
+ // - Structs containing barriers in different scope.
+ // - Structs containing a mixture of barriers and other data.
+ // - Globals in other address spaces.
+ // - Allocas.
+ Type *Cur = GV.getValueType();
+ // Peel struct wrappers; an empty struct has no first member to inspect.
+ while (auto *Struct = dyn_cast<StructType>(Cur)) {
+ if (Struct->getNumElements() == 0)
+ return nullptr;
+ Cur = Struct->getElementType(0);
+ }
+ // Only the barrier target extension type qualifies at the end of the walk.
+ auto *Ext = dyn_cast<TargetExtType>(Cur);
+ return Ext && Ext->getName() == "amdgcn.named.barrier" ? Ext : nullptr;
+}
+
bool isDynamicLDS(const GlobalVariable &GV) {
// external zero size addrspace(3) without initializer is dynlds.
const Module *M = GV.getParent();
@@ -294,7 +316,6 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
case Intrinsic::amdgcn_s_barrier_signal:
case Intrinsic::amdgcn_s_barrier_signal_var:
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
- case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
case Intrinsic::amdgcn_s_barrier_init:
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_barrier_wait:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
index e1cd4d03052b3c..058e74452573c0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.h
@@ -26,6 +26,7 @@ class Value;
class Function;
class CallGraph;
class Module;
+class TargetExtType;
namespace AMDGPU {
@@ -34,12 +35,16 @@ using VariableFunctionMap = DenseMap<GlobalVariable *, DenseSet<Function *>>;
Align getAlign(const DataLayout &DL, const GlobalVariable *GV);
+// If GV is a named-barrier return its type. Otherwise return nullptr.
+TargetExtType *isNamedBarrier(const GlobalVariable &GV);
+
bool isDynamicLDS(const GlobalVariable &GV);
bool isLDSVariableToLower(const GlobalVariable &GV);
struct LDSUsesInfoTy {
FunctionVariableMap direct_access;
FunctionVariableMap indirect_access;
+ bool HasSpecialGVs = false;
};
bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 32dfbc98df581a..415c068367074f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3276,19 +3276,17 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty());
constrainOpWithReadfirstlane(B, MI, 1);
return;
- case Intrinsic::amdgcn_s_barrier_signal_var:
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier:
constrainOpWithReadfirstlane(B, MI, 1);
return;
- case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
- constrainOpWithReadfirstlane(B, MI, 2);
- return;
case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_signal_var:
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
return;
- case Intrinsic::amdgcn_s_get_barrier_state: {
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_get_named_barrier_state: {
constrainOpWithReadfirstlane(B, MI, 2);
return;
}
@@ -5134,30 +5132,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_s_sleep_var:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
- case Intrinsic::amdgcn_s_barrier_signal_var:
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
break;
case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_signal_var:
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
- case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
- const unsigned ResultSize = 1;
- OpdsMapping[0] =
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
- OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
- break;
- }
- case Intrinsic::amdgcn_s_barrier_signal_isfirst:
- case Intrinsic::amdgcn_s_barrier_leave: {
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst: {
const unsigned ResultSize = 1;
OpdsMapping[0] =
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
break;
}
- case Intrinsic::amdgcn_s_get_barrier_state: {
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_get_named_barrier_state: {
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 07c80bd2575f0e..8f297726a0df88 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1062,7 +1062,13 @@ enum Register_Flag : uint8_t {
namespace AMDGPU {
namespace Barrier {
+
enum Type { TRAP = -2, WORKGROUP = -1 };
+
+enum {
+ BARRIER_SCOPE_WORKGROUP = 0,
+};
+
} // namespace Barrier
} // namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d66610ae0a160d..e0362b0568cf8c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9386,27 +9386,33 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
Ops, M->getMemOperand());
}
- case Intrinsic::amdgcn_s_get_barrier_state: {
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_get_named_barrier_state: {
SDValue Chain = Op->getOperand(0);
SmallVector<SDValue, 2> Ops;
unsigned Opc;
- bool IsInlinableBarID = false;
- int64_t BarID;
if (isa<ConstantSDNode>(Op->getOperand(2))) {
- BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
- IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
- }
-
- if (IsInlinableBarID) {
+ uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
+ if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
+ BarID = (BarID >> 4) & 0x3F;
Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
Ops.push_back(Chain);
} else {
Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
- SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
- Ops.push_back(M0Val.getValue(0));
+ if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
+ SDValue M0Val;
+ M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
+ DAG.getShiftAmountConstant(4, MVT::i32, DL));
+ M0Val = SDValue(
+ DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
+ Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+ } else
+ Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
}
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
@@ -9946,27 +9952,55 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op->getOperand(2), Chain),
0);
case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_signal_var: {
+ // these two intrinsics have two operands: barrier pointer and member count
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ SDValue BarOp = Op->getOperand(2);
+ SDValue CntOp = Op->getOperand(3);
+ SDValue M0Val;
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
+ ? AMDGPU::S_BARRIER_INIT_M0
+ : AMDGPU::S_BARRIER_SIGNAL_M0;
+ // extract the BarrierID from bits 4-9 of BarOp
+ SDValue BarID;
+ BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+ DAG.getShiftAmountConstant(4, MVT::i32, DL));
+ BarID =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
+ // Member count, masked to 6 bits, should be put into M0[ShAmt:+6]
+ // Barrier ID should be put into M0[5:0]
+ M0Val =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
+ constexpr unsigned ShAmt = 16;
+ M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, M0Val,
+ DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
+
+ M0Val = SDValue(
+ DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
+
+ Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+
+ auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
case Intrinsic::amdgcn_s_barrier_join:
case Intrinsic::amdgcn_s_wakeup_barrier: {
+ // these two intrinsics have one operand: barrier pointer
SDValue Chain = Op->getOperand(0);
SmallVector<SDValue, 2> Ops;
SDValue BarOp = Op->getOperand(2);
unsigned Opc;
- bool IsInlinableBarID = false;
- int64_t BarVal;
if (isa<ConstantSDNode>(BarOp)) {
- BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
- IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
- }
-
- if (IsInlinableBarID) {
+ uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
switch (IntrinsicID) {
default:
return SDValue();
- case Intrinsic::amdgcn_s_barrier_init:
- Opc = AMDGPU::S_BARRIER_INIT_IMM;
- break;
case Intrinsic::amdgcn_s_barrier_join:
Opc = AMDGPU::S_BARRIER_JOIN_IMM;
break;
@@ -9974,16 +10008,15 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
break;
}
-
- SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
+ // extract the BarrierID from bits 4-9 of the immediate
+ unsigned BarID = (BarVal >> 4) & 0x3F;
+ SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
Ops.push_back(K);
+ Ops.push_back(Chain);
} else {
switch (IntrinsicID) {
default:
return SDValue();
- case Intrinsic::amdgcn_s_barrier_init:
- Opc = AMDGPU::S_BARRIER_INIT_M0;
- break;
case Intrinsic::amdgcn_s_barrier_join:
Opc = AMDGPU::S_BARRIER_JOIN_M0;
break;
@@ -9991,25 +10024,15 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
break;
}
- }
-
- if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
+ // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
SDValue M0Val;
- // Member count will be read from M0[16:22]
- M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
- DAG.getShiftAmountConstant(16, MVT::i32, DL));
-
- if (!IsInlinableBarID) {
- // If reference to barrier id is not an inline constant then it must be
- // referenced with M0[4:0]. Perform an OR with the member count to
- // include it in M0.
- M0Val = DAG.getNode(ISD::OR, DL, MVT::i32, Op.getOperand(2), M0Val);
- }
+ M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
+ DAG.getShiftAmountConstant(4, MVT::i32, DL));
+ M0Val =
+ SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
+ DAG.getTargetConstant(0x3F, DL, MVT::i32)),
+ 0);
Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
- } else if (IsInlinableBarID) {
- Ops.push_back(Chain);
- } else {
- Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
}
auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7041b59964645a..1f7fff76d15210 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -946,8 +946,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
Opcode == AMDGPU::S_BARRIER_LEAVE ||
- Opcode == AMDGPU::DS_GWS_INIT ||
- Opcode == AMDGPU::DS_GWS_BARRIER;
+ Opcode == AMDGPU::S_BARRIER_LEAVE_IMM ||
+ Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER;
}
static bool isF16PseudoScalarTrans(unsigned Opcode) {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9da27a7c7ee7d6..c0697c80b23f98 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -458,13 +458,13 @@ let hasSideEffects = 1 in {
let has_sdst = 0 in {
let Uses = [M0] in {
def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
- "", [(int_amdgcn_s_barrier_signal_var M0)]>{
+ "", []>{
let SchedRW = [WriteBarrier];
let isConvergent = 1;
}
def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
- "", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst_var M0))]>{
+ "", []>{
let Defs = [SCC];
let SchedRW = [WriteBarrier];
let isConvergent = 1;
@@ -1604,8 +1604,7 @@ def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm
let isConvergent = 1;
}
-def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
- [(set SCC, (int_amdgcn_s_barrier_leave))]> {
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins)> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
let fixed_imm = 1;
@@ -1613,6 +1612,9 @@ def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
let Defs = [SCC];
}
+def S_BARRIER_LEAVE_IMM : SOPP_Pseudo <"s_barrier_leave",
+ (ins i16imm:$simm16), "$simm16", [(int_amdgcn_s_barrier_leave timm:$simm16)]>;
+
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
diff --git a/llvm/test/Assembler/target-type-param-errors.ll b/llvm/test/Assembler/target-type-param-errors.ll
index 03180811c7549e..b9eb9150e6e167 100644
--- a/llvm/test/Assembler/target-type-param-errors.ll
+++ b/llvm/test/Assembler/target-type-param-errors.ll
@@ -1,6 +1,7 @@
; RUN: split-file %s %t
; RUN: not llvm-as < %t/aarch64-svcount.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-AARCH64-SVCOUNT %s
; RUN: not llvm-as < %t/riscv-vector-tuple.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-RISCV-VECTOR-TUPLE %s
+; RUN: not llvm-as < %t/amdgcn-named-barrier.ll -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-AMDGCN-NAMEDBARRIER %s
; Check target extension type properties are verified in the assembler.
;--- aarch64-svcount.ll
@@ -10,3 +11,7 @@ declare target("aarch64.svcount", i32) @aarch64_svcount()
;--- riscv-vector-tuple.ll
declare target("riscv.vector.tuple", 99) @riscv_vector_tuple()
; CHECK-RISCV-VECTOR-TUPLE: target extension type riscv.vector.tuple should have one type parameter and one integer parameter
+
+;--- amdgcn-named-barrier.ll
+declare target("amdgcn.named.barrier", i32) @amdgcn_named_barrier()
+; CHECK-AMDGCN-NAMEDBARRIER: target extension type amdgcn.named.barrier should have no type parameters and one integer parameter
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
deleted file mode 100644
index 61baca24fbdc0b..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ /dev/null
@@ -1,1373 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s
-
-define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test1_s_barrier_signal:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal -1
-; GFX12-SDAG-NEXT: s_barrier_wait -1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_signal:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal -1
-; GFX12-GISEL-NEXT: s_barrier_wait -1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.signal(i32 -1)
- call void @llvm.amdgcn.s.barrier.wait(i16 -1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test2_s_barrier_signal:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal 1
-; GFX12-SDAG-NEXT: s_barrier_wait 1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test2_s_barrier_signal:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal 1
-; GFX12-GISEL-NEXT: s_barrier_wait 1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.signal(i32 1)
- call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test3_s_barrier_signal:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal 0
-; GFX12-SDAG-NEXT: s_barrier_wait 0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test3_s_barrier_signal:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal 0
-; GFX12-GISEL-NEXT: s_barrier_wait 0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.signal(i32 0)
- call void @llvm.amdgcn.s.barrier.wait(i16 0)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test1_s_barrier_signal_var:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal m0
-; GFX12-SDAG-NEXT: s_barrier_wait 1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_signal_var:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-GISEL-NEXT: s_mov_b32 m0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal m0
-; GFX12-GISEL-NEXT: s_barrier_wait 1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.signal.var(i32 1)
- call void @llvm.amdgcn.s.barrier.wait(i16 1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define void @test2_s_barrier_signal_var(i32 %arg) {
-; GFX12-SDAG-LABEL: test2_s_barrier_signal_var:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal m0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test2_s_barrier_signal_var:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal m0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg)
- ret void
-}
-
-define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
-; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5
-; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
-; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1
-; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
- %0 = load i32, ptr addrspace(1) %a, align 4
- %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
- %1 = load i32, ptr addrspace(1) %b.c, align 4
- %mul1 = mul nsw i32 %1, %0
- store i32 %mul1, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1
-; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5
-; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1
-; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1
-; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
- %0 = load i32, ptr addrspace(1) %a, align 4
- %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
- %1 = load i32, ptr addrspace(1) %b.c, align 4
- %mul1 = mul nsw i32 %1, %0
- store i32 %mul1, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test3_s_barrier_signal_isfirst:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1
-; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5
-; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test3_s_barrier_signal_isfirst:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1
-; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1
-; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
- %0 = load i32, ptr addrspace(1) %a, align 4
- %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
- %1 = load i32, ptr addrspace(1) %b.c, align 4
- %mul1 = mul nsw i32 %1, %0
- store i32 %mul1, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst_var:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0
-; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5
-; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst_var:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_mov_b32 m0, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0
-; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1
-; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 1)
- %0 = load i32, ptr addrspace(1) %a, align 4
- %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
- %1 = load i32, ptr addrspace(1) %b.c, align 4
- %mul1 = mul nsw i32 %1, %0
- store i32 %mul1, ptr addrspace(1) %tmp1
- ret void
-}
-
-define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) {
-; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst_var:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v9, 2, v9
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
-; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v10, off
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0
-; GFX12-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off
-; GFX12-SDAG-NEXT: global_load_b32 v1, v[2:3], off
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v0, off
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst_var:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v6
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9
-; GFX12-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9
-; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0
-; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v9, off
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0
-; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_and_b32 s0, 1, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3
-; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off
-; GFX12-GISEL-NEXT: global_load_b32 v1, v[2:3], off
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v0, off
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 %arg)
- %1 = load i32, ptr addrspace(1) %a, align 4
- %b.c = select i1 %isfirst, ptr addrspace(1) %b, ptr addrspace(1) %c
- %2 = load i32, ptr addrspace(1) %b.c, align 4
- %mul1 = mul nsw i32 %2, %1
- store i32 %mul1, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
-; GFX12-SDAG-LABEL: test1_s_barrier_init:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
-; GFX12-SDAG-NEXT: s_barrier_init -1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_init:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_init -1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.init(i32 -1, i32 %mbrCnt)
- %tmp2 = mul i32 %tmp, %tmp
- %tmp3 = sub i32 %tmp2, %tmp
- store i32 %tmp3, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
-; GFX12-SDAG-LABEL: test2_s_barrier_init:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
-; GFX12-SDAG-NEXT: s_barrier_init 1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test2_s_barrier_init:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_init 1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.init(i32 1, i32 %mbrCnt)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 {
-; GFX12-SDAG-LABEL: test3_s_barrier_init:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
-; GFX12-SDAG-NEXT: s_barrier_init 0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test3_s_barrier_init:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_init 0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.init(i32 0, i32 %mbrCnt)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 {
-; GFX12-SDAG-LABEL: test4_s_barrier_init:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_lshl_b32 s3, s3, 16
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_or_b32 s2, s2, s3
-; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
-; GFX12-SDAG-NEXT: s_barrier_init m0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test4_s_barrier_init:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_lshl_b32 s3, 16, s3
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_or_b32 m0, s2, s3
-; GFX12-GISEL-NEXT: s_barrier_init m0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.init(i32 %bar, i32 %mbrCnt)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
-; GFX12-SDAG-LABEL: test5_s_barrier_init_m0:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX12-SDAG-NEXT: s_barrier_init m0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test5_s_barrier_init_m0:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v1
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: s_lshl_b32 s0, 16, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_or_b32 m0, s1, s0
-; GFX12-GISEL-NEXT: s_barrier_init m0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2)
- ret void
-}
-
-define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
-;
-; GFX12-SDAG-LABEL: test1_s_barrier_join:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_barrier_join -1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_join:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_join -1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.join(i32 -1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
-;
-; GFX12-SDAG-LABEL: test2_s_barrier_join:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_barrier_join 1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test2_s_barrier_join:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_join 1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.join(i32 1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
-;
-; GFX12-SDAG-LABEL: test3_s_barrier_join:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_barrier_join 0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test3_s_barrier_join:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_join 0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.join(i32 0)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 {
-; GFX12-SDAG-LABEL: test4_s_barrier_join_m0:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_barrier_join m0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test4_s_barrier_join_m0:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 m0, s2
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_barrier_join m0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier.join(i32 %bar)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define void @test5_s_barrier_join_m0(i32 %arg) {
-; GFX12-SDAG-LABEL: test5_s_barrier_join_m0:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX12-SDAG-NEXT: s_barrier_join m0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test5_s_barrier_join_m0:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0
-; GFX12-GISEL-NEXT: s_barrier_join m0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(i32 %arg)
- ret void
-}
-
-define void @test6_s_barrier_join_0() {
-; GFX12-LABEL: test6_s_barrier_join_0:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_barrier_join 0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.barrier.join(i32 0)
- ret void
-}
-
-define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test1_s_barrier_leave:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_barrier_leave
-; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5
-; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4
-; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_barrier_leave:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_barrier_leave
-; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0
-; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
-; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %0 = call i1 @llvm.amdgcn.s.barrier.leave()
- %1 = load i32, ptr addrspace(1) %a, align 4
- %b.c = select i1 %0, ptr addrspace(1) %b, ptr addrspace(1) %c
- %2 = load i32, ptr addrspace(1) %b.c, align 4
- %mul1 = mul nsw i32 %2, %1
- store i32 %mul1, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
-;
-; GFX12-SDAG-LABEL: test1_s_wakeup_barrier:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wakeup_barrier -1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test1_s_wakeup_barrier:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wakeup_barrier -1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.wakeup.barrier(i32 -1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
-;
-; GFX12-SDAG-LABEL: test2_s_wakeup_barrier:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wakeup_barrier 1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test2_s_wakeup_barrier:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wakeup_barrier 1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
-;
-; GFX12-SDAG-LABEL: test3_s_wakeup_barrier:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wakeup_barrier 0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test3_s_wakeup_barrier:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wakeup_barrier 0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.wakeup.barrier(i32 0)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 {
-; GFX12-SDAG-LABEL: test4_s_wakeup_barrier_m0:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0
-; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_wakeup_barrier m0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test4_s_wakeup_barrier_m0:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_mov_b32 m0, s2
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wakeup_barrier m0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.wakeup.barrier(i32 %bar)
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-
-define void @test5_s_wakeup_barrier_m0(i32 %arg) {
-; GFX12-SDAG-LABEL: test5_s_wakeup_barrier_m0:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX12-SDAG-NEXT: s_wakeup_barrier m0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test5_s_wakeup_barrier_m0:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0
-; GFX12-GISEL-NEXT: s_wakeup_barrier m0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg)
- ret void
-}
-
-define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
-; GFX12-LABEL: test1_s_get_barrier_state:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_get_barrier_state s2, -1
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
- store i32 %state, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
-; GFX12-LABEL: test2_s_get_barrier_state:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_get_barrier_state s2, 1
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 1)
- store i32 %state, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
-; GFX12-LABEL: test3_s_get_barrier_state:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_get_barrier_state s2, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0)
- store i32 %state, ptr addrspace(1) %tmp1
- ret void
-}
-
-define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 {
-; GFX12-LABEL: test4_s_get_barrier_state_m0:
-; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 m0, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_get_barrier_state s2, m0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %bar)
- store i32 %state, ptr addrspace(1) %tmp1
- ret void
-}
-
-define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
-; GFX12-SDAG-LABEL: test5_s_get_barrier_state_m0:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
-; GFX12-SDAG-NEXT: s_get_barrier_state s0, m0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test5_s_get_barrier_state_m0:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0
-; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
- %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg)
- ret i32 %state
-}
-
-define i32 @test6_s_get_barrier_state_0() {
-; GFX12-LABEL: test6_s_get_barrier_state_0:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_get_barrier_state s0, 0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0)
- ret i32 %state
-}
-
-define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
-; GFX12-SDAG-LABEL: test_barrier_convert:
-; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0
-; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: s_barrier_signal -1
-; GFX12-SDAG-NEXT: s_barrier_wait -1
-; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-SDAG-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: test_barrier_convert:
-; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
-; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
-; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0
-; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1]
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: s_barrier_signal -1
-; GFX12-GISEL-NEXT: s_barrier_wait -1
-; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_endpgm
-entry:
- %tmp = call i32 @llvm.amdgcn.workitem.id.x()
- %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp
- store i32 0, ptr addrspace(1) %tmp1
- call void @llvm.amdgcn.s.barrier()
- %tmp3 = mul i32 %tmp, %tmp
- %tmp4 = sub i32 %tmp3, %tmp
- store i32 %tmp4, ptr addrspace(1) %tmp1
- ret void
-}
-declare void @llvm.amdgcn.s.barrier() #1
-declare void @llvm.amdgcn.s.barrier.wait(i16) #1
-declare void @llvm.amdgcn.s.barrier.signal(i32) #1
-declare void @llvm.amdgcn.s.barrier.signal.var(i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
-declare i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32) #1
-declare void @llvm.amdgcn.s.barrier.init(i32, i32) #1
-declare void @llvm.amdgcn.s.barrier.join(i32) #1
-declare i1 @llvm.amdgcn.s.barrier.leave() #1
-declare void @llvm.amdgcn.s.wakeup.barrier(i32) #1
-declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
-declare i32 @llvm.amdgcn.s.get.barrier.state.var(i32) #1
-declare i32 @llvm.amdgcn.workitem.id.x() #2
-
-attributes #0 = { nounwind }
-attributes #1 = { convergent nounwind }
-attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
new file mode 100644
index 00000000000000..7cf88830824586
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier-lowering.ll
@@ -0,0 +1,66 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+
+@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+
+; CHECK: @bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !0
+; CHECK-NEXT: @bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !1
+; CHECK-NEXT: @bar1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
+; CHECK-NEXT: @bar1.kernel1 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison, !absolute_symbol !2
+
+define void @func1() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define void @func2() {
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1.kernel1, i32 11)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 11)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar1)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+; CHECK-DAG: call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar1, i32 9)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar1)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
+
+; CHECK: !0 = !{i32 8396816, i32 8396817}
+; CHECK-NEXT: !1 = !{i32 8396848, i32 8396849}
+; CHECK-NEXT: !2 = !{i32 8396832, i32 8396833}
diff --git a/llvm/test/CodeGen/AMDGPU/s-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
new file mode 100644
index 00000000000000..490dae9c7d980c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/s-barrier.ll
@@ -0,0 +1,299 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+@bar = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar2 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+@bar3 = internal addrspace(3) global target("amdgcn.named.barrier", 0) poison
+
+define void @func1() {
+; GFX12-SDAG-LABEL: func1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70002
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_join 2
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define void @func2() {
+; GFX12-SDAG-LABEL: func2:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70000
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: func2:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70000
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_join 0
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel1:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0001
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT: s_barrier_init m0
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[2:3], 48
+; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_lshr_b32 s0, s0, 4
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_b32 s0, s0, 63
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s1, 0x90000, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s1
+; GFX12-SDAG-NEXT: s_barrier_init m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0xc0001
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s1
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT: s_barrier_signal -1
+; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_barrier_leave
+; GFX12-SDAG-NEXT: s_wakeup_barrier m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT: s_wakeup_barrier m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
+; GFX12-SDAG-NEXT: s_get_barrier_state s1, m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, s0
+; GFX12-SDAG-NEXT: s_get_barrier_state s0, m0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT: s_barrier_signal -1
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_barrier_wait -1
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-SDAG-NEXT: s_getpc_b64 s[0:1]
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-SDAG-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-SDAG-NEXT: s_get_barrier_state s0, -1
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel1:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT: s_load_b32 s0, s[2:3], 0x2c
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0001
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT: s_barrier_init m0
+; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_lshr_b32 s0, s0, 4
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_b32 s0, s0, 63
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s1, s0, 0x90000
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT: s_barrier_init m0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0xc0001
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_signal -1
+; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
+; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s2, 48
+; GFX12-GISEL-NEXT: s_barrier_join m0
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_barrier_leave
+; GFX12-GISEL-NEXT: s_wakeup_barrier 1
+; GFX12-GISEL-NEXT: s_wakeup_barrier m0
+; GFX12-GISEL-NEXT: s_get_barrier_state s0, 1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s3, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func1@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func1@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT: s_barrier_signal -1
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_barrier_wait -1
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s2, 48
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s3, 0
+; GFX12-GISEL-NEXT: s_getpc_b64 s[0:1]
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, func2@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, func2@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX12-GISEL-NEXT: s_get_barrier_state s0, -1
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.init(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
+ call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+ %isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+ call void @llvm.amdgcn.s.barrier.leave(i16 1)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3) %in)
+ %state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
+ %state2 = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) %in)
+ call void @llvm.amdgcn.s.barrier()
+ call void @func1()
+ call void @func2()
+ %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -1)
+ ret void
+}
+
+define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
+; GFX12-SDAG-LABEL: kernel2:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX12-SDAG-NEXT: s_getpc_b64 s[4:5]
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_sext_i32_i16 s5, s5
+; GFX12-SDAG-NEXT: s_add_co_u32 s4, s4, func2@gotpcrel32@lo+12
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_add_co_ci_u32 s5, s5, func2@gotpcrel32@hi+24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[2:3], 48
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_barrier_signal m0
+; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
+; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
+; GFX12-SDAG-NEXT: s_barrier_join m0
+; GFX12-SDAG-NEXT: s_barrier_wait 1
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: kernel2:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_add_co_u32 s8, s2, 48
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s9, s3, 0
+; GFX12-GISEL-NEXT: s_getpc_b64 s[2:3]
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-GISEL-NEXT: s_add_co_u32 s2, s2, func2@gotpcrel32@lo+12
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s3, s3, func2@gotpcrel32@hi+24
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
+; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_barrier_signal m0
+; GFX12-GISEL-NEXT: s_barrier_join 1
+; GFX12-GISEL-NEXT: s_barrier_wait 1
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX12-GISEL-NEXT: s_endpgm
+ call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)
+ call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar)
+ call void @llvm.amdgcn.s.barrier.wait(i16 1)
+
+ call void @func2()
+ ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier.wait(i16) #1
+declare void @llvm.amdgcn.s.barrier.signal(i32) #1
+declare void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3), i32) #1
+declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32) #1
+declare void @llvm.amdgcn.s.barrier.init(ptr addrspace(3), i32) #1
+declare void @llvm.amdgcn.s.barrier.join(ptr addrspace(3)) #1
+declare void @llvm.amdgcn.s.barrier.leave(i16) #1
+declare void @llvm.amdgcn.s.wakeup.barrier(ptr addrspace(3)) #1
+declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1
+declare i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3)) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent nounwind }
+attributes #2 = { nounwind readnone }
More information about the cfe-commits
mailing list