[clang] [llvm] [AMDGPU] GFX12: Add Split Workgroup Barrier (PR #74836)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Dec 8 05:37:07 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-clang
Author: Mariusz Sikora (mariusz-sikora-at-amd)
---
Patch is 112.63 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/74836.diff
27 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+16)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl (+24)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl (+174)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+39)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+152)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+44-1)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+11-1)
- (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+11)
- (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h (+1)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+9)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+111-1)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+5)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+8-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+7)
- (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+112)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp (+10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll (+77)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll (+1366)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_sop1.s (+45)
- (modified) llvm/test/MC/AMDGPU/gfx12_asm_sopp.s (+9)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sop1.txt (+53)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt (+9)
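Before the diff itself, a quick usage sketch of the core split-barrier pattern, mirroring the added builtins-amdgcn-gfx12.cl tests (the kernel and parameter names here are illustrative; -1 is the id the tests pass for the ordinary workgroup barrier):

```c
// Split workgroup barrier: each wave first signals arrival, then waits
// until the barrier is satisfied. The barrier id for the plain signal/wait
// builtins must be a compile-time constant (they are ImmArg intrinsics).
kernel void split_barrier_sketch(global const int *in, global int *out, int i)
{
  out[i] = in[i];                        // work before the rendezvous
  __builtin_amdgcn_s_barrier_signal(-1); // signal arrival at the barrier
  // Independent work could be scheduled here, between signal and wait.
  __builtin_amdgcn_s_barrier_wait(-1);   // block until the barrier is satisfied
}
```

On GFX12 the existing __builtin_amdgcn_s_barrier is itself lowered to this S_BARRIER_SIGNAL_IMM/S_BARRIER_WAIT pair, as the selectSBarrier change in the diff shows.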
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 8b59b3790d7bc6..7465f13d552d6e 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -406,5 +406,21 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32, "iffiIb", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f32, "ifiiIi", "nc", "fp8-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32, "ifiiIi", "nc", "fp8-insts")
+//===----------------------------------------------------------------------===//
+// GFX12+ only builtins.
+//===----------------------------------------------------------------------===//
+
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal, "vIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_var, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_wait, "vIs", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst, "bIi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_signal_isfirst_var, "bi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_init, "vii", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_join, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
+
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
new file mode 100644
index 00000000000000..5e0153c42825e3
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl
@@ -0,0 +1,24 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s
+
+kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}}
+ __builtin_amdgcn_s_barrier_wait(-1);
+ *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}}
+ *out = *in;
+}
+
+kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) {
+
+ __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}}
+ __builtin_amdgcn_s_barrier_wait(-1);
+ *out = *in;
+}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
new file mode 100644
index 00000000000000..b8d281531e218e
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12.cl
@@ -0,0 +1,174 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_s_barrier_signal(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -1)
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal()
+{
+ __builtin_amdgcn_s_barrier_signal(-1);
+ __builtin_amdgcn_s_barrier_wait(-1);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_var(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal.var(i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal_var(int a)
+{
+ __builtin_amdgcn_s_barrier_signal_var(a);
+}
+
+// CHECK-LABEL: @test_s_barrier_signal_isfirst(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 1)
+// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_signal_isfirst(int* a, int* b, int *c)
+{
+ if(__builtin_amdgcn_s_barrier_signal_isfirst(1))
+ a = b;
+ else
+ a = c;
+
+ __builtin_amdgcn_s_barrier_wait(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_isfirst_var(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store i32 [[D:%.*]], ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[D_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst.var(i32 [[TMP0]])
+// CHECK-NEXT: br i1 [[TMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_isfirst_var(int* a, int* b, int *c, int d)
+{
+  if (__builtin_amdgcn_s_barrier_signal_isfirst_var(d))
+ a = b;
+ else
+ a = c;
+
+ __builtin_amdgcn_s_barrier_wait(1);
+
+}
+
+// CHECK-LABEL: @test_s_barrier_init(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.init(i32 1, i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_init(int a)
+{
+ __builtin_amdgcn_s_barrier_init(1, a);
+}
+
+// CHECK-LABEL: @test_s_barrier_join(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @llvm.amdgcn.s.barrier.join(i32 1)
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_join()
+{
+ __builtin_amdgcn_s_barrier_join(1);
+}
+
+// CHECK-LABEL: @test_s_wakeup_barrier(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.amdgcn.s.wakeup.barrier(i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_s_wakeup_barrier()
+{
+  __builtin_amdgcn_s_wakeup_barrier(1);
+}
+
+// CHECK-LABEL: @test_s_barrier_leave(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT: store ptr [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: store ptr [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.amdgcn.s.barrier.leave()
+// CHECK-NEXT: br i1 [[TMP0]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+// CHECK: if.then:
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr addrspace(5) [[B_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END:%.*]]
+// CHECK: if.else:
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspace(5) [[C_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr addrspace(5) [[A_ADDR]], align 8
+// CHECK-NEXT: br label [[IF_END]]
+// CHECK: if.end:
+// CHECK-NEXT: ret void
+//
+void test_s_barrier_leave(int* a, int* b, int *c)
+{
+ if (__builtin_amdgcn_s_barrier_leave())
+ a = b;
+ else
+ a = c;
+}
+
+// CHECK-LABEL: @test_s_get_barrier_state(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[STATE:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.s.get.barrier.state(i32 [[TMP0]])
+// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[STATE]], align 4
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
+unsigned test_s_get_barrier_state(int a)
+{
+ unsigned State = __builtin_amdgcn_s_get_barrier_state(a);
+ return State;
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bc9f99783d98f2..09e88152e65d2a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -227,6 +227,45 @@ def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
+ Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
+ Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
+ Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
+ Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
+def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
+ IntrNoCallback, IntrNoFree]>;
+
def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d24c7da964ce85..75fac09d0b99fa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1791,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
return true;
}
}
+
+  // On GFX12, lower s_barrier into s_barrier_signal_imm and s_barrier_wait.
+ if (STI.hasSplitBarriers()) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ MI.eraseFromParent();
+ return true;
+ }
+
return selectImpl(MI, *CoverageInfo);
}
@@ -2137,6 +2150,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return selectNamedBarrierInst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ return selectSBarrierSignalIsfirst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_leave:
+ return selectSBarrierLeave(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -5239,6 +5262,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
}};
}
+bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register CCReg = I.getOperand(0).getReg();
+
+ bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
+
+ if (HasM0) {
+ auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(I.getOperand(2).getReg());
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
+ if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
+ return false;
+ } else {
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+ .addImm(I.getOperand(2).getImm());
+ }
+
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+ *MRI);
+}
+
+unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
+ if (HasInlineConst) {
+ switch (IntrID) {
+ default:
+ llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_init:
+ return AMDGPU::S_BARRIER_INIT_IMM;
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_IMM;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_IMM;
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return AMDGPU::S_GET_BARRIER_STATE_IMM;
+    }
+ } else {
+ switch (IntrID) {
+ default:
+ llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_init:
+ return AMDGPU::S_BARRIER_INIT_M0;
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_M0;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_M0;
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return AMDGPU::S_GET_BARRIER_STATE_M0;
+    }
+ }
+}
+
+bool AMDGPUInstructionSelector::selectNamedBarrierInst(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+ ? I.getOperand(2)
+ : I.getOperand(1);
+ std::optional<int64_t> BarValImm =
+ getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+ Register M0Val;
+ Register TmpReg0;
+
+ // For S_BARRIER_INIT, member count will always be read from M0[16:22]
+ if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+ Register MemberCount = I.getOperand(2).getReg();
+ TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    // TODO: This should be expanded during legalization so that the S_LSHL
+    // and S_OR can be constant-folded.
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addImm(16)
+ .addReg(MemberCount);
+ M0Val = TmpReg0;
+ }
+
+ // If not inlinable, get reference to barrier depending on the instruction
+ if (!BarValImm) {
+ if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+      // If the reference to the barrier id is not an inlinable constant, it
+      // must be referenced with M0[4:0]. Perform an OR with the member count
+      // to include it in M0 for S_BARRIER_INIT.
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
+ .addReg(BarOp.getReg())
+ .addReg(TmpReg0);
+ M0Val = TmpReg1;
+ } else {
+ M0Val = BarOp.getReg();
+ }
+ }
+
+ // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
+ if (M0Val) {
+ auto CopyMIB =
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+ constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+ }
+
+ MachineInstrBuilder MIB;
+ unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+ if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
+ MIB.addDef(I.getOperand(0).getReg());
+
+ if (BarValImm)
+ MIB.addImm(*BarValImm);
+
+ I.eraseFromParent();
+ return true;
+}
+bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register CCReg = I.getOperand(0).getReg();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+ *MRI);
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/...
[truncated]
``````````
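One detail worth pulling out of selectNamedBarrierInst above: for S_BARRIER_INIT the member count always travels in M0[16:22] (hence the S_LSHL_B32 by 16), and a barrier id that is not an inlinable constant is OR'd into the low bits of M0. A plain-C sketch of that packing (the helper name and the explicit field masks are illustrative; the selector itself emits only the shift and the OR, without masking):

```c
#include <stdint.h>

// Illustrative helper mirroring the M0 value built by the
// S_LSHL_B32(16) + S_OR_B32 sequence in selectNamedBarrierInst:
// member count in M0[16:22], barrier id in M0[4:0].
static inline uint32_t pack_barrier_init_m0(uint32_t barrier_id,
                                            uint32_t member_count) {
  return ((member_count & 0x7f) << 16) | (barrier_id & 0x1f);
}
```

When the id is an inline constant, the OR is skipped and the _IMM opcode carries the id directly, so M0 holds only the shifted member count.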
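Also visible across the tests: each signaling builtin comes in two flavors. The plain forms are ImmArg-constrained, so a non-constant id triggers the 'must be a constant integer' diagnostics exercised by builtins-amdgcn-gfx12-err.cl, while the _var forms take a runtime id, which selection moves into M0 (as the _isfirst_var path above shows). A short OpenCL sketch of choosing between them (function names are illustrative):

```c
// Barrier id known at compile time: the immediate form applies.
void signal_fixed_barrier(void)
{
  __builtin_amdgcn_s_barrier_signal(-1);
}

// Barrier id only known at runtime: the _var form applies.
void signal_runtime_barrier(int id)
{
  __builtin_amdgcn_s_barrier_signal_var(id);
}
```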
https://github.com/llvm/llvm-project/pull/74836