[clang] 25708b3 - [NVPTX, CUDA] barrier intrinsics and builtins for sm_90
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Thu May 25 11:58:46 PDT 2023
Author: Artem Belevich
Date: 2023-05-25T11:57:57-07:00
New Revision: 25708b3df6e359123d5bce137652af812e168cfc
URL: https://github.com/llvm/llvm-project/commit/25708b3df6e359123d5bce137652af812e168cfc
DIFF: https://github.com/llvm/llvm-project/commit/25708b3df6e359123d5bce137652af812e168cfc.diff
LOG: [NVPTX, CUDA] barrier intrinsics and builtins for sm_90
Differential Revision: https://reviews.llvm.org/D151363
Added:
Modified:
clang/include/clang/Basic/BuiltinsNVPTX.def
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenCUDA/builtins-sm90.cu
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 39a34eb296115..7ffb38d50a6cf 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -582,6 +582,11 @@ TARGET_BUILTIN(__nvvm_bar_warp_sync, "vUi", "n", PTX60)
TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", PTX60)
TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", PTX60)
+TARGET_BUILTIN(__nvvm_barrier_cluster_arrive, "v", "n", AND(SM_90,PTX78))
+TARGET_BUILTIN(__nvvm_barrier_cluster_arrive_relaxed, "v", "n", AND(SM_90,PTX80))
+TARGET_BUILTIN(__nvvm_barrier_cluster_wait, "v", "n", AND(SM_90,PTX78))
+TARGET_BUILTIN(__nvvm_fence_sc_cluster, "v", "n", AND(SM_90,PTX78))
+
// Shuffle
BUILTIN(__nvvm_shfl_down_i32, "iiii", "")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9b151befee9b7..c4f1a436fef1b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18962,6 +18962,18 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
EmitScalarExpr(E->getArg(0)));
+ case NVPTX::BI__nvvm_barrier_cluster_arrive:
+ return Builder.CreateCall(
+ CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
+ case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
+ return Builder.CreateCall(
+ CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
+ case NVPTX::BI__nvvm_barrier_cluster_wait:
+ return Builder.CreateCall(
+ CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
+ case NVPTX::BI__nvvm_fence_sc_cluster:
+ return Builder.CreateCall(
+ CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
default:
return nullptr;
}
diff --git a/clang/test/CodeGenCUDA/builtins-sm90.cu b/clang/test/CodeGenCUDA/builtins-sm90.cu
index 4f67c0e93cfe5..a639c7716adb1 100644
--- a/clang/test/CodeGenCUDA/builtins-sm90.cu
+++ b/clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx78" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx80" "-target-cpu" "sm_90" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
// CHECK: define{{.*}} void @_Z6kernelPlPvj(
__attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
@@ -57,5 +57,14 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
// CHECK: call i32 @llvm.nvvm.getctarank.shared.cluster(ptr addrspace(3) {{.*}})
out[i++] = __nvvm_getctarank_shared_cluster(sptr);
+ // CHECK: call void @llvm.nvvm.barrier.cluster.arrive()
+ __nvvm_barrier_cluster_arrive();
+ // CHECK: call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+ __nvvm_barrier_cluster_arrive_relaxed();
+ // CHECK: call void @llvm.nvvm.barrier.cluster.wait()
+ __nvvm_barrier_cluster_wait();
+ // CHECK: call void @llvm.nvvm.fence.sc.cluster()
+ __nvvm_fence_sc_cluster();
+
// CHECK: ret void
}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5ffcd4bfad376..7e4ad18cf5321 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1358,6 +1358,14 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>,
ClangBuiltin<"__nvvm_barrier_sync_cnt">;
+ // barrier.cluster.[wait, arrive, arrive.relaxed]
+ def int_nvvm_barrier_cluster_arrive :
+ Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+ def int_nvvm_barrier_cluster_arrive_relaxed :
+ Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+ def int_nvvm_barrier_cluster_wait :
+ Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
+
// Membar
def int_nvvm_membar_cta : ClangBuiltin<"__nvvm_membar_cta">,
Intrinsic<[], [], [IntrNoCallback]>;
@@ -1365,6 +1373,8 @@ let TargetPrefix = "nvvm" in {
Intrinsic<[], [], [IntrNoCallback]>;
def int_nvvm_membar_sys : ClangBuiltin<"__nvvm_membar_sys">,
Intrinsic<[], [], [IntrNoCallback]>;
+ def int_nvvm_fence_sc_cluster:
+ Intrinsic<[], [], [IntrNoCallback]>;
// Async Copy
def int_nvvm_cp_async_mbarrier_arrive :
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 989556df66521..16ae89cebfc8b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -132,6 +132,18 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
"barrier.sync \t$id, $cnt;",
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
+class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
+ list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
+ NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>,
+ Requires<Preds>;
+
+def barrier_cluster_arrive:
+ INT_BARRIER_CLUSTER<"arrive", int_nvvm_barrier_cluster_arrive>;
+def barrier_cluster_arrive_relaxed:
+ INT_BARRIER_CLUSTER<"arrive.relaxed",
+ int_nvvm_barrier_cluster_arrive_relaxed, [hasPTX<80>, hasSM<90>]>;
+def barrier_cluster_wait:
+ INT_BARRIER_CLUSTER<"wait", int_nvvm_barrier_cluster_wait>;
class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
bit offset_imm, bit mask_imm, bit threadmask_imm>
@@ -303,6 +315,9 @@ def INT_MEMBAR_CTA : MEMBAR<"membar.cta;", int_nvvm_membar_cta>;
def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+def INT_FENCE_SC_CLUSTER:
+ MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
//-----------------------------------
// Async Copy Functions
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
index 83b4208e2580a..ff2f60c04d370 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| FileCheck --check-prefixes=CHECK %s
-; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78| %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
; CHECK-LABEL: test_isspacep
define i1 @test_isspacep_shared_cluster(ptr %p) {
@@ -120,6 +120,19 @@ define i1 @test_is_explicit_cluster() {
ret i1 %x
}
+; CHECK-LABEL: test_barrier_cluster(
+define void @test_barrier_cluster() {
+; CHECK: barrier.cluster.arrive;
+       call void @llvm.nvvm.barrier.cluster.arrive()
+; CHECK: barrier.cluster.arrive.relaxed;
+       call void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+; CHECK: barrier.cluster.wait;
+       call void @llvm.nvvm.barrier.cluster.wait()
+; CHECK: fence.sc.cluster;
+       call void @llvm.nvvm.fence.sc.cluster()
+       ret void
+}
+
declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p);
declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r);
@@ -137,3 +150,7 @@ declare i32 @llvm.nvvm.read.ptx.sreg.nclusterid.w()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank()
declare i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank()
declare i1 @llvm.nvvm.is_explicit_cluster()
+declare void @llvm.nvvm.barrier.cluster.arrive()
+declare void @llvm.nvvm.barrier.cluster.arrive.relaxed()
+declare void @llvm.nvvm.barrier.cluster.wait()
+declare void @llvm.nvvm.fence.sc.cluster()
More information about the cfe-commits mailing list