[clang] 02c2468 - [Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX cp.async instructions
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Mon May 17 09:47:36 PDT 2021
Author: Stuart Adams
Date: 2021-05-17T09:46:59-07:00
New Revision: 02c2468864bbb37f7b279aff84961815c1500b6c
URL: https://github.com/llvm/llvm-project/commit/02c2468864bbb37f7b279aff84961815c1500b6c
DIFF: https://github.com/llvm/llvm-project/commit/02c2468864bbb37f7b279aff84961815c1500b6c.diff
LOG: [Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX cp.async instructions
Adds NVPTX builtins and intrinsics for the CUDA PTX `cp.async` instructions for
`sm_80` architecture or newer.
PTX ISA description of `cp.async`:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
Authored-by: Stuart Adams <stuart.adams at codeplay.com>
Co-Authored-by: Alexander Johnston <alexander at codeplay.com>
Differential Revision: https://reviews.llvm.org/D100394
Added:
llvm/test/CodeGen/NVPTX/async-copy.ll
llvm/test/CodeGen/NVPTX/mbarrier.ll
Modified:
clang/include/clang/Basic/BuiltinsNVPTX.def
clang/test/CodeGen/builtins-nvptx.c
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index b225ddcfa3fa0..3feea854df819 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -462,6 +462,29 @@ BUILTIN(__nvvm_membar_cta, "v", "")
BUILTIN(__nvvm_membar_gl, "v", "")
BUILTIN(__nvvm_membar_sys, "v", "")
+// mbarrier
+
+TARGET_BUILTIN(__nvvm_mbarrier_init, "vWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_init_shared, "vWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_inval, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_inval_shared, "vWi*3", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_arrive, "WiWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_shared, "WiWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop, "WiWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_shared, "WiWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_test_wait, "bWi*Wi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_test_wait_shared, "bWi*3Wi", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_pending_count, "iWi", "", AND(SM_80,PTX70))
+
// Memcpy, Memset
BUILTIN(__nvvm_memcpy, "vUc*Uc*zi","")
@@ -726,6 +749,21 @@ TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+// Async Copy
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
+
#undef BUILTIN
#undef TARGET_BUILTIN
#pragma pop_macro("AND")
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 913da5d7b73b9..ec0f74291ad47 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1,4 +1,10 @@
// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
@@ -672,3 +678,80 @@ __device__ void nvvm_vote(int pred) {
__nvvm_vote_ballot(pred);
// CHECK: ret void
}
+
+// CHECK-LABEL: nvvm_mbarrier
+__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
+ #if __CUDA_ARCH__ >= 800
+ __nvvm_mbarrier_init(addr, count);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
+ __nvvm_mbarrier_init_shared(sharedAddr, count);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared
+
+ __nvvm_mbarrier_inval(addr);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
+ __nvvm_mbarrier_inval_shared(sharedAddr);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared
+
+ __nvvm_mbarrier_arrive(addr);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
+ __nvvm_mbarrier_arrive_shared(sharedAddr);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
+ __nvvm_mbarrier_arrive_noComplete(addr, count);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
+ __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared
+
+ __nvvm_mbarrier_arrive_drop(addr);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
+ __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
+ __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
+ __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
+ // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared
+
+ __nvvm_mbarrier_test_wait(addr, state);
+ // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
+ __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
+ // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared
+
+ __nvvm_mbarrier_pending_count(state);
+ // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
+ #endif
+ // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_async_copy
+__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
+ #if __CUDA_ARCH__ >= 800
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
+ __nvvm_cp_async_mbarrier_arrive(addr);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
+ __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
+ __nvvm_cp_async_mbarrier_arrive_noinc(addr);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
+ __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);
+
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
+ __nvvm_cp_async_ca_shared_global_4(dst, src);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
+ __nvvm_cp_async_ca_shared_global_8(dst, src);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
+ __nvvm_cp_async_ca_shared_global_16(dst, src);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
+ __nvvm_cp_async_cg_shared_global_16(dst, src);
+
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
+ __nvvm_cp_async_commit_group();
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
+ __nvvm_cp_async_wait_group(0);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
+ __nvvm_cp_async_wait_group(8);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
+ __nvvm_cp_async_wait_group(16);
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
+ __nvvm_cp_async_wait_all();
+ #endif
+ // CHECK: ret void
+}
\ No newline at end of file
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2ab48cfc4bb7d..fa66a4a779e19 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -31,7 +31,11 @@
// * llvm.nvvm.max.ull --> ibid.
// * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32
-def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
+def llvm_global_i8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 1>; // (global)i8*
+def llvm_shared_i8ptr_ty : LLVMQualPointerType<llvm_i8_ty, 3>; // (shared)i8*
+def llvm_i64ptr_ty : LLVMPointerType<llvm_i64_ty>; // i64*
+def llvm_any_i64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
+def llvm_shared_i64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
//
// MISC
@@ -1052,6 +1056,110 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
Intrinsic<[], [], []>;
+// Async Copy
+def int_nvvm_cp_async_mbarrier_arrive :
+ GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive">,
+ Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_shared :
+ GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_shared">,
+ Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc :
+ GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc">,
+ Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
+ GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">,
+ Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+
+def int_nvvm_cp_async_ca_shared_global_4 :
+ GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+ [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+ "llvm.nvvm.cp.async.ca.shared.global.4">;
+def int_nvvm_cp_async_ca_shared_global_8 :
+ GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+ [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+ "llvm.nvvm.cp.async.ca.shared.global.8">;
+def int_nvvm_cp_async_ca_shared_global_16 :
+ GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+ [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+ "llvm.nvvm.cp.async.ca.shared.global.16">;
+def int_nvvm_cp_async_cg_shared_global_16 :
+ GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+ [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+ "llvm.nvvm.cp.async.cg.shared.global.16">;
+
+def int_nvvm_cp_async_commit_group :
+ GCCBuiltin<"__nvvm_cp_async_commit_group">,
+ Intrinsic<[],[],[]>;
+
+def int_nvvm_cp_async_wait_group :
+ GCCBuiltin<"__nvvm_cp_async_wait_group">,
+ Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
+
+def int_nvvm_cp_async_wait_all :
+ GCCBuiltin<"__nvvm_cp_async_wait_all">,
+ Intrinsic<[],[],[]>;
+
+// mbarrier
+def int_nvvm_mbarrier_init : GCCBuiltin<"__nvvm_mbarrier_init">,
+ Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_init_shared :
+ GCCBuiltin<"__nvvm_mbarrier_init_shared">,
+ Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_inval : GCCBuiltin<"__nvvm_mbarrier_inval">,
+ Intrinsic<[],[llvm_i64ptr_ty],
+ [IntrConvergent, IntrWriteMem, IntrArgMemOnly,
+ WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+def int_nvvm_mbarrier_inval_shared :
+ GCCBuiltin<"__nvvm_mbarrier_inval_shared">,
+ Intrinsic<[],[llvm_shared_i64ptr_ty],
+ [IntrConvergent, IntrWriteMem, IntrArgMemOnly,
+ WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+
+def int_nvvm_mbarrier_arrive : GCCBuiltin<"__nvvm_mbarrier_arrive">,
+ Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_shared :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_shared">,
+ Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_noComplete :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete">,
+ Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_noComplete_shared :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared">,
+ Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_arrive_drop :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_drop">,
+ Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_shared :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_drop_shared">,
+ Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_noComplete :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete">,
+ Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_noComplete_shared :
+ GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared">,
+ Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_test_wait :
+ GCCBuiltin<"__nvvm_mbarrier_test_wait">,
+ Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_test_wait_shared :
+ GCCBuiltin<"__nvvm_mbarrier_test_wait_shared">,
+ Intrinsic<[llvm_i1_ty],[llvm_shared_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_pending_count :
+ GCCBuiltin<"__nvvm_mbarrier_pending_count">,
+ Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrNoMem, IntrConvergent]>;
+
// Generated within nvvm. Use for ldu on sm_20 or later. Second arg is the
// pointer's alignment.
def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
@@ -1141,7 +1249,7 @@ def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
// For getting the handle from a texture or surface variable
def int_nvvm_texsurf_handle
- : Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyi64ptr_ty],
+ : Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_any_i64ptr_ty],
[IntrNoMem], "llvm.nvvm.texsurf.handle">;
def int_nvvm_texsurf_handle_internal
: Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty],
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 381ed4dd68874..e6b94d05ad5c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,11 +144,13 @@ def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
+def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8ccd47c0fcfda..0f6509311f075 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -288,6 +288,211 @@ def INT_MEMBAR_GL : MEMBAR<"membar.gl;", int_nvvm_membar_gl>;
def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
+//-----------------------------------
+// Async Copy Functions
+//-----------------------------------
+
+multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+ !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int32Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+ !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int64Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_MBARRIER_ARRIVE :
+ CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive>;
+defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
+ CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
+ CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
+ CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
+
+multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+ !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+ !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
+
+multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+ !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+ !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
+ CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
+
+def CP_ASYNC_COMMIT_GROUP :
+ NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_GROUP :
+ NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
+ [(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_ALL :
+ NVPTXInst<(outs), (ins), "cp.async.wait_all;",
+ [(int_nvvm_cp_async_wait_all)]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+//-----------------------------------
+// MBarrier Functions
+//-----------------------------------
+
+multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+ [(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+ [(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>;
+defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
+ int_nvvm_mbarrier_init_shared>;
+
+multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+ !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int32Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+ !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+ [(Intrin Int64Regs:$addr)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>;
+defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
+ int_nvvm_mbarrier_inval_shared>;
+
+multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+ !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+ !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>;
+defm MBARRIER_ARRIVE_SHARED :
+ MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
+
+multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int32Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int64Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_NOCOMPLETE :
+ MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>;
+defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
+ MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
+
+multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+ !strconcat("mbarrier.arrive_drop", AddrSpace,
+ ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+ !strconcat("mbarrier.arrive_drop", AddrSpace,
+ ".b64 $state, [$addr];"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP :
+ MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>;
+defm MBARRIER_ARRIVE_DROP_SHARED :
+ MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
+
+multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int32Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int64Regs:$state),
+ (ins Int64Regs:$addr, Int32Regs:$count),
+ !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+ ".b64 $state, [$addr], $count;"),
+ [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
+ MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>;
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
+ MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
+ int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
+
+multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
+ !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+ [(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
+ !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+ [(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_TEST_WAIT :
+ MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>;
+defm MBARRIER_TEST_WAIT_SHARED :
+ MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
+
+class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
+ NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
+ "mbarrier.pending_count.b64 $res, $state;",
+ [(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
+ Requires<[hasPTX70, hasSM80]>;
+
+def MBARRIER_PENDING_COUNT :
+ MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>;
+
//-----------------------------------
// Math Functions
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll
new file mode 100644
index 0000000000000..182779c2e0500
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -0,0 +1,110 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+
+declare void @llvm.nvvm.cp.async.wait.group(i32)
+
+; ALL-LABEL: asyncwaitgroup
+define void @asyncwaitgroup() {
+ ; ALL: cp.async.wait_group 8;
+ tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
+ ; ALL: cp.async.wait_group 0;
+ tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
+ ; ALL: cp.async.wait_group 16;
+ tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.wait.all()
+
+; ALL-LABEL: asyncwaitall
+define void @asyncwaitall() {
+; ALL: cp.async.wait_all
+ tail call void @llvm.nvvm.cp.async.wait.all()
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.commit.group()
+
+; ALL-LABEL: asynccommitgroup
+define void @asynccommitgroup() {
+; ALL: cp.async.commit_group
+ tail call void @llvm.nvvm.cp.async.commit.group()
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: asyncmbarrier
+define void @asyncmbarrier(i64* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+ tail call void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
+ ret void
+}
+
+; CHECK-LABEL: asyncmbarriershared
+define void @asyncmbarriershared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+ tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+ ret void
+}
+
+; CHECK-LABEL: asyncmbarriernoinc
+define void @asyncmbarriernoinc(i64* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+ tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
+ ret void
+}
+
+; CHECK-LABEL: asyncmbarriernoincshared
+define void @asyncmbarriernoincshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+ tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccasharedglobal4i8
+define void @asynccasharedglobal4i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+ tail call void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccasharedglobal8i8
+define void @asynccasharedglobal8i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
+ tail call void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccasharedglobal16i8
+define void @asynccasharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+ tail call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+ ret void
+}
+
+declare void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccgsharedglobal16i8
+define void @asynccgsharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+ tail call void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll
new file mode 100644
index 0000000000000..160c4030d26ca
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64
+
+declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
+declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
+
+; CHECK-LABEL: barrierinit
+define void @barrierinit(i64* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+ tail call void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: barrierinitshared
+define void @barrierinitshared(i64 addrspace(3)* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.init.shared.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.init.shared.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+ tail call void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
+ ret void
+}
+
+declare void @llvm.nvvm.mbarrier.inval(i64* %a)
+declare void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: barrierinval
+define void @barrierinval(i64* %a) {
+; CHECK_PTX32: mbarrier.inval.b64 [%r{{[0-1]+}}];
+; CHECK_PTX64: mbarrier.inval.b64 [%rd{{[0-1]+}}];
+ tail call void @llvm.nvvm.mbarrier.inval(i64* %a)
+ ret void
+}
+
+; CHECK-LABEL: barrierinvalshared
+define void @barrierinvalshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: mbarrier.inval.shared.b64 [%r{{[0-1]+}}];
+; CHECK_PTX64: mbarrier.inval.shared.b64 [%rd{{[0-1]+}}];
+ tail call void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
+ ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
+declare i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: barrierarrive
+define void @barrierarrive(i64* %a) {
+; CHECK_PTX32: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
+ ret void
+}
+
+; CHECK-LABEL: barrierarriveshared
+define void @barrierarriveshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+ ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
+declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+
+; CHECK-LABEL: barrierarrivenoComplete
+define void @barrierarrivenoComplete(i64* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: barrierarrivenoCompleteshared
+define void @barrierarrivenoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+ ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: barrierarrivedrop
+define void @barrierarrivedrop(i64* %a) {
+; CHECK_PTX32: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
+ ret void
+}
+
+; CHECK-LABEL: barrierarrivedropshared
+define void @barrierarrivedropshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
+ ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+
+; CHECK-LABEL: barrierarrivedropnoComplete
+define void @barrierarrivedropnoComplete(i64* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
+ ret void
+}
+
+; CHECK-LABEL: barrierarrivedropnoCompleteshared
+define void @barrierarrivedropnoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+ %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+ ret void
+}
+
+declare i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
+declare i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
+
+; CHECK-LABEL: barriertestwait
+define void @barriertestwait(i64* %a, i64 %b) {
+; CHECK_PTX32: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
+; CHECK_PTX64: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
+ %ret = tail call i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
+ ret void
+}
+
+; CHECK-LABEL: barriertestwaitshared
+define void @barriertestwaitshared(i64 addrspace(3)* %a, i64 %b) {
+; CHECK_PTX32: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
+; CHECK_PTX64: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
+ %ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
+ ret void
+}
+
+declare i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
+
+; CHECK-LABEL: barrierpendingcount
+define i32 @barrierpendingcount(i64* %a, i64 %b) {
+; CHECK_PTX32: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
+; CHECK_PTX64: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
+ %ret = tail call i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
+ ret i32 %ret
+}
More information about the cfe-commits
mailing list