[clang] 02c2468 - [Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX cp.async instructions

Artem Belevich via cfe-commits cfe-commits at lists.llvm.org
Mon May 17 09:47:36 PDT 2021


Author: Stuart Adams
Date: 2021-05-17T09:46:59-07:00
New Revision: 02c2468864bbb37f7b279aff84961815c1500b6c

URL: https://github.com/llvm/llvm-project/commit/02c2468864bbb37f7b279aff84961815c1500b6c
DIFF: https://github.com/llvm/llvm-project/commit/02c2468864bbb37f7b279aff84961815c1500b6c.diff

LOG: [Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX cp.async instructions

Adds NVPTX builtins and intrinsics for the CUDA PTX `cp.async` instructions on
the `sm_80` architecture or newer, gated on PTX ISA 7.0.

PTX ISA description of `cp.async`:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive

Authored-by: Stuart Adams <stuart.adams at codeplay.com>
Co-Authored-by: Alexander Johnston <alexander at codeplay.com>

Differential Revision: https://reviews.llvm.org/D100394
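
An illustrative device-side sketch (not part of the commit) of how the new
builtins can be combined, mirroring the signatures exercised in
clang/test/CodeGen/builtins-nvptx.c; the helper names are hypothetical and the
calls are guarded for sm_80 since the builtins require SM_80 and PTX 7.0:

__device__ void stage_tile_async(
    __attribute__((address_space(3))) void *dst,          // destination in shared memory
    __attribute__((address_space(1))) const void *src) {  // source in global memory
#if __CUDA_ARCH__ >= 800
  // Queue a 16-byte asynchronous global->shared copy, cached at all levels (.ca).
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // Batch the outstanding copies into a group and wait for all committed groups.
  __nvvm_cp_async_commit_group();
  __nvvm_cp_async_wait_group(0);
#endif
}

__device__ void mbarrier_roundtrip(
    __attribute__((address_space(3))) long long *bar,     // mbarrier object in shared memory
    int count) {                                           // expected number of arrivals
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init_shared(bar, count);                 // initialize the barrier
  long long state = __nvvm_mbarrier_arrive_shared(bar);    // arrive; returns the phase state
  while (!__nvvm_mbarrier_test_wait_shared(bar, state))    // poll until the phase completes
    ;
  __nvvm_mbarrier_inval_shared(bar);                       // invalidate before the memory is reused
#endif
}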

Added: 
    llvm/test/CodeGen/NVPTX/async-copy.ll
    llvm/test/CodeGen/NVPTX/mbarrier.ll

Modified: 
    clang/include/clang/Basic/BuiltinsNVPTX.def
    clang/test/CodeGen/builtins-nvptx.c
    llvm/include/llvm/IR/IntrinsicsNVVM.td
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
    llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Removed: 
    


################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index b225ddcfa3fa0..3feea854df819 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -462,6 +462,29 @@ BUILTIN(__nvvm_membar_cta, "v", "")
 BUILTIN(__nvvm_membar_gl, "v", "")
 BUILTIN(__nvvm_membar_sys, "v", "")
 
+// mbarrier
+
+TARGET_BUILTIN(__nvvm_mbarrier_init, "vWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_init_shared, "vWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_inval, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_inval_shared, "vWi*3", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_arrive, "WiWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_shared, "WiWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop, "WiWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_shared, "WiWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete, "WiWi*i", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_arrive_drop_noComplete_shared, "WiWi*3i", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_test_wait, "bWi*Wi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_mbarrier_test_wait_shared, "bWi*3Wi", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_mbarrier_pending_count, "iWi", "", AND(SM_80,PTX70))
+
 // Memcpy, Memset
 
 BUILTIN(__nvvm_memcpy, "vUc*Uc*zi","")
@@ -726,6 +749,21 @@ TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
 TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
 TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
 
+// Async Copy
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+
+TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #pragma pop_macro("AND")

diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 913da5d7b73b9..ec0f74291ad47 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1,4 +1,10 @@
 // REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
+// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
+// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
+// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
 // RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
 // RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
 // RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
@@ -672,3 +678,80 @@ __device__ void nvvm_vote(int pred) {
   __nvvm_vote_ballot(pred);
   // CHECK: ret void
 }
+
+// CHECK-LABEL: nvvm_mbarrier
+__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
+  #if __CUDA_ARCH__ >= 800
+  __nvvm_mbarrier_init(addr, count);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
+  __nvvm_mbarrier_init_shared(sharedAddr, count);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared
+
+  __nvvm_mbarrier_inval(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
+  __nvvm_mbarrier_inval_shared(sharedAddr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared
+
+  __nvvm_mbarrier_arrive(addr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
+  __nvvm_mbarrier_arrive_shared(sharedAddr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
+  __nvvm_mbarrier_arrive_noComplete(addr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
+  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared
+
+  __nvvm_mbarrier_arrive_drop(addr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
+  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
+  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
+  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
+  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared
+
+  __nvvm_mbarrier_test_wait(addr, state);
+  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
+  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
+  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared
+
+  __nvvm_mbarrier_pending_count(state);
+  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
+  #endif
+  // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_async_copy
+__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
+  #if __CUDA_ARCH__ >= 800
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
+  __nvvm_cp_async_mbarrier_arrive(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
+  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
+  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
+  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);
+
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
+  __nvvm_cp_async_ca_shared_global_4(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
+  __nvvm_cp_async_ca_shared_global_8(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
+  __nvvm_cp_async_ca_shared_global_16(dst, src);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
+  __nvvm_cp_async_cg_shared_global_16(dst, src);
+
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
+  __nvvm_cp_async_commit_group();
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
+  __nvvm_cp_async_wait_group(0);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
+  __nvvm_cp_async_wait_group(8);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
+  __nvvm_cp_async_wait_group(16);
+  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
+  __nvvm_cp_async_wait_all();
+  #endif
+  // CHECK: ret void
+}
\ No newline at end of file

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2ab48cfc4bb7d..fa66a4a779e19 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -31,7 +31,11 @@
 //   * llvm.nvvm.max.ull --> ibid.
 //   * llvm.nvvm.h2f     --> llvm.convert.to.fp16.f32
 
-def llvm_anyi64ptr_ty     : LLVMAnyPointerType<llvm_i64_ty>;     // (space)i64*
+def llvm_global_i8ptr_ty  : LLVMQualPointerType<llvm_i8_ty, 1>;  // (global)i8*
+def llvm_shared_i8ptr_ty  : LLVMQualPointerType<llvm_i8_ty, 3>;  // (shared)i8*
+def llvm_i64ptr_ty        : LLVMPointerType<llvm_i64_ty>;        // i64*
+def llvm_any_i64ptr_ty    : LLVMAnyPointerType<llvm_i64_ty>;     // (space)i64*
+def llvm_shared_i64ptr_ty : LLVMQualPointerType<llvm_i64_ty, 3>; // (shared)i64*
 
 //
 // MISC
@@ -1052,6 +1056,110 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_membar_sys : GCCBuiltin<"__nvvm_membar_sys">,
       Intrinsic<[], [], []>;
 
+// Async Copy
+def int_nvvm_cp_async_mbarrier_arrive :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_shared :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_shared">,
+    Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc">,
+    Intrinsic<[],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
+    GCCBuiltin<"__nvvm_cp_async_mbarrier_arrive_noinc_shared">,
+    Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+
+def int_nvvm_cp_async_ca_shared_global_4 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
+    Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+    [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+     WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+    "llvm.nvvm.cp.async.ca.shared.global.4">;
+def int_nvvm_cp_async_ca_shared_global_8 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
+    Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+    [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+     WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+    "llvm.nvvm.cp.async.ca.shared.global.8">;
+def int_nvvm_cp_async_ca_shared_global_16 :
+    GCCBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
+    Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+    [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+     WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+    "llvm.nvvm.cp.async.ca.shared.global.16">;
+def int_nvvm_cp_async_cg_shared_global_16 :
+    GCCBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
+    Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
+    [IntrArgMemOnly, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
+     WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
+    "llvm.nvvm.cp.async.cg.shared.global.16">;
+
+def int_nvvm_cp_async_commit_group :
+    GCCBuiltin<"__nvvm_cp_async_commit_group">,
+    Intrinsic<[],[],[]>;
+
+def int_nvvm_cp_async_wait_group :
+    GCCBuiltin<"__nvvm_cp_async_wait_group">,
+    Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
+
+def int_nvvm_cp_async_wait_all :
+    GCCBuiltin<"__nvvm_cp_async_wait_all">,
+    Intrinsic<[],[],[]>;
+
+// mbarrier
+def int_nvvm_mbarrier_init : GCCBuiltin<"__nvvm_mbarrier_init">,
+    Intrinsic<[],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_init_shared :
+    GCCBuiltin<"__nvvm_mbarrier_init_shared">,
+    Intrinsic<[],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_inval : GCCBuiltin<"__nvvm_mbarrier_inval">,
+    Intrinsic<[],[llvm_i64ptr_ty],
+    [IntrConvergent, IntrWriteMem, IntrArgMemOnly,
+    WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+def int_nvvm_mbarrier_inval_shared :
+    GCCBuiltin<"__nvvm_mbarrier_inval_shared">,
+    Intrinsic<[],[llvm_shared_i64ptr_ty],
+    [IntrConvergent, IntrWriteMem, IntrArgMemOnly,
+    WriteOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+
+def int_nvvm_mbarrier_arrive : GCCBuiltin<"__nvvm_mbarrier_arrive">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_shared :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_noComplete :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_noComplete_shared :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_noComplete_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_arrive_drop :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_shared :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_noComplete :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete">,
+    Intrinsic<[llvm_i64_ty],[llvm_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_arrive_drop_noComplete_shared :
+    GCCBuiltin<"__nvvm_mbarrier_arrive_drop_noComplete_shared">,
+    Intrinsic<[llvm_i64_ty],[llvm_shared_i64ptr_ty, llvm_i32_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_test_wait :
+    GCCBuiltin<"__nvvm_mbarrier_test_wait">,
+    Intrinsic<[llvm_i1_ty],[llvm_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+def int_nvvm_mbarrier_test_wait_shared :
+    GCCBuiltin<"__nvvm_mbarrier_test_wait_shared">,
+    Intrinsic<[llvm_i1_ty],[llvm_shared_i64ptr_ty, llvm_i64_ty],[IntrConvergent]>;
+
+def int_nvvm_mbarrier_pending_count :
+    GCCBuiltin<"__nvvm_mbarrier_pending_count">,
+    Intrinsic<[llvm_i32_ty],[llvm_i64_ty],[IntrNoMem, IntrConvergent]>;
+
 // Generated within nvvm. Use for ldu on sm_20 or later.  Second arg is the
 // pointer's alignment.
 def int_nvvm_ldu_global_i : Intrinsic<[llvm_anyint_ty],
@@ -1141,7 +1249,7 @@ def int_nvvm_move_ptr : Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty],
 
 // For getting the handle from a texture or surface variable
 def int_nvvm_texsurf_handle
-  : Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyi64ptr_ty],
+  : Intrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_any_i64ptr_ty],
               [IntrNoMem], "llvm.nvvm.texsurf.handle">;
 def int_nvvm_texsurf_handle_internal
   : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty],

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 381ed4dd68874..e6b94d05ad5c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -144,11 +144,13 @@ def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
 def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
 def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
 def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
+def hasPTX70 : Predicate<"Subtarget->getPTXVersion() >= 70">;
 
 def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
 def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
 def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
 def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+def hasSM80 : Predicate<"Subtarget->getSmVersion() >= 80">;
 
 // non-sync shfl instructions are not available on sm_70+ in PTX6.4+
 def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" 

diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8ccd47c0fcfda..0f6509311f075 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -288,6 +288,211 @@ def INT_MEMBAR_GL  : MEMBAR<"membar.gl;",  int_nvvm_membar_gl>;
 def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
 
 
+//-----------------------------------
+// Async Copy Functions
+//-----------------------------------
+
+multiclass CP_ASYNC_MBARRIER_ARRIVE<string NoInc, string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int32Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+            !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"),
+            [(Intrin Int64Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_MBARRIER_ARRIVE :
+  CP_ASYNC_MBARRIER_ARRIVE<"", "", int_nvvm_cp_async_mbarrier_arrive>;
+defm CP_ASYNC_MBARRIER_ARRIVE_SHARED :
+  CP_ASYNC_MBARRIER_ARRIVE<"", ".shared", int_nvvm_cp_async_mbarrier_arrive_shared>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
+  CP_ASYNC_MBARRIER_ARRIVE<".noinc", "", int_nvvm_cp_async_mbarrier_arrive_noinc>;
+defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
+  CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
+
+multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+            !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+            !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
+
+defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
+  CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
+
+multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+            !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+            !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+            [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
+  CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
+
+def CP_ASYNC_COMMIT_GROUP :
+  NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_GROUP :
+  NVPTXInst<(outs), (ins i32imm:$n), "cp.async.wait_group $n;",
+  [(int_nvvm_cp_async_wait_group (i32 timm:$n))]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+def CP_ASYNC_WAIT_ALL :
+  NVPTXInst<(outs), (ins), "cp.async.wait_all;",
+  [(int_nvvm_cp_async_wait_all)]>,
+  Requires<[hasPTX70, hasSM80]>;
+
+//-----------------------------------
+// MBarrier Functions
+//-----------------------------------
+
+multiclass MBARRIER_INIT<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr, Int32Regs:$count),
+           !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+    [(Intrin Int32Regs:$addr, Int32Regs:$count)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr, Int32Regs:$count),
+           !strconcat("mbarrier.init", AddrSpace, ".b64 [$addr], $count;"),
+    [(Intrin Int64Regs:$addr, Int32Regs:$count)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INIT : MBARRIER_INIT<"", int_nvvm_mbarrier_init>;
+defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared",
+                                          int_nvvm_mbarrier_init_shared>;
+
+multiclass MBARRIER_INVAL<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr),
+           !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+    [(Intrin Int32Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr),
+           !strconcat("mbarrier.inval", AddrSpace, ".b64 [$addr];"),
+    [(Intrin Int64Regs:$addr)]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_INVAL : MBARRIER_INVAL<"", int_nvvm_mbarrier_inval>;
+defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared",
+                                            int_nvvm_mbarrier_inval_shared>;
+
+multiclass MBARRIER_ARRIVE<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+           !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+    [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+           !strconcat("mbarrier.arrive", AddrSpace, ".b64 $state, [$addr];"),
+    [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE : MBARRIER_ARRIVE<"", int_nvvm_mbarrier_arrive>;
+defm MBARRIER_ARRIVE_SHARED :
+  MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>;
+
+multiclass MBARRIER_ARRIVE_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state),
+           (ins Int32Regs:$addr, Int32Regs:$count),
+           !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+                      ".b64 $state, [$addr], $count;"),
+    [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state),
+           (ins Int64Regs:$addr, Int32Regs:$count),
+           !strconcat("mbarrier.arrive.noComplete", AddrSpace,
+                      ".b64 $state, [$addr], $count;"),
+    [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_NOCOMPLETE :
+  MBARRIER_ARRIVE_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_noComplete>;
+defm MBARRIER_ARRIVE_NOCOMPLETE_SHARED :
+  MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>;
+
+multiclass MBARRIER_ARRIVE_DROP<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state), (ins Int32Regs:$addr),
+           !strconcat("mbarrier.arrive_drop", AddrSpace,
+                      ".b64 $state, [$addr];"),
+           [(set Int64Regs:$state, (Intrin Int32Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state), (ins Int64Regs:$addr),
+           !strconcat("mbarrier.arrive_drop", AddrSpace,
+                      ".b64 $state, [$addr];"),
+           [(set Int64Regs:$state, (Intrin Int64Regs:$addr))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP :
+  MBARRIER_ARRIVE_DROP<"", int_nvvm_mbarrier_arrive_drop>;
+defm MBARRIER_ARRIVE_DROP_SHARED :
+  MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>;
+
+multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int64Regs:$state),
+           (ins Int32Regs:$addr, Int32Regs:$count),
+           !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+                      ".b64 $state, [$addr], $count;"),
+           [(set Int64Regs:$state, (Intrin Int32Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int64Regs:$state),
+           (ins Int64Regs:$addr, Int32Regs:$count),
+           !strconcat("mbarrier.arrive_drop.noComplete", AddrSpace,
+                      ".b64 $state, [$addr], $count;"),
+           [(set Int64Regs:$state, (Intrin Int64Regs:$addr, Int32Regs:$count))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE :
+  MBARRIER_ARRIVE_DROP_NOCOMPLETE<"", int_nvvm_mbarrier_arrive_drop_noComplete>;
+defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED :
+  MBARRIER_ARRIVE_DROP_NOCOMPLETE<".shared",
+                       int_nvvm_mbarrier_arrive_drop_noComplete_shared>;
+
+multiclass MBARRIER_TEST_WAIT<string AddrSpace, Intrinsic Intrin> {
+  def _32 : NVPTXInst<(outs Int1Regs:$res), (ins Int32Regs:$addr, Int64Regs:$state),
+           !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+           [(set Int1Regs:$res, (Intrin Int32Regs:$addr, Int64Regs:$state))]>,
+    Requires<[hasPTX70, hasSM80]>;
+  def _64 : NVPTXInst<(outs Int1Regs:$res), (ins Int64Regs:$addr, Int64Regs:$state),
+           !strconcat("mbarrier.test_wait", AddrSpace, ".b64 $res, [$addr], $state;"),
+           [(set Int1Regs:$res, (Intrin Int64Regs:$addr, Int64Regs:$state))]>,
+    Requires<[hasPTX70, hasSM80]>;
+}
+
+defm MBARRIER_TEST_WAIT :
+  MBARRIER_TEST_WAIT<"", int_nvvm_mbarrier_test_wait>;
+defm MBARRIER_TEST_WAIT_SHARED :
+  MBARRIER_TEST_WAIT<".shared", int_nvvm_mbarrier_test_wait_shared>;
+
+class MBARRIER_PENDING_COUNT<Intrinsic Intrin> :
+           NVPTXInst<(outs Int32Regs:$res), (ins Int64Regs:$state),
+           "mbarrier.pending_count.b64 $res, $state;",
+           [(set Int32Regs:$res, (Intrin Int64Regs:$state))]>,
+    Requires<[hasPTX70, hasSM80]>;
+
+def MBARRIER_PENDING_COUNT :
+  MBARRIER_PENDING_COUNT<int_nvvm_mbarrier_pending_count>;
+
 //-----------------------------------
 // Math Functions
 //-----------------------------------

diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll
new file mode 100644
index 0000000000000..182779c2e0500
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -0,0 +1,110 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s 
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
+
+declare void @llvm.nvvm.cp.async.wait.group(i32)
+
+; ALL-LABEL: asyncwaitgroup
+define void @asyncwaitgroup() {
+  ; ALL: cp.async.wait_group 8;
+  tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
+  ; ALL: cp.async.wait_group 0;
+  tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
+  ; ALL: cp.async.wait_group 16;
+  tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.wait.all()
+
+; ALL-LABEL: asyncwaitall
+define void @asyncwaitall() {
+; ALL: cp.async.wait_all
+  tail call void @llvm.nvvm.cp.async.wait.all()
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.commit.group()
+
+; ALL-LABEL: asynccommitgroup
+define void @asynccommitgroup() {
+; ALL: cp.async.commit_group
+  tail call void @llvm.nvvm.cp.async.commit.group()
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
+declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: asyncmbarrier
+define void @asyncmbarrier(i64* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive(i64* %a)
+  ret void
+}
+
+; CHECK-LABEL: asyncmbarriershared
+define void @asyncmbarriershared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+  ret void
+}
+
+; CHECK-LABEL: asyncmbarriernoinc
+define void @asyncmbarriernoinc(i64* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(i64* %a)
+  ret void
+}
+
+; CHECK-LABEL: asyncmbarriernoincshared
+define void @asyncmbarriernoincshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
+  tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccasharedglobal4i8
+define void @asynccasharedglobal4i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.4(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccasharedglobal8i8
+define void @asynccasharedglobal8i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.8(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccasharedglobal16i8
+define void @asynccasharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+  tail call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}
+
+declare void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+
+; CHECK-LABEL: asynccgsharedglobal16i8
+define void @asynccgsharedglobal16i8(i8 addrspace(3)* %a, i8 addrspace(1)* %b) {
+; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+  tail call void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %a, i8 addrspace(1)* %b)
+  ret void
+}

diff --git a/llvm/test/CodeGen/NVPTX/mbarrier.ll b/llvm/test/CodeGen/NVPTX/mbarrier.ll
new file mode 100644
index 0000000000000..160c4030d26ca
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/mbarrier.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefix=CHECK_PTX64
+
+declare void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
+declare void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
+
+; CHECK-LABEL: barrierinit
+define void @barrierinit(i64* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.init.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.init.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  tail call void @llvm.nvvm.mbarrier.init(i64* %a, i32 %b)
+  ret void
+}
+
+; CHECK-LABEL: barrierinitshared
+define void @barrierinitshared(i64 addrspace(3)* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.init.shared.b64 [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.init.shared.b64 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  tail call void @llvm.nvvm.mbarrier.init.shared(i64 addrspace(3)* %a, i32 %b)
+  ret void
+}
+
+declare void @llvm.nvvm.mbarrier.inval(i64* %a)
+declare void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: barrierinval
+define void @barrierinval(i64* %a) {
+; CHECK_PTX32: mbarrier.inval.b64 [%r{{[0-1]+}}];
+; CHECK_PTX64: mbarrier.inval.b64 [%rd{{[0-1]+}}];
+  tail call void @llvm.nvvm.mbarrier.inval(i64* %a)
+  ret void
+}
+
+; CHECK-LABEL: barrierinvalshared
+define void @barrierinvalshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: mbarrier.inval.shared.b64 [%r{{[0-1]+}}];
+; CHECK_PTX64: mbarrier.inval.shared.b64 [%rd{{[0-1]+}}];
+  tail call void @llvm.nvvm.mbarrier.inval.shared(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
+declare i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: barrierarrive
+define void @barrierarrive(i64* %a) {
+; CHECK_PTX32: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive(i64* %a)
+  ret void
+}
+
+; CHECK-LABEL: barrierarriveshared
+define void @barrierarriveshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.shared(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
+declare i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+
+; CHECK-LABEL: barrierarrivenoComplete
+define void @barrierarrivenoComplete(i64* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete(i64* %a, i32 %b)
+  ret void
+}
+
+; CHECK-LABEL: barrierarrivenoCompleteshared
+define void @barrierarrivenoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
+
+; CHECK-LABEL: barrierarrivedrop
+define void @barrierarrivedrop(i64* %a) {
+; CHECK_PTX32: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive_drop.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop(i64* %a)
+  ret void
+}
+
+; CHECK-LABEL: barrierarrivedropshared
+define void @barrierarrivedropshared(i64 addrspace(3)* %a) {
+; CHECK_PTX32: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}];
+; CHECK_PTX64: mbarrier.arrive_drop.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}];
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.shared(i64 addrspace(3)* %a)
+  ret void
+}
+
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
+declare i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+
+; CHECK-LABEL: barrierarrivedropnoComplete
+define void @barrierarrivedropnoComplete(i64* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive_drop.noComplete.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete(i64* %a, i32 %b)
+  ret void
+}
+
+; CHECK-LABEL: barrierarrivedropnoCompleteshared
+define void @barrierarrivedropnoCompleteshared(i64 addrspace(3)* %a, i32 %b) {
+; CHECK_PTX32: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK_PTX64: mbarrier.arrive_drop.noComplete.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}], %r{{[0-9]+}};
+  %ret = tail call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared(i64 addrspace(3)* %a, i32 %b)
+  ret void
+}
+
+declare i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
+declare i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
+
+; CHECK-LABEL: barriertestwait
+define void @barriertestwait(i64* %a, i64 %b) {
+; CHECK_PTX32: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
+; CHECK_PTX64: mbarrier.test_wait.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
+  %ret = tail call i1 @llvm.nvvm.mbarrier.test.wait(i64* %a, i64 %b)
+  ret void
+}
+
+; CHECK-LABEL: barriertestwaitshared
+define void @barriertestwaitshared(i64 addrspace(3)* %a, i64 %b) {
+; CHECK_PTX32: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%r{{[0-9]+}}], %rd{{[0-9]+}};
+; CHECK_PTX64: mbarrier.test_wait.shared.b64 %p{{[0-9]+}}, [%rd{{[0-9]+}}], %rd{{[0-9]+}};
+  %ret = tail call i1 @llvm.nvvm.mbarrier.test.wait.shared(i64 addrspace(3)* %a, i64 %b)
+  ret void
+}
+
+declare i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
+
+; CHECK-LABEL: barrierpendingcount
+define i32 @barrierpendingcount(i64* %a, i64 %b) {
+; CHECK_PTX32: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
+; CHECK_PTX64: mbarrier.pending_count.b64 %r{{[0-9]+}}, %rd{{[0-9]+}};
+  %ret = tail call i32 @llvm.nvvm.mbarrier.pending.count(i64 %b)
+  ret i32 %ret
+}


        

