[clang] 0e43eb2 - Revert "[NVPTX/CUDA] added an optional src_size argument to __nvvm_cp_async*"
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Thu May 18 11:46:00 PDT 2023
Author: Artem Belevich
Date: 2023-05-18T11:45:06-07:00
New Revision: 0e43eb24bd791e8b857e7347ac3646a4454d97e3
URL: https://github.com/llvm/llvm-project/commit/0e43eb24bd791e8b857e7347ac3646a4454d97e3
DIFF: https://github.com/llvm/llvm-project/commit/0e43eb24bd791e8b857e7347ac3646a4454d97e3.diff
LOG: Revert "[NVPTX/CUDA] added an optional src_size argument to __nvvm_cp_async*"
Breaks MLIR which happens to be using the intrinsics.
This reverts commit e7b9c2f00fa04ef8d9b69ee0e36d7775823dbe6b.
Added:
Modified:
clang/include/clang/Basic/BuiltinsNVPTX.def
clang/include/clang/Sema/Sema.h
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/Sema/SemaChecking.cpp
clang/test/CodeGen/builtins-nvptx.c
clang/test/SemaCUDA/builtins.cu
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/async-copy.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 9179fa47eb690..96531def77a78 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -968,10 +968,10 @@ TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_shared, "vWi*3", "", AND(SM_80,PT
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc, "vWi*", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_mbarrier_arrive_noinc_shared, "vWi*3", "", AND(SM_80,PTX70))
-TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1.", "", AND(SM_80,PTX70))
-TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1.", "", AND(SM_80,PTX70))
-TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70))
-TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1.", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_4, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_8, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_ca_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_cp_async_cg_shared_global_16, "vv*3vC*1", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 085662d225294..5296d7000b5cc 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13558,8 +13558,6 @@ class Sema final {
bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall);
- bool CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall);
bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall);
bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 040d57c190b7c..02345651f0abb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18177,15 +18177,6 @@ static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
{Ptr, CGF.EmitScalarExpr(E->getArg(1))});
}
-static Value *MakeCpAsync(unsigned IntrinsicID, CodeGenFunction &CGF,
- const CallExpr *E, int SrcSize) {
- Value *SrcSizeArg = E->getNumArgs() == 3 ? CGF.EmitScalarExpr(E->getArg(2))
- : CGF.Builder.getInt32(SrcSize);
- return CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
- {CGF.EmitScalarExpr(E->getArg(0)),
- CGF.EmitScalarExpr(E->getArg(1)), SrcSizeArg});
-}
-
static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
const CallExpr *E, CodeGenFunction &CGF) {
auto &C = CGF.CGM.getContext();
@@ -18849,18 +18840,6 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_ldu_h2: {
return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
}
- case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
- return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4, *this, E,
- 4);
- case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
- return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8, *this, E,
- 8);
- case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
- return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16, *this, E,
- 16);
- case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
- return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16, *this, E,
- 16);
default:
return nullptr;
}
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 06739beba4f91..7cd74329d8f32 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2028,9 +2028,6 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
case llvm::Triple::wasm32:
case llvm::Triple::wasm64:
return CheckWebAssemblyBuiltinFunctionCall(TI, BuiltinID, TheCall);
- case llvm::Triple::nvptx:
- case llvm::Triple::nvptx64:
- return CheckNVPTXBuiltinFunctionCall(TI, BuiltinID, TheCall);
}
}
@@ -4818,20 +4815,6 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
return false;
}
-bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
- unsigned BuiltinID,
- CallExpr *TheCall) {
- switch (BuiltinID) {
- case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
- case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
- case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
- case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
- return checkArgCountAtMost(*this, TheCall, 3);
- }
-
- return false;
-}
-
/// SemaBuiltinCpuSupports - Handle __builtin_cpu_supports(char *).
/// This checks that the target supports __builtin_cpu_supports and
/// that the string argument is constant and valid.
diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c
index 628349364dcf2..48a2d0241528b 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -830,24 +830,15 @@ __device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __a
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
__nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4({{.*}}, i32 4)
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
__nvvm_cp_async_ca_shared_global_4(dst, src);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8({{.*}}, i32 8)
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
__nvvm_cp_async_ca_shared_global_8(dst, src);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16({{.*}}, i32 16)
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
__nvvm_cp_async_ca_shared_global_16(dst, src);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16({{.*}}, i32 16)
+ // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
__nvvm_cp_async_cg_shared_global_16(dst, src);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4({{.*}}, i32 2)
- __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8({{.*}}, i32 2)
- __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16({{.*}}, i32 2)
- __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
- // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16({{.*}}, i32 2)
- __nvvm_cp_async_cg_shared_global_16(dst, src, 2);
-
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
__nvvm_cp_async_commit_group();
// CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
diff --git a/clang/test/SemaCUDA/builtins.cu b/clang/test/SemaCUDA/builtins.cu
index a3c416a96af79..78a333e511a5d 100644
--- a/clang/test/SemaCUDA/builtins.cu
+++ b/clang/test/SemaCUDA/builtins.cu
@@ -10,7 +10,6 @@
// RUN: -fsyntax-only -verify=host %s
// RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \
// RUN: -aux-triple x86_64-unknown-unknown \
-// RUN: -target-cpu sm_80 -target-feature +ptx70 \
// RUN: -fsyntax-only -verify=dev %s
#if !(defined(__amd64__) && defined(__PTX__))
@@ -29,23 +28,3 @@ __attribute__((device)) void df() {
int y = __builtin_ia32_rdtsc(); // dev-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}}
x = __builtin_abs(1);
}
-
-#if __CUDA_ARCH__ >= 800
-__attribute__((device)) void nvvm_async_copy(__attribute__((address_space(3))) void* dst,
- __attribute__((address_space(1))) const void* src) {
- __nvvm_cp_async_ca_shared_global_4(dst, src);
- __nvvm_cp_async_ca_shared_global_8(dst, src);
- __nvvm_cp_async_ca_shared_global_16(dst, src);
- __nvvm_cp_async_cg_shared_global_16(dst, src);
-
- __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
- __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
- __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
- __nvvm_cp_async_cg_shared_global_16(dst, src, 2);
-
- __nvvm_cp_async_ca_shared_global_4(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
- __nvvm_cp_async_ca_shared_global_8(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
- __nvvm_cp_async_ca_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
- __nvvm_cp_async_cg_shared_global_16(dst, src, 2, 3); // dev-error {{too many arguments to function call}}
-}
-#endif
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2633027a6a62b..d1a2537fe662d 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1381,22 +1381,26 @@ def int_nvvm_cp_async_mbarrier_arrive_noinc_shared :
Intrinsic<[],[llvm_shared_i64ptr_ty],[IntrConvergent, IntrNoCallback]>;
def int_nvvm_cp_async_ca_shared_global_4 :
- Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
+ ClangBuiltin<"__nvvm_cp_async_ca_shared_global_4">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.4">;
def int_nvvm_cp_async_ca_shared_global_8 :
- Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
+ ClangBuiltin<"__nvvm_cp_async_ca_shared_global_8">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.8">;
def int_nvvm_cp_async_ca_shared_global_16 :
- Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
+ ClangBuiltin<"__nvvm_cp_async_ca_shared_global_16">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.ca.shared.global.16">;
def int_nvvm_cp_async_cg_shared_global_16 :
- Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty, llvm_i32_ty],
+ ClangBuiltin<"__nvvm_cp_async_cg_shared_global_16">,
+ Intrinsic<[],[llvm_shared_i8ptr_ty, llvm_global_i8ptr_ty],
[IntrArgMemOnly, IntrNoCallback, NoAlias<ArgIndex<0>>, NoAlias<ArgIndex<1>>,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>],
"llvm.nvvm.cp.async.cg.shared.global.16">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c8b971c402623..1192cc0784084 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -328,36 +328,39 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC :
defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED :
CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>;
-multiclass CP_ASYNC_SHARED_GLOBAL_I<string cc, string cpsize, Intrinsic Intrin> {
- def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size),
- !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
- [(Intrin Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size)]>,
+multiclass CP_ASYNC_CA_SHARED_GLOBAL_I<string cpsize, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+ !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
- def _32i: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size),
- !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
- [(Intrin Int32Regs:$dst, Int32Regs:$src, imm:$src_size)]>,
- Requires<[hasPTX70, hasSM80]>;
- def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size),
- !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
- [(Intrin Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size)]>,
- Requires<[hasPTX70, hasSM80]>;
- def _64i: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size),
- !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"),
- [(Intrin Int64Regs:$dst, Int64Regs:$src, imm:$src_size)]>,
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+ !strconcat("cp.async.ca.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
Requires<[hasPTX70, hasSM80]>;
}
defm CP_ASYNC_CA_SHARED_GLOBAL_4 :
- CP_ASYNC_SHARED_GLOBAL_I<"ca", "4", int_nvvm_cp_async_ca_shared_global_4>;
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"4", int_nvvm_cp_async_ca_shared_global_4>;
defm CP_ASYNC_CA_SHARED_GLOBAL_8 :
- CP_ASYNC_SHARED_GLOBAL_I<"ca", "8", int_nvvm_cp_async_ca_shared_global_8>;
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"8", int_nvvm_cp_async_ca_shared_global_8>;
defm CP_ASYNC_CA_SHARED_GLOBAL_16 :
- CP_ASYNC_SHARED_GLOBAL_I<"ca", "16", int_nvvm_cp_async_ca_shared_global_16>;
+ CP_ASYNC_CA_SHARED_GLOBAL_I<"16", int_nvvm_cp_async_ca_shared_global_16>;
+
+multiclass CP_ASYNC_CG_SHARED_GLOBAL<string cpsize, Intrinsic Intrin> {
+ def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src),
+ !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int32Regs:$dst, Int32Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+ def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src),
+ !strconcat("cp.async.cg.shared.global [$dst], [$src], ", cpsize, ";"),
+ [(Intrin Int64Regs:$dst, Int64Regs:$src)]>,
+ Requires<[hasPTX70, hasSM80]>;
+}
defm CP_ASYNC_CG_SHARED_GLOBAL_16 :
- CP_ASYNC_SHARED_GLOBAL_I<"cg", "16", int_nvvm_cp_async_cg_shared_global_16>;
+ CP_ASYNC_CG_SHARED_GLOBAL<"16", int_nvvm_cp_async_cg_shared_global_16>;
def CP_ASYNC_COMMIT_GROUP :
NVPTXInst<(outs), (ins), "cp.async.commit_group;", [(int_nvvm_cp_async_commit_group)]>,
diff --git a/llvm/test/CodeGen/NVPTX/async-copy.ll b/llvm/test/CodeGen/NVPTX/async-copy.ll
index a1bea301dd467..55c7a6d4874c5 100644
--- a/llvm/test/CodeGen/NVPTX/async-copy.ll
+++ b/llvm/test/CodeGen/NVPTX/async-copy.ll
@@ -1,35 +1,35 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX32 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=CHECK,CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX32 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck -check-prefixes=ALL,CHECK_PTX64 %s
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
; RUN: %if ptxas-11.0 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %}
declare void @llvm.nvvm.cp.async.wait.group(i32)
-; CHECK-LABEL: asyncwaitgroup
+; ALL-LABEL: asyncwaitgroup
define void @asyncwaitgroup() {
- ; CHECK: cp.async.wait_group 8;
+ ; ALL: cp.async.wait_group 8;
tail call void @llvm.nvvm.cp.async.wait.group(i32 8)
- ; CHECK: cp.async.wait_group 0;
+ ; ALL: cp.async.wait_group 0;
tail call void @llvm.nvvm.cp.async.wait.group(i32 0)
- ; CHECK: cp.async.wait_group 16;
+ ; ALL: cp.async.wait_group 16;
tail call void @llvm.nvvm.cp.async.wait.group(i32 16)
ret void
}
declare void @llvm.nvvm.cp.async.wait.all()
-; CHECK-LABEL: asyncwaitall
+; ALL-LABEL: asyncwaitall
define void @asyncwaitall() {
-; CHECK: cp.async.wait_all
+; ALL: cp.async.wait_all
tail call void @llvm.nvvm.cp.async.wait.all()
ret void
}
declare void @llvm.nvvm.cp.async.commit.group()
-; CHECK-LABEL: asynccommitgroup
+; ALL-LABEL: asynccommitgroup
define void @asynccommitgroup() {
-; CHECK: cp.async.commit_group
+; ALL: cp.async.commit_group
tail call void @llvm.nvvm.cp.async.commit.group()
ret void
}
@@ -41,75 +41,72 @@ declare void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %
; CHECK-LABEL: asyncmbarrier
define void @asyncmbarrier(ptr %a) {
-; The distinction between PTX32/PTX64 here is only to capture pointer register type
-; in R to be used in subsequent tests.
-; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%[[R:r]]{{[0-9]+}}];
-; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%[[R:rd]]{{[0-9]+}}];
+; CHECK_PTX32: cp.async.mbarrier.arrive.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive(ptr %a)
ret void
}
; CHECK-LABEL: asyncmbarriershared
define void @asyncmbarriershared(ptr addrspace(3) %a) {
-; CHECK: cp.async.mbarrier.arrive.shared.b64 [%[[R]]{{[0-9]+}}];
+; CHECK_PTX32: cp.async.mbarrier.arrive.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.shared.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %a)
ret void
}
; CHECK-LABEL: asyncmbarriernoinc
define void @asyncmbarriernoinc(ptr %a) {
-; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%[[R]]{{[0-9]+}}];
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %a)
ret void
}
; CHECK-LABEL: asyncmbarriernoincshared
define void @asyncmbarriernoincshared(ptr addrspace(3) %a) {
-; CHECK: cp.async.mbarrier.arrive.noinc.shared.b64 [%[[R]]{{[0-9]+}}];
+; CHECK_PTX32: cp.async.mbarrier.arrive.noinc.shared.b64 [%r{{[0-9]+}}];
+; CHECK_PTX64: cp.async.mbarrier.arrive.noinc.shared.b64 [%rd{{[0-9]+}}];
tail call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %a)
ret void
}
-declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
+declare void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
; CHECK-LABEL: asynccasharedglobal4i8
-define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
-; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, %r{{[0-9]+}};
-; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 4, 1;
- tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
- tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
+define void @asynccasharedglobal4i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 4;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 4;
+ tail call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %a, ptr addrspace(1) %b)
ret void
}
-declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
+declare void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b)
; CHECK-LABEL: asynccasharedglobal8i8
-define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
-; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, %r{{[0-9]+}};
-; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 8, 1;
- tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
- tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
+define void @asynccasharedglobal8i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 8;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 8;
+ tail call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %a, ptr addrspace(1) %b)
ret void
}
-declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
+declare void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
; CHECK-LABEL: asynccasharedglobal16i8
-define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
-; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}};
-; CHECK: cp.async.ca.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1;
- tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
- tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
+define void @asynccasharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
+; CHECK_PTX32: cp.async.ca.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.ca.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+ tail call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
ret void
}
-declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
+declare void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
; CHECK-LABEL: asynccgsharedglobal16i8
-define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c) {
-; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, %r{{[0-9]+}};
-; CHECK: cp.async.cg.shared.global [%[[R]]{{[0-9]+}}], [%[[R]]{{[0-9]+}}], 16, 1;
- tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 %c)
- tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b, i32 1)
+define void @asynccgsharedglobal16i8(ptr addrspace(3) %a, ptr addrspace(1) %b) {
+; CHECK_PTX32: cp.async.cg.shared.global [%r{{[0-9]+}}], [%r{{[0-9]+}}], 16;
+; CHECK_PTX64: cp.async.cg.shared.global [%rd{{[0-9]+}}], [%rd{{[0-9]+}}], 16;
+ tail call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %a, ptr addrspace(1) %b)
ret void
}
More information about the cfe-commits
mailing list