[clang] Reland [NVPTX] Add support for maxclusterrank in launch_bounds (#66496) (PR #67667)
Jakub Chlanda via cfe-commits
cfe-commits at lists.llvm.org
Thu Sep 28 05:49:19 PDT 2023
https://github.com/jchlanda created https://github.com/llvm/llvm-project/pull/67667
This reverts commit 0afbcb20fd908f8bf9073697423da097be7db592.
>From d8c1372998a74dfbfea921bf049575e5e9c0c5a7 Mon Sep 17 00:00:00 2001
From: Jakub Chlanda <jakub at codeplay.com>
Date: Thu, 28 Sep 2023 13:30:27 +0100
Subject: [PATCH] Reland [NVPTX] Add support for maxclusterrank in
launch_bounds (#66496)
This reverts commit 0afbcb20fd908f8bf9073697423da097be7db592.
---
clang/include/clang/Basic/Attr.td | 3 +-
.../clang/Basic/DiagnosticSemaKinds.td | 4 +
clang/include/clang/Sema/Sema.h | 5 +-
clang/lib/Basic/Targets/NVPTX.h | 2 +
clang/lib/CodeGen/Targets/NVPTX.cpp | 12 ++-
clang/lib/Parse/ParseOpenMP.cpp | 3 +-
clang/lib/Sema/SemaDeclAttr.cpp | 43 ++++++++---
.../lib/Sema/SemaTemplateInstantiateDecl.cpp | 10 ++-
clang/test/CodeGenCUDA/launch-bounds.cu | 69 +++++++++++++++++
clang/test/SemaCUDA/launch_bounds.cu | 7 +-
clang/test/SemaCUDA/launch_bounds_sm_90.cu | 57 ++++++++++++++
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 77 ++++++++-----------
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 4 +
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 1 +
llvm/test/CodeGen/NVPTX/maxclusterrank.ll | 26 +++++++
15 files changed, 262 insertions(+), 61 deletions(-)
create mode 100644 clang/test/SemaCUDA/launch_bounds_sm_90.cu
create mode 100644 llvm/test/CodeGen/NVPTX/maxclusterrank.ll
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index dd4d45171db4899..fbc27d166ed9dd1 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1267,7 +1267,8 @@ def CUDAInvalidTarget : InheritableAttr {
def CUDALaunchBounds : InheritableAttr {
let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">];
- let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>];
+ let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>,
+ ExprArgument<"MaxBlocks", 1>];
let LangOpts = [CUDA];
let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
// An AST node is created for this attribute, but is not used by other parts
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index f4eb02fd9570c2f..29362df68365350 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11850,6 +11850,10 @@ def err_sycl_special_type_num_init_method : Error<
"types with 'sycl_special_class' attribute must have one and only one '__init' "
"method defined">;
+def warn_cuda_maxclusterrank_sm_90 : Warning<
+ "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
+ "%1 attribute">, InGroup<IgnoredAttributes>;
+
def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must "
"have a bit size of at least %select{2|1}0">;
def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit "
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 712db0a3dd895d5..e13524b5f3b30cf 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11053,12 +11053,13 @@ class Sema final {
/// Create an CUDALaunchBoundsAttr attribute.
CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI,
Expr *MaxThreads,
- Expr *MinBlocks);
+ Expr *MinBlocks,
+ Expr *MaxBlocks);
/// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular
/// declaration.
void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
- Expr *MaxThreads, Expr *MinBlocks);
+ Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
/// AddModeAttr - Adds a mode attribute to a particular declaration.
void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name,
diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 6fa0b8df97d7894..20d76b702a9426e 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -181,6 +181,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
bool hasBitIntType() const override { return true; }
bool hasBFloat16Type() const override { return true; }
+
+ CudaArch getGPU() const { return GPU; }
};
} // namespace targets
} // namespace clang
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 0d4bbd795648008..64d019a10514d60 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
MaxThreads.getExtValue());
- // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
- // not specified in __launch_bounds__ or if the user specified a 0 value,
+ // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
+ // was not specified in __launch_bounds__ or if the user specified a 0 value,
// we don't have to add a PTX directive.
if (Attr->getMinBlocks()) {
llvm::APSInt MinBlocks(32);
@@ -307,6 +307,14 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(
NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
MinBlocks.getExtValue());
}
+ if (Attr->getMaxBlocks()) {
+ llvm::APSInt MaxBlocks(32);
+ MaxBlocks = Attr->getMaxBlocks()->EvaluateKnownConstInt(getContext());
+ if (MaxBlocks > 0)
+ // Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
+ NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
+ MaxBlocks.getExtValue());
+ }
}
std::unique_ptr<TargetCodeGenInfo>
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 605b97617432ed3..8a8a126bf7244d4 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3739,7 +3739,8 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) {
continue;
if (auto *A = Actions.CreateLaunchBoundsAttr(
PA, PA.getArgAsExpr(0),
- PA.getNumArgs() > 1 ? PA.getArgAsExpr(1) : nullptr))
+ PA.getNumArgs() > 1 ? PA.getArgAsExpr(1) : nullptr,
+ PA.getNumArgs() > 2 ? PA.getArgAsExpr(2) : nullptr))
Attrs.push_back(A);
continue;
default:
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 090a54eedaa07d0..a17378675b22777 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -5608,6 +5608,14 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) {
return false;
}
+// Helper to get CudaArch.
+static CudaArch getCudaArch(const TargetInfo &TI) {
+ if (!TI.getTriple().isNVPTX())
+ llvm_unreachable("getCudaArch is only valid for NVPTX triple");
+ auto &TO = TI.getTargetOpts();
+ return StringToCudaArch(TO.CPU);
+}
+
// Checks whether an argument of launch_bounds attribute is
// acceptable, performs implicit conversion to Rvalue, and returns
// non-nullptr Expr result on success. Otherwise, it returns nullptr
@@ -5651,34 +5659,51 @@ static Expr *makeLaunchBoundsArgExpr(Sema &S, Expr *E,
CUDALaunchBoundsAttr *
Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads,
- Expr *MinBlocks) {
- CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks);
+ Expr *MinBlocks, Expr *MaxBlocks) {
+ CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks);
MaxThreads = makeLaunchBoundsArgExpr(*this, MaxThreads, TmpAttr, 0);
- if (MaxThreads == nullptr)
+ if (!MaxThreads)
return nullptr;
if (MinBlocks) {
MinBlocks = makeLaunchBoundsArgExpr(*this, MinBlocks, TmpAttr, 1);
- if (MinBlocks == nullptr)
+ if (!MinBlocks)
return nullptr;
}
+ if (MaxBlocks) {
+ // '.maxclusterrank' ptx directive requires .target sm_90 or higher.
+ auto SM = getCudaArch(Context.getTargetInfo());
+ if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) {
+ Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90)
+ << CudaArchToString(SM) << CI << MaxBlocks->getSourceRange();
+ // Ignore it by setting MaxBlocks to null;
+ MaxBlocks = nullptr;
+ } else {
+ MaxBlocks = makeLaunchBoundsArgExpr(*this, MaxBlocks, TmpAttr, 2);
+ if (!MaxBlocks)
+ return nullptr;
+ }
+ }
+
return ::new (Context)
- CUDALaunchBoundsAttr(Context, CI, MaxThreads, MinBlocks);
+ CUDALaunchBoundsAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks);
}
void Sema::AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
- Expr *MaxThreads, Expr *MinBlocks) {
- if (auto *Attr = CreateLaunchBoundsAttr(CI, MaxThreads, MinBlocks))
+ Expr *MaxThreads, Expr *MinBlocks,
+ Expr *MaxBlocks) {
+ if (auto *Attr = CreateLaunchBoundsAttr(CI, MaxThreads, MinBlocks, MaxBlocks))
D->addAttr(Attr);
}
static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
- if (!AL.checkAtLeastNumArgs(S, 1) || !AL.checkAtMostNumArgs(S, 2))
+ if (!AL.checkAtLeastNumArgs(S, 1) || !AL.checkAtMostNumArgs(S, 3))
return;
S.AddLaunchBoundsAttr(D, AL, AL.getArgAsExpr(0),
- AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr);
+ AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
+ AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
}
static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index fa839e9b71a3cf9..1aa4036756f3692 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -302,7 +302,15 @@ static void instantiateDependentCUDALaunchBoundsAttr(
MinBlocks = Result.getAs<Expr>();
}
- S.AddLaunchBoundsAttr(New, Attr, MaxThreads, MinBlocks);
+ Expr *MaxBlocks = nullptr;
+ if (Attr.getMaxBlocks()) {
+ Result = S.SubstExpr(Attr.getMaxBlocks(), TemplateArgs);
+ if (Result.isInvalid())
+ return;
+ MaxBlocks = Result.getAs<Expr>();
+ }
+
+ S.AddLaunchBoundsAttr(New, Attr, MaxThreads, MinBlocks, MaxBlocks);
}
static void
diff --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 58bcc410201f35f..31ca9216b413e92 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -1,9 +1,13 @@
// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -target-cpu sm_90 -DUSE_MAX_BLOCKS -fcuda-is-device -emit-llvm -o - | FileCheck -check-prefix=CHECK_MAX_BLOCKS %s
#include "Inputs/cuda.h"
#define MAX_THREADS_PER_BLOCK 256
#define MIN_BLOCKS_PER_MP 2
+#ifdef USE_MAX_BLOCKS
+#define MAX_BLOCKS_PER_MP 4
+#endif
// Test both max threads per block and Min cta per sm.
extern "C" {
@@ -17,6 +21,21 @@ Kernel1()
// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
// CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"minctasm", i32 2}
+#ifdef USE_MAX_BLOCKS
+// Test max threads per block and min/max cta per sm.
+extern "C" {
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP )
+Kernel1_sm_90()
+{
+}
+}
+
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"minctasm", i32 2}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxclusterrank", i32 4}
+#endif // USE_MAX_BLOCKS
+
// Test only max threads per block. Min cta per sm defaults to 0, and
// CodeGen doesn't output a zero value for minctasm.
extern "C" {
@@ -50,6 +69,20 @@ template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
+#ifdef USE_MAX_BLOCKS
+template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
+__global__ void
+__launch_bounds__(max_threads_per_block, min_blocks_per_mp, max_blocks_per_mp)
+Kernel4_sm_90()
+{
+}
+template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
+
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"minctasm", i32 2}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxclusterrank", i32 4}
+#endif //USE_MAX_BLOCKS
+
const int constint = 100;
template <int max_threads_per_block, int min_blocks_per_mp>
__global__ void
@@ -63,6 +96,23 @@ template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
+#ifdef USE_MAX_BLOCKS
+
+template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
+__global__ void
+__launch_bounds__(max_threads_per_block + constint,
+ min_blocks_per_mp + max_threads_per_block,
+ max_blocks_per_mp + max_threads_per_block)
+Kernel5_sm_90()
+{
+}
+template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
+
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"minctasm", i32 258}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxclusterrank", i32 260}
+#endif //USE_MAX_BLOCKS
+
// Make sure we don't emit negative launch bounds values.
__global__ void
__launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
@@ -80,7 +130,26 @@ Kernel7()
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
// CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
+#ifdef USE_MAX_BLOCKS
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP )
+Kernel7_sm_90()
+{
+}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
+// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
+// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
+#endif // USE_MAX_BLOCKS
+
const char constchar = 12;
__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
// CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
+
+#ifdef USE_MAX_BLOCKS
+const char constchar_2 = 14;
+__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"minctasm", i32 12
+// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxclusterrank", i32 14
+#endif // USE_MAX_BLOCKS
diff --git a/clang/test/SemaCUDA/launch_bounds.cu b/clang/test/SemaCUDA/launch_bounds.cu
index 0ca0c0145d8bbb6..045f4756929593c 100644
--- a/clang/test/SemaCUDA/launch_bounds.cu
+++ b/clang/test/SemaCUDA/launch_bounds.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -triple nvptx-unknown-unknown -target-cpu sm_75 -verify %s
#include "Inputs/cuda.h"
@@ -11,8 +11,9 @@ __launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected-
__launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
__launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
+__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}}
-__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-error {{'launch_bounds' attribute takes no more than 2 arguments}}
+__launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}}
__launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}}
int TestNoFunction __launch_bounds__(128, 7); // expected-warning {{'launch_bounds' attribute only applies to Objective-C methods, functions, and function pointers}}
@@ -47,3 +48,5 @@ __launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error
template <int... Args>
__launch_bounds__(1, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}}
+
+__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}}
diff --git a/clang/test/SemaCUDA/launch_bounds_sm_90.cu b/clang/test/SemaCUDA/launch_bounds_sm_90.cu
new file mode 100644
index 000000000000000..d5d902816c64c62
--- /dev/null
+++ b/clang/test/SemaCUDA/launch_bounds_sm_90.cu
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -triple nvptx-unknown-unknown -target-cpu sm_90 -verify %s
+
+#include "Inputs/cuda.h"
+
+__launch_bounds__(128, 7) void Test2Args(void);
+__launch_bounds__(128) void Test1Arg(void);
+
+__launch_bounds__(0xffffffff) void TestMaxArg(void);
+__launch_bounds__(0x100000000) void TestTooBigArg(void); // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
+__launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected-error {{integer literal is too large to be represented in any integer type}}
+__launch_bounds__(1, 1, 0x10000000000000000) void TestWayTooBigArg(void); // expected-error {{integer literal is too large to be represented in any integer type}}
+
+__launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
+__launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
+__launch_bounds__(-128, 1, 7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
+__launch_bounds__(128, -1, 7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
+__launch_bounds__(128, 1, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 2 is negative and will be ignored}}
+// expected-warning at 20 {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
+// expected-warning at 20 {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
+__launch_bounds__(-128, -1, 7) void TestNegArg2(void);
+// expected-warning at 23 {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
+// expected-warning at 23 {{'launch_bounds' attribute parameter 2 is negative and will be ignored}}
+__launch_bounds__(-128, 1, -7) void TestNegArg2(void);
+// expected-warning at 27 {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
+// expected-warning at 27 {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
+// expected-warning at 27 {{'launch_bounds' attribute parameter 2 is negative and will be ignored}}
+__launch_bounds__(-128, -1, -7) void TestNegArg2(void);
+
+
+__launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}}
+__launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}}
+
+int TestNoFunction __launch_bounds__(128, 7, 13); // expected-warning {{'launch_bounds' attribute only applies to Objective-C methods, functions, and function pointers}}
+
+__launch_bounds__(true) void TestBool(void);
+__launch_bounds__(128, 1, 128.0) void TestFP(void); // expected-error {{'launch_bounds' attribute requires parameter 2 to be an integer constant}}
+__launch_bounds__(128, 1, (void*)0) void TestNullptr(void); // expected-error {{'launch_bounds' attribute requires parameter 2 to be an integer constant}}
+
+int nonconstint = 256;
+__launch_bounds__(125, 1, nonconstint) void TestNonConstInt(void); // expected-error {{'launch_bounds' attribute requires parameter 2 to be an integer constant}}
+
+const int constint = 512;
+__launch_bounds__(128, 1, constint) void TestConstInt(void);
+__launch_bounds__(128, 1, constint * 2 + 3) void TestConstIntExpr(void);
+
+template <int a, int b, int c> __launch_bounds__(a, b, c) void TestTemplate2Args(void) {}
+template void TestTemplate2Args<128,7, 13>(void);
+
+template <int a, int b, int c>
+__launch_bounds__(a + b, c + constint, a + b + c + constint) void TestTemplateExpr(void) {}
+template void TestTemplateExpr<128+constint, 3, 7>(void);
+
+template <int... Args>
+__launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}}
+
+template <int... Args>
+__launch_bounds__(1, 22, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}}
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 5d6127419d6318e..c1df063d80f5ffb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -537,59 +537,50 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
raw_ostream &O) const {
// If the NVVM IR has some of reqntid* specified, then output
// the reqntid directive, and set the unspecified ones to 1.
- // If none of reqntid* is specified, don't output reqntid directive.
- unsigned reqntidx, reqntidy, reqntidz;
- bool specified = false;
- if (!getReqNTIDx(F, reqntidx))
- reqntidx = 1;
- else
- specified = true;
- if (!getReqNTIDy(F, reqntidy))
- reqntidy = 1;
- else
- specified = true;
- if (!getReqNTIDz(F, reqntidz))
- reqntidz = 1;
- else
- specified = true;
-
- if (specified)
- O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz
+ // If none of Reqntid* is specified, don't output reqntid directive.
+ unsigned Reqntidx, Reqntidy, Reqntidz;
+ Reqntidx = Reqntidy = Reqntidz = 1;
+ bool ReqSpecified = false;
+ ReqSpecified |= getReqNTIDx(F, Reqntidx);
+ ReqSpecified |= getReqNTIDy(F, Reqntidy);
+ ReqSpecified |= getReqNTIDz(F, Reqntidz);
+
+ if (ReqSpecified)
+ O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz
<< "\n";
// If the NVVM IR has some of maxntid* specified, then output
// the maxntid directive, and set the unspecified ones to 1.
// If none of maxntid* is specified, don't output maxntid directive.
- unsigned maxntidx, maxntidy, maxntidz;
- specified = false;
- if (!getMaxNTIDx(F, maxntidx))
- maxntidx = 1;
- else
- specified = true;
- if (!getMaxNTIDy(F, maxntidy))
- maxntidy = 1;
- else
- specified = true;
- if (!getMaxNTIDz(F, maxntidz))
- maxntidz = 1;
- else
- specified = true;
-
- if (specified)
- O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz
+ unsigned Maxntidx, Maxntidy, Maxntidz;
+ Maxntidx = Maxntidy = Maxntidz = 1;
+ bool MaxSpecified = false;
+ MaxSpecified |= getMaxNTIDx(F, Maxntidx);
+ MaxSpecified |= getMaxNTIDy(F, Maxntidy);
+ MaxSpecified |= getMaxNTIDz(F, Maxntidz);
+
+ if (MaxSpecified)
+ O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz
<< "\n";
- unsigned mincta;
- if (getMinCTASm(F, mincta))
- O << ".minnctapersm " << mincta << "\n";
+ unsigned Mincta = 0;
+ if (getMinCTASm(F, Mincta))
+ O << ".minnctapersm " << Mincta << "\n";
+
+ unsigned Maxnreg = 0;
+ if (getMaxNReg(F, Maxnreg))
+ O << ".maxnreg " << Maxnreg << "\n";
- unsigned maxnreg;
- if (getMaxNReg(F, maxnreg))
- O << ".maxnreg " << maxnreg << "\n";
+ // .maxclusterrank directive requires SM_90 or higher, make sure that we
+ // filter it out for lower SM versions, as it causes a hard ptxas crash.
+ const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
+ const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
+ unsigned Maxclusterrank = 0;
+ if (getMaxClusterRank(F, Maxclusterrank) && STI->getSmVersion() >= 90)
+ O << ".maxclusterrank " << Maxclusterrank << "\n";
}
-std::string
-NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+std::string NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
std::string Name;
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index c3737f9fcca82a6..35302889095f862 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -262,6 +262,10 @@ bool getMaxNTIDz(const Function &F, unsigned &z) {
return findOneNVVMAnnotation(&F, "maxntidz", z);
}
+bool getMaxClusterRank(const Function &F, unsigned &x) {
+ return findOneNVVMAnnotation(&F, "maxclusterrank", x);
+}
+
bool getReqNTIDx(const Function &F, unsigned &x) {
return findOneNVVMAnnotation(&F, "reqntidx", x);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 521f8198911f29e..449973bb53de75c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -55,6 +55,7 @@ bool getReqNTIDx(const Function &, unsigned &);
bool getReqNTIDy(const Function &, unsigned &);
bool getReqNTIDz(const Function &, unsigned &);
+bool getMaxClusterRank(const Function &, unsigned &);
bool getMinCTASm(const Function &, unsigned &);
bool getMaxNReg(const Function &, unsigned &);
bool isKernelFunction(const Function &);
diff --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
new file mode 100644
index 000000000000000..828dd5e4cc400c6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s --check-prefixes=CHECK,CHECK_SM_90
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK_SM_80
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; CHECK: .maxntid 128, 1, 1
+; CHECK: .minnctapersm 2
+; CHECK_SM_90: .maxclusterrank 8
+; CHECK_SM_80-NOT: .maxclusterrank 8
+
+; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
+; sielently ignored.
+define dso_local void @_Z18TestMaxClusterRankv() {
+entry:
+ %a = alloca i32, align 4
+ store volatile i32 1, ptr %a, align 4
+ ret void
+}
+
+!nvvm.annotations = !{!0, !1, !2, !3}
+
+!0 = !{ptr @_Z18TestMaxClusterRankv, !"kernel", i32 1}
+!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
+!2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
+!3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}
More information about the cfe-commits
mailing list