[llvm] 0afbcb2 - Revert "[NVPTX] Add support for maxclusterrank in launch_bounds (#66496)"

Wed Sep 27 02:01:26 PDT 2023

Author: Sam McCall
Date: 2023-09-27T10:59:04+02:00
New Revision: 0afbcb20fd908f8bf9073697423da097be7db592

URL: https://github.com/llvm/llvm-project/commit/0afbcb20fd908f8bf9073697423da097be7db592
DIFF: https://github.com/llvm/llvm-project/commit/0afbcb20fd908f8bf9073697423da097be7db592.diff

LOG: Revert "[NVPTX] Add support for maxclusterrank in launch_bounds (#66496)"

This reverts commit dfab31b41b4988b6dc8129840eba68f0c36c0f13.

SemaDeclAttr.cpp cannot depend on Basic's private headers
(lib/Basic/Targets/NVPTX.h)

Added: 
    

Modified: 
    clang/include/clang/Basic/Attr.td
    clang/include/clang/Basic/DiagnosticSemaKinds.td
    clang/include/clang/Sema/Sema.h
    clang/lib/Basic/Targets/NVPTX.h
    clang/lib/CodeGen/Targets/NVPTX.cpp
    clang/lib/Parse/ParseOpenMP.cpp
    clang/lib/Sema/SemaDeclAttr.cpp
    clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
    clang/test/CodeGenCUDA/launch-bounds.cu
    clang/test/SemaCUDA/launch_bounds.cu
    llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
    llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
    llvm/lib/Target/NVPTX/NVPTXUtilities.h

Removed: 
    clang/test/SemaCUDA/launch_bounds_sm_90.cu
    llvm/test/CodeGen/NVPTX/maxclusterrank.ll


################################################################################
diff  --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index fbc27d166ed9dd1..dd4d45171db4899 100644

--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1267,8 +1267,7 @@ def CUDAInvalidTarget : InheritableAttr {
 
 def CUDALaunchBounds : InheritableAttr {
   let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">];
-  let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>,
-              ExprArgument<"MaxBlocks", 1>];
+  let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>];
   let LangOpts = [CUDA];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   // An AST node is created for this attribute, but is not used by other parts

diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index d3910bbce3b7614..3f30681a378e24f 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -11853,10 +11853,6 @@ def err_sycl_special_type_num_init_method : Error<
   "types with 'sycl_special_class' attribute must have one and only one '__init' "
   "method defined">;
 
-def warn_cuda_maxclusterrank_sm_90 : Warning<
-  "maxclusterrank requires sm_90 or higher, CUDA arch provided: %0, ignoring "
-  "%1 attribute">, InGroup<IgnoredAttributes>;
-
 def err_bit_int_bad_size : Error<"%select{signed|unsigned}0 _BitInt must "
                                  "have a bit size of at least %select{2|1}0">;
 def err_bit_int_max_size : Error<"%select{signed|unsigned}0 _BitInt of bit "

diff  --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index c36088ac38fb8cf..e4366170005a044 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11061,13 +11061,12 @@ class Sema final {
   /// Create an CUDALaunchBoundsAttr attribute.
   CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI,
                                                Expr *MaxThreads,
-                                               Expr *MinBlocks,
-                                               Expr *MaxBlocks);
+                                               Expr *MinBlocks);
 
   /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular
   /// declaration.
   void AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
-                           Expr *MaxThreads, Expr *MinBlocks, Expr *MaxBlocks);
+                           Expr *MaxThreads, Expr *MinBlocks);
 
   /// AddModeAttr - Adds a mode attribute to a particular declaration.
   void AddModeAttr(Decl *D, const AttributeCommonInfo &CI, IdentifierInfo *Name,

diff  --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h
index 20d76b702a9426e..6fa0b8df97d7894 100644
--- a/clang/lib/Basic/Targets/NVPTX.h
+++ b/clang/lib/Basic/Targets/NVPTX.h
@@ -181,8 +181,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo {
 
   bool hasBitIntType() const override { return true; }
   bool hasBFloat16Type() const override { return true; }
-
-  CudaArch getGPU() const { return GPU; }
 };
 } // namespace targets
 } // namespace clang

diff  --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 64d019a10514d60..0d4bbd795648008 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -296,8 +296,8 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(
     NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxntidx",
                                             MaxThreads.getExtValue());
 
-  // min and max blocks is an optional argument for CUDALaunchBoundsAttr. If it
-  // was not specified in __launch_bounds__ or if the user specified a 0 value,
+  // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
+  // not specified in __launch_bounds__ or if the user specified a 0 value,
   // we don't have to add a PTX directive.
   if (Attr->getMinBlocks()) {
     llvm::APSInt MinBlocks(32);
@@ -307,14 +307,6 @@ void CodeGenModule::handleCUDALaunchBoundsAttr(
       NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "minctasm",
                                               MinBlocks.getExtValue());
   }
-  if (Attr->getMaxBlocks()) {
-    llvm::APSInt MaxBlocks(32);
-    MaxBlocks = Attr->getMaxBlocks()->EvaluateKnownConstInt(getContext());
-    if (MaxBlocks > 0)
-      // Create !{<func-ref>, metadata !"maxclusterrank", i32 <val>} node
-      NVPTXTargetCodeGenInfo::addNVVMMetadata(F, "maxclusterrank",
-                                              MaxBlocks.getExtValue());
-  }
 }
 
 std::unique_ptr<TargetCodeGenInfo>

diff  --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 8a8a126bf7244d4..605b97617432ed3 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3739,8 +3739,7 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) {
         continue;
       if (auto *A = Actions.CreateLaunchBoundsAttr(
               PA, PA.getArgAsExpr(0),
-              PA.getNumArgs() > 1 ? PA.getArgAsExpr(1) : nullptr,
-              PA.getNumArgs() > 2 ? PA.getArgAsExpr(2) : nullptr))
+              PA.getNumArgs() > 1 ? PA.getArgAsExpr(1) : nullptr))
         Attrs.push_back(A);
       continue;
     default:

diff  --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 49c78fb2ffd3992..090a54eedaa07d0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -10,7 +10,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "../Basic/Targets/NVPTX.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTMutationListener.h"
@@ -5609,13 +5608,6 @@ bool Sema::CheckRegparmAttr(const ParsedAttr &AL, unsigned &numParams) {
   return false;
 }
 
-// Helper to get CudaArch.
-static CudaArch getCudaArch(const TargetInfo &TI) {
-  if (!TI.getTriple().isNVPTX())
-    llvm_unreachable("getCudaArch is only valid for NVPTX triple");
-  return static_cast<const targets::NVPTXTargetInfo *>(&TI)->getGPU();
-}
-
 // Checks whether an argument of launch_bounds attribute is
 // acceptable, performs implicit conversion to Rvalue, and returns
 // non-nullptr Expr result on success. Otherwise, it returns nullptr
@@ -5659,51 +5651,34 @@ static Expr *makeLaunchBoundsArgExpr(Sema &S, Expr *E,
 
 CUDALaunchBoundsAttr *
 Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads,
-                             Expr *MinBlocks, Expr *MaxBlocks) {
-  CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks);
+                             Expr *MinBlocks) {
+  CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks);
   MaxThreads = makeLaunchBoundsArgExpr(*this, MaxThreads, TmpAttr, 0);
-  if (!MaxThreads)
+  if (MaxThreads == nullptr)
     return nullptr;
 
   if (MinBlocks) {
     MinBlocks = makeLaunchBoundsArgExpr(*this, MinBlocks, TmpAttr, 1);
-    if (!MinBlocks)
+    if (MinBlocks == nullptr)
       return nullptr;
   }
 
-  if (MaxBlocks) {
-    // '.maxclusterrank' ptx directive requires .target sm_90 or higher.
-    auto SM = getCudaArch(Context.getTargetInfo());
-    if (SM == CudaArch::UNKNOWN || SM < CudaArch::SM_90) {
-      Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90)
-          << CudaArchToString(SM) << CI << MaxBlocks->getSourceRange();
-      // Ignore it by setting MaxBlocks to null;
-      MaxBlocks = nullptr;
-    } else {
-      MaxBlocks = makeLaunchBoundsArgExpr(*this, MaxBlocks, TmpAttr, 2);
-      if (!MaxBlocks)
-        return nullptr;
-    }
-  }
-
   return ::new (Context)
-      CUDALaunchBoundsAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks);
+      CUDALaunchBoundsAttr(Context, CI, MaxThreads, MinBlocks);
 }
 
 void Sema::AddLaunchBoundsAttr(Decl *D, const AttributeCommonInfo &CI,
-                               Expr *MaxThreads, Expr *MinBlocks,
-                               Expr *MaxBlocks) {
-  if (auto *Attr = CreateLaunchBoundsAttr(CI, MaxThreads, MinBlocks, MaxBlocks))
+                               Expr *MaxThreads, Expr *MinBlocks) {
+  if (auto *Attr = CreateLaunchBoundsAttr(CI, MaxThreads, MinBlocks))
     D->addAttr(Attr);
 }
 
 static void handleLaunchBoundsAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  if (!AL.checkAtLeastNumArgs(S, 1) || !AL.checkAtMostNumArgs(S, 3))
+  if (!AL.checkAtLeastNumArgs(S, 1) || !AL.checkAtMostNumArgs(S, 2))
     return;
 
   S.AddLaunchBoundsAttr(D, AL, AL.getArgAsExpr(0),
-                        AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr,
-                        AL.getNumArgs() > 2 ? AL.getArgAsExpr(2) : nullptr);
+                        AL.getNumArgs() > 1 ? AL.getArgAsExpr(1) : nullptr);
 }
 
 static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,

diff  --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index e9877056a197921..c4f1c4e06ac83b3 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -302,15 +302,7 @@ static void instantiateDependentCUDALaunchBoundsAttr(
     MinBlocks = Result.getAs<Expr>();
   }
 
-  Expr *MaxBlocks = nullptr;
-  if (Attr.getMaxBlocks()) {
-    Result = S.SubstExpr(Attr.getMaxBlocks(), TemplateArgs);
-    if (Result.isInvalid())
-      return;
-    MaxBlocks = Result.getAs<Expr>();
-  }
-
-  S.AddLaunchBoundsAttr(New, Attr, MaxThreads, MinBlocks, MaxBlocks);
+  S.AddLaunchBoundsAttr(New, Attr, MaxThreads, MinBlocks);
 }
 
 static void

diff  --git a/clang/test/CodeGenCUDA/launch-bounds.cu b/clang/test/CodeGenCUDA/launch-bounds.cu
index 31ca9216b413e92..58bcc410201f35f 100644
--- a/clang/test/CodeGenCUDA/launch-bounds.cu
+++ b/clang/test/CodeGenCUDA/launch-bounds.cu
@@ -1,13 +1,9 @@
 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -target-cpu sm_90 -DUSE_MAX_BLOCKS -fcuda-is-device -emit-llvm -o - | FileCheck -check-prefix=CHECK_MAX_BLOCKS %s
 
 #include "Inputs/cuda.h"
 
 #define MAX_THREADS_PER_BLOCK 256
 #define MIN_BLOCKS_PER_MP     2
-#ifdef USE_MAX_BLOCKS
-#define MAX_BLOCKS_PER_MP     4
-#endif
 
 // Test both max threads per block and Min cta per sm.
 extern "C" {
@@ -21,21 +17,6 @@ Kernel1()
 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"minctasm", i32 2}
 
-#ifdef USE_MAX_BLOCKS
-// Test max threads per block and min/max cta per sm.
-extern "C" {
-__global__ void
-__launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP )
-Kernel1_sm_90()
-{
-}
-}
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"minctasm", i32 2}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxclusterrank", i32 4}
-#endif // USE_MAX_BLOCKS
-
 // Test only max threads per block. Min cta per sm defaults to 0, and
 // CodeGen doesn't output a zero value for minctasm.
 extern "C" {
@@ -69,20 +50,6 @@ template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
 
-#ifdef USE_MAX_BLOCKS
-template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
-__global__ void
-__launch_bounds__(max_threads_per_block, min_blocks_per_mp, max_blocks_per_mp)
-Kernel4_sm_90()
-{
-}
-template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"minctasm", i32 2}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxclusterrank", i32 4}
-#endif //USE_MAX_BLOCKS
-
 const int constint = 100;
 template <int max_threads_per_block, int min_blocks_per_mp>
 __global__ void
@@ -96,23 +63,6 @@ template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
 
-#ifdef USE_MAX_BLOCKS
-
-template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
-__global__ void
-__launch_bounds__(max_threads_per_block + constint,
-                  min_blocks_per_mp + max_threads_per_block,
-                  max_blocks_per_mp + max_threads_per_block)
-Kernel5_sm_90()
-{
-}
-template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
-
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"minctasm", i32 258}
-// CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxclusterrank", i32 260}
-#endif //USE_MAX_BLOCKS
-
 // Make sure we don't emit negative launch bounds values.
 __global__ void
 __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
@@ -130,26 +80,7 @@ Kernel7()
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
 // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
 
-#ifdef USE_MAX_BLOCKS
-__global__ void
-__launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP )
-Kernel7_sm_90()
-{
-}
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
-// CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
-#endif // USE_MAX_BLOCKS
-
 const char constchar = 12;
 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
-
-#ifdef USE_MAX_BLOCKS
-const char constchar_2 = 14;
-__global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"minctasm", i32 12
-// CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxclusterrank", i32 14
-#endif // USE_MAX_BLOCKS

diff  --git a/clang/test/SemaCUDA/launch_bounds.cu b/clang/test/SemaCUDA/launch_bounds.cu
index 045f4756929593c..0ca0c0145d8bbb6 100644
--- a/clang/test/SemaCUDA/launch_bounds.cu
+++ b/clang/test/SemaCUDA/launch_bounds.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -triple nvptx-unknown-unknown -target-cpu sm_75 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
 
 #include "Inputs/cuda.h"
 
@@ -11,9 +11,8 @@ __launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected-
 
 __launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
 __launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
-__launch_bounds__(128, 2, -8) void TestNegArg2(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}}
 
-__launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}}
+__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-error {{'launch_bounds' attribute takes no more than 2 arguments}}
 __launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}}
 
 int TestNoFunction __launch_bounds__(128, 7); // expected-warning {{'launch_bounds' attribute only applies to Objective-C methods, functions, and function pointers}}
@@ -48,5 +47,3 @@ __launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error
 
 template <int... Args>
 __launch_bounds__(1, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}}
-
-__launch_bounds__(1, 2, 3) void Test3Args(void); // expected-warning {{maxclusterrank requires sm_90 or higher, CUDA arch provided: sm_75, ignoring 'launch_bounds' attribute}}

diff  --git a/clang/test/SemaCUDA/launch_bounds_sm_90.cu b/clang/test/SemaCUDA/launch_bounds_sm_90.cu
deleted file mode 100644
index d5d902816c64c62..000000000000000
--- a/clang/test/SemaCUDA/launch_bounds_sm_90.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -triple nvptx-unknown-unknown -target-cpu sm_90 -verify %s
-
-#include "Inputs/cuda.h"
-
-__launch_bounds__(128, 7) void Test2Args(void);
-__launch_bounds__(128) void Test1Arg(void);
-
-__launch_bounds__(0xffffffff) void TestMaxArg(void);
-__launch_bounds__(0x100000000) void TestTooBigArg(void); // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
-__launch_bounds__(0x10000000000000000) void TestWayTooBigArg(void); // expected-error {{integer literal is too large to be represented in any integer type}}
-__launch_bounds__(1, 1, 0x10000000000000000) void TestWayTooBigArg(void); // expected-error {{integer literal is too large to be represented in any integer type}}
-
-__launch_bounds__(-128, 7) void TestNegArg1(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
-__launch_bounds__(128, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
-__launch_bounds__(-128, 1, 7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
-__launch_bounds__(128, -1, 7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
-__launch_bounds__(128, 1, -7) void TestNegArg2(void); // expected-warning {{'launch_bounds' attribute parameter 2 is negative and will be ignored}}
-// expected-warning at 20 {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
-// expected-warning at 20 {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
-__launch_bounds__(-128, -1, 7) void TestNegArg2(void);
-// expected-warning at 23 {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
-// expected-warning at 23 {{'launch_bounds' attribute parameter 2 is negative and will be ignored}}
-__launch_bounds__(-128, 1, -7) void TestNegArg2(void);
-// expected-warning at 27 {{'launch_bounds' attribute parameter 0 is negative and will be ignored}}
-// expected-warning at 27 {{'launch_bounds' attribute parameter 1 is negative and will be ignored}}
-// expected-warning at 27 {{'launch_bounds' attribute parameter 2 is negative and will be ignored}}
-__launch_bounds__(-128, -1, -7) void TestNegArg2(void);
-
-
-__launch_bounds__(1, 2, 3, 4) void Test4Args(void); // expected-error {{'launch_bounds' attribute takes no more than 3 arguments}}
-__launch_bounds__() void TestNoArgs(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}}
-
-int TestNoFunction __launch_bounds__(128, 7, 13); // expected-warning {{'launch_bounds' attribute only applies to Objective-C methods, functions, and function pointers}}
-
-__launch_bounds__(true) void TestBool(void);
-__launch_bounds__(128, 1, 128.0) void TestFP(void); // expected-error {{'launch_bounds' attribute requires parameter 2 to be an integer constant}}
-__launch_bounds__(128, 1, (void*)0) void TestNullptr(void); // expected-error {{'launch_bounds' attribute requires parameter 2 to be an integer constant}}
-
-int nonconstint = 256;
-__launch_bounds__(125, 1, nonconstint) void TestNonConstInt(void); // expected-error {{'launch_bounds' attribute requires parameter 2 to be an integer constant}}
-
-const int constint = 512;
-__launch_bounds__(128, 1, constint) void TestConstInt(void);
-__launch_bounds__(128, 1, constint * 2 + 3) void TestConstIntExpr(void);
-
-template <int a, int b, int c> __launch_bounds__(a, b, c) void TestTemplate2Args(void) {}
-template void TestTemplate2Args<128,7, 13>(void);
-
-template <int a, int b, int c>
-__launch_bounds__(a + b, c + constint, a + b + c + constint) void TestTemplateExpr(void) {}
-template void TestTemplateExpr<128+constint, 3, 7>(void);
-
-template <int... Args>
-__launch_bounds__(Args) void TestTemplateVariadicArgs(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}}
-
-template <int... Args>
-__launch_bounds__(1, 22, Args) void TestTemplateVariadicArgs2(void) {} // expected-error {{expression contains unexpanded parameter pack 'Args'}}

diff  --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index c1df063d80f5ffb..5d6127419d6318e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -537,50 +537,59 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
                                                    raw_ostream &O) const {
   // If the NVVM IR has some of reqntid* specified, then output
   // the reqntid directive, and set the unspecified ones to 1.
-  // If none of Reqntid* is specified, don't output reqntid directive.
-  unsigned Reqntidx, Reqntidy, Reqntidz;
-  Reqntidx = Reqntidy = Reqntidz = 1;
-  bool ReqSpecified = false;
-  ReqSpecified |= getReqNTIDx(F, Reqntidx);
-  ReqSpecified |= getReqNTIDy(F, Reqntidy);
-  ReqSpecified |= getReqNTIDz(F, Reqntidz);
-
-  if (ReqSpecified)
-    O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz
+  // If none of reqntid* is specified, don't output reqntid directive.
+  unsigned reqntidx, reqntidy, reqntidz;
+  bool specified = false;
+  if (!getReqNTIDx(F, reqntidx))
+    reqntidx = 1;
+  else
+    specified = true;
+  if (!getReqNTIDy(F, reqntidy))
+    reqntidy = 1;
+  else
+    specified = true;
+  if (!getReqNTIDz(F, reqntidz))
+    reqntidz = 1;
+  else
+    specified = true;
+
+  if (specified)
+    O << ".reqntid " << reqntidx << ", " << reqntidy << ", " << reqntidz
       << "\n";
 
   // If the NVVM IR has some of maxntid* specified, then output
   // the maxntid directive, and set the unspecified ones to 1.
   // If none of maxntid* is specified, don't output maxntid directive.
-  unsigned Maxntidx, Maxntidy, Maxntidz;
-  Maxntidx = Maxntidy = Maxntidz = 1;
-  bool MaxSpecified = false;
-  MaxSpecified |= getMaxNTIDx(F, Maxntidx);
-  MaxSpecified |= getMaxNTIDy(F, Maxntidy);
-  MaxSpecified |= getMaxNTIDz(F, Maxntidz);
-
-  if (MaxSpecified)
-    O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz
-      << "\n";
+  unsigned maxntidx, maxntidy, maxntidz;
+  specified = false;
+  if (!getMaxNTIDx(F, maxntidx))
+    maxntidx = 1;
+  else
+    specified = true;
+  if (!getMaxNTIDy(F, maxntidy))
+    maxntidy = 1;
+  else
+    specified = true;
+  if (!getMaxNTIDz(F, maxntidz))
+    maxntidz = 1;
+  else
+    specified = true;
 
-  unsigned Mincta = 0;
-  if (getMinCTASm(F, Mincta))
-    O << ".minnctapersm " << Mincta << "\n";
+  if (specified)
+    O << ".maxntid " << maxntidx << ", " << maxntidy << ", " << maxntidz
+      << "\n";
 
-  unsigned Maxnreg = 0;
-  if (getMaxNReg(F, Maxnreg))
-    O << ".maxnreg " << Maxnreg << "\n";
+  unsigned mincta;
+  if (getMinCTASm(F, mincta))
+    O << ".minnctapersm " << mincta << "\n";
 
-  // .maxclusterrank directive requires SM_90 or higher, make sure that we
-  // filter it out for lower SM versions, as it causes a hard ptxas crash.
-  const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
-  const auto *STI = static_cast<const NVPTXSubtarget *>(NTM.getSubtargetImpl());
-  unsigned Maxclusterrank = 0;
-  if (getMaxClusterRank(F, Maxclusterrank) && STI->getSmVersion() >= 90)
-    O << ".maxclusterrank " << Maxclusterrank << "\n";
+  unsigned maxnreg;
+  if (getMaxNReg(F, maxnreg))
+    O << ".maxnreg " << maxnreg << "\n";
 }
 
-std::string NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
   const TargetRegisterClass *RC = MRI->getRegClass(Reg);
 
   std::string Name;

diff  --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 35302889095f862..c3737f9fcca82a6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -262,10 +262,6 @@ bool getMaxNTIDz(const Function &F, unsigned &z) {
   return findOneNVVMAnnotation(&F, "maxntidz", z);
 }
 
-bool getMaxClusterRank(const Function &F, unsigned &x) {
-  return findOneNVVMAnnotation(&F, "maxclusterrank", x);
-}
-
 bool getReqNTIDx(const Function &F, unsigned &x) {
   return findOneNVVMAnnotation(&F, "reqntidx", x);
 }

diff  --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 449973bb53de75c..521f8198911f29e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -55,7 +55,6 @@ bool getReqNTIDx(const Function &, unsigned &);
 bool getReqNTIDy(const Function &, unsigned &);
 bool getReqNTIDz(const Function &, unsigned &);
 
-bool getMaxClusterRank(const Function &, unsigned &);
 bool getMinCTASm(const Function &, unsigned &);
 bool getMaxNReg(const Function &, unsigned &);
 bool isKernelFunction(const Function &);

diff  --git a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll b/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
deleted file mode 100644
index 828dd5e4cc400c6..000000000000000
--- a/llvm/test/CodeGen/NVPTX/maxclusterrank.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 | FileCheck %s --check-prefixes=CHECK,CHECK_SM_90
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK,CHECK_SM_80
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-unknown-unknown"
-
-; CHECK: .maxntid 128, 1, 1
-; CHECK: .minnctapersm 2
-; CHECK_SM_90: .maxclusterrank 8
-; CHECK_SM_80-NOT: .maxclusterrank 8
-
-; Make sure that for SM version prior to 90 `.maxclusterrank` directive is
-; sielently ignored.
-define dso_local void @_Z18TestMaxClusterRankv() {
-entry:
-  %a = alloca i32, align 4
-  store volatile i32 1, ptr %a, align 4
-  ret void
-}
-
-!nvvm.annotations = !{!0, !1, !2, !3}
-
-!0 = !{ptr @_Z18TestMaxClusterRankv, !"kernel", i32 1}
-!1 = !{ptr @_Z18TestMaxClusterRankv, !"maxntidx", i32 128}
-!2 = !{ptr @_Z18TestMaxClusterRankv, !"minctasm", i32 2}
-!3 = !{ptr @_Z18TestMaxClusterRankv, !"maxclusterrank", i32 8}