[PATCH] [CUDA] Allow using integral non-type template parameters as launch_bounds attribute arguments.

Mon Apr 13 09:47:40 PDT 2015

On Fri, Apr 10, 2015 at 6:22 PM, Artem Belevich <tra at google.com> wrote:
> Hi rsmith, eliben,
>
> Allow using integral non-type template parameters as launch_bounds attribute arguments.
>
> - Changed CUDALaunchBounds arguments from integers to Expr* so they can
>   be saved in AST for instantiation, if needed.
> - Added support for template instantiation of launch_bounds attrubute.
> - Moved evaluation of launch_bounds arguments to NVPTXTargetCodeGenInfo::
>   SetTargetAttributes() where it can be done after template instantiation.
> - Amended test cases.
>
> http://reviews.llvm.org/D8985
>
> Files:
>   include/clang/Basic/Attr.td
>   lib/CodeGen/TargetInfo.cpp
>   lib/Sema/SemaDeclAttr.cpp
>   lib/Sema/SemaTemplateInstantiateDecl.cpp
>   test/CodeGenCUDA/launch-bounds.cu
>   test/SemaCUDA/launch_bounds.cu
>
> EMAIL PREFERENCES
>   http://reviews.llvm.org/settings/panel/emailpreferences/

> Index: include/clang/Basic/Attr.td
> ===================================================================
> --- include/clang/Basic/Attr.td
> +++ include/clang/Basic/Attr.td
> @@ -581,7 +581,7 @@
>
>  def CUDALaunchBounds : InheritableAttr {
>    let Spellings = [GNU<"launch_bounds">];
> -  let Args = [IntArgument<"MaxThreads">, DefaultIntArgument<"MinBlocks", 0>];
> +  let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>];

On the wish list for someday: a ConstIntExprArgument that does
boilerplate for you. I don't expect that for your patch, but boy, it
sure would be nice. ;-)

>    let LangOpts = [CUDA];
>    let Subjects = SubjectList<[ObjCMethod, FunctionLike], WarnDiag,
>                               "ExpectedFunctionOrMethod">;
> Index: lib/CodeGen/TargetInfo.cpp
> ===================================================================
> --- lib/CodeGen/TargetInfo.cpp
> +++ lib/CodeGen/TargetInfo.cpp
> @@ -5079,16 +5079,24 @@
>      }
>      if (FD->hasAttr<CUDALaunchBoundsAttr>()) {
>        // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
> -      addNVVMMetadata(F, "maxntidx",
> -                      FD->getAttr<CUDALaunchBoundsAttr>()->getMaxThreads());
> -      // min blocks is a default argument for CUDALaunchBoundsAttr, so getting a
> -      // zero value from getMinBlocks either means it was not specified in
> -      // __launch_bounds__ or the user specified a 0 value. In both cases, we
> +      CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>();

Push this up into the if block instead of checking hasAttr and then
calling getAttr.

> +      llvm::APSInt MaxThreads;
> +      if (Attr->getMaxThreads()->EvaluateAsInt(MaxThreads, M.getContext()))
> +        addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue());

Are a negative number of max threads allowed? I noticed that -1 seems
to get passed through unhindered.

> +      else
> +        llvm_unreachable("launch_bounds arg 1 evaluation failed.");
> +
> +      // min blocks is a default argument for CUDALaunchBoundsAttr. If it was
> +      // not specified in __launch_bounds__ or the user specified a 0 value, we
>        // don't have to add a PTX directive.
> -      int MinCTASM = FD->getAttr<CUDALaunchBoundsAttr>()->getMinBlocks();
> -      if (MinCTASM > 0) {
> -        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
> -        addNVVMMetadata(F, "minctasm", MinCTASM);
> +      if (Attr->getMinBlocks()) {
> +        llvm::APSInt MinBlocks;
> +        if (Attr->getMinBlocks()->EvaluateAsInt(MinBlocks, M.getContext())) {
> +          if (MinBlocks > 0)
> +            // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
> +            addNVVMMetadata(F, "minctasm", MinBlocks.getExtValue());

Silently disallowing negative or zero values is unkind; that should
really be a warning with a test case.

> +        } else
> +          llvm_unreachable("launch_bounds arg 2 evaluation failed.");
>        }
>      }
>    }
> Index: lib/Sema/SemaDeclAttr.cpp
> ===================================================================
> --- lib/Sema/SemaDeclAttr.cpp
> +++ lib/Sema/SemaDeclAttr.cpp
> @@ -3457,20 +3457,38 @@
>    return false;
>  }
>
> +static bool isAcceptableLaunchBoundsArgument(Sema &S, Expr *E) {
> +  return E->getType()->isIntegerType() &&
> +         !E->containsUnexpandedParameterPack() &&
> +         (E->isInstantiationDependent() || E->isEvaluatable(S.Context));
> +}
> +
>  static void handleLaunchBoundsAttr(Sema &S, Decl *D,
>                                     const AttributeList &Attr) {
> -  uint32_t MaxThreads, MinBlocks = 0;
> -  if (!checkUInt32Argument(S, Attr, Attr.getArgAsExpr(0), MaxThreads, 1))
> +  if (!checkAttributeAtLeastNumArgs(S, Attr, 1) ||
> +      !checkAttributeAtMostNumArgs(S, Attr, 2))
>      return;
> -  if (Attr.getNumArgs() > 1 && !checkUInt32Argument(S, Attr,
> -                                                    Attr.getArgAsExpr(1),
> -                                                    MinBlocks, 2))

One thing the old code did that the new code does not is test the
validity of the value (for instance, that it's 32-bit instead of
64-bit) and diagnose.

> +
> +  Expr *MaxThreads = Attr.getArgAsExpr(0);
> +  if (!isAcceptableLaunchBoundsArgument(S, MaxThreads)) {
> +    S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_type)
> +        << Attr.getName() << 0 << AANT_ArgumentIntegerConstant
> +        << MaxThreads->getSourceRange();
> +    return;
> +  }
> +
> +  Expr *MinBlocks = Attr.getNumArgs() > 1 ? Attr.getArgAsExpr(1) : nullptr;
> +  if (MinBlocks && !isAcceptableLaunchBoundsArgument(S, MinBlocks)) {
> +    S.Diag(Attr.getLoc(), diag::err_attribute_argument_n_type)
> +        << Attr.getName() << 1 << AANT_ArgumentIntegerConstant
> +        << MinBlocks->getSourceRange();
>      return;
> +  }
>
> -  D->addAttr(::new (S.Context)
> -              CUDALaunchBoundsAttr(Attr.getRange(), S.Context,
> -                                  MaxThreads, MinBlocks,
> -                                  Attr.getAttributeSpellingListIndex()));
> +  D->addAttr(::new (S.Context) CUDALaunchBoundsAttr(
> +      Attr.getRange(), S.Context, MaxThreads, MinBlocks,
> +      Attr.getAttributeSpellingListIndex()));
> +  return;
>  }
>
>  static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D,
> Index: lib/Sema/SemaTemplateInstantiateDecl.cpp
> ===================================================================
> --- lib/Sema/SemaTemplateInstantiateDecl.cpp
> +++ lib/Sema/SemaTemplateInstantiateDecl.cpp
> @@ -202,6 +202,37 @@
>    New->addAttr(EIA);
>  }
>
> +static void instantiateDependentCUDALaunchBoundsAttr(
> +    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
> +    const CUDALaunchBoundsAttr *A, Decl *New) {
> +
> +  // LaunchBounds arguments are constant expressions
> +  EnterExpressionEvaluationContext Unevaluated(S, Sema::ConstantEvaluated);
> +  ExprResult MaxThreadsResult = S.SubstExpr(A->getMaxThreads(), TemplateArgs);
> +  if (MaxThreadsResult.isInvalid()) {
> +    S.Diag(A->getLocation(), diag::err_attribute_argument_n_type)
> +        << A->getSpelling() << 0 << AANT_ArgumentIntegerConstant
> +        << A->getMaxThreads()->getSourceRange();

You should be able to just pass A instead of callign getSpelling on it
(this will also properly quote things).

> +      return;
> +  }
> +
> +  Expr *MinBlocksExpr = A->getMinBlocks();
> +  if (MinBlocksExpr) {
> +    ExprResult MinBlocksResult = S.SubstExpr(A->getMinBlocks(), TemplateArgs);
> +    if (MinBlocksResult.isInvalid()) {
> +      S.Diag(A->getLocation(), diag::err_attribute_argument_n_type)
> +          << A->getSpelling() << 1 << AANT_ArgumentIntegerConstant
> +          << A->getMinBlocks()->getSourceRange();

Same here as above.

> +      return;
> +    }
> +    MinBlocksExpr = MinBlocksResult.getAs<Expr>();
> +  }
> +
> +  New->addAttr(::new (S.Context) CUDALaunchBoundsAttr(
> +      A->getRange(), S.Context, MaxThreadsResult.getAs<Expr>(), MinBlocksExpr,
> +      A->getSpellingListIndex()));
> +}
> +
>  void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
>                              const Decl *Tmpl, Decl *New,
>                              LateInstantiatedAttrVec *LateAttrs,
> @@ -233,6 +264,13 @@
>        continue;
>      }
>
> +    const CUDALaunchBoundsAttr *CUDALaunchBounds =
> +        dyn_cast<CUDALaunchBoundsAttr>(TmplAttr);
> +    if (CUDALaunchBounds) {

Fold the dyn_cast into the if statement, use auto.

> +      instantiateDependentCUDALaunchBoundsAttr(*this, TemplateArgs,
> +                                               CUDALaunchBounds, New);
> +      continue;
> +    }
>      // Existing DLL attribute on the instantiation takes precedence.
>      if (TmplAttr->getKind() == attr::DLLExport ||
>          TmplAttr->getKind() == attr::DLLImport) {
> Index: test/CodeGenCUDA/launch-bounds.cu
> ===================================================================
> --- test/CodeGenCUDA/launch-bounds.cu
> +++ test/CodeGenCUDA/launch-bounds.cu
> @@ -28,3 +28,23 @@
>  }
>
>  // CHECK: !{{[0-9]+}} = !{void ()* @Kernel2, !"maxntidx", i32 256}
> +
> +template <int max_threads_per_block>
> +__global__ void
> +__launch_bounds__(max_threads_per_block)
> +Kernel3()
> +{
> +}
> +
> +template void Kernel3<MAX_THREADS_PER_BLOCK>();
> +// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
> +
> +template <int max_threads_per_block, int min_blocks_per_mp>
> +__global__ void
> +__launch_bounds__(max_threads_per_block, min_blocks_per_mp)
> +Kernel4()
> +{
> +}
> +template void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
> +// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
> +// CHECK: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
> Index: test/SemaCUDA/launch_bounds.cu
> ===================================================================
> --- test/SemaCUDA/launch_bounds.cu
> +++ test/SemaCUDA/launch_bounds.cu
> @@ -9,3 +9,9 @@
>  __launch_bounds__() void Test4(void); // expected-error {{'launch_bounds' attribute takes at least 1 argument}}
>
>  int Test5 __launch_bounds__(128, 7); // expected-warning {{'launch_bounds' attribute only applies to functions and methods}}
> +
> +template <int a, int b> __launch_bounds__(a, b) void Test6(void) {}
> +template void Test6<128,7>(void);
> +
> +template <int a> __launch_bounds__(a) void Test7(void) {}
> +template void Test7<128>(void);
>

You're missing tests that use actual expressions. It would also be
good to capture some tests with expressions you would expect to fail.

~Aaron