[PATCH] Add attributes for AMDGPU register limits.

Mon Nov 17 10:32:06 PST 2014

On Sat, Nov 15, 2014 at 3:56 PM, Matt Arsenault
<Matthew.Arsenault at amd.com> wrote:
> This is a performance hint that can be applied to kernels
> to attempt to limit the number of used registers. This adds two attributes,
> amdgpu_num_vgpr and amdgpu_num_sgpr. Alternatively, there could only be one
> with two arguments. That version would also be usable for Evergreen, which could
> ignore the second argument as a request for the number of SGPRs.
>
> http://reviews.llvm.org/D6288
>
> Files:
>   include/clang/Basic/Attr.td
>   include/clang/Basic/AttrDocs.td
>   include/clang/Basic/DiagnosticSemaKinds.td
>   include/clang/Sema/AttributeList.h
>   lib/CodeGen/TargetInfo.cpp
>   lib/Sema/SemaDeclAttr.cpp
>   test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
>   test/SemaOpenCL/amdgpu-num-register-attrs.cl

Thank you for working on this! Comments below:

> Index: include/clang/Basic/Attr.td
> ===================================================================
> --- include/clang/Basic/Attr.td
> +++ include/clang/Basic/Attr.td
> @@ -115,6 +115,10 @@
>  def FunctionLike : SubsetSubject<DeclBase,
>                                    [{S->getFunctionType(false) != NULL}]>;
>
> +def KernelFunction : SubsetSubject<Function, [{
> +  S->hasAttr<OpenCLKernelAttr>()
> +}]>;

Doesn't CUDA have kernel functions as well?

> +
>  // HasFunctionProto is a more strict version of FunctionLike, so it should
>  // never be specified in a Subjects list along with FunctionLike (due to the
>  // inclusive nature of subject testing).
> @@ -241,6 +245,7 @@
>    let OSes = ["Win32"];
>  }
>  def TargetMips : TargetArch<["mips", "mipsel"]>;
> +def TargetAMDGPU : TargetArch<["r600"]>;
>
>  class Attr {
>    // The various ways in which an attribute can be spelled in source
> @@ -859,6 +864,22 @@
>    let Documentation = [Undocumented];
>  }
>
> +def AMDGPUNumVGPR : InheritableAttr, TargetSpecificAttr<TargetAMDGPU> {
> +  let Spellings = [GCC<"amdgpu_num_vgpr">];

This spelling would also put the attribute in the gnu:: namespace. I'm
guessing it's not supported by GCC, so you probably want to use GNU
for the spelling instead of GCC.

> +  let Args = [UnsignedArgument<"NumVGPR">];
> +  let Documentation = [AMDGPURegisterDocs];
> +  let Subjects = SubjectList<[KernelFunction], ErrorDiag,
> +                             "ExpectedKernelFunction">;
> +}
> +
> +def AMDGPUNumSGPR : InheritableAttr, TargetSpecificAttr<TargetAMDGPU> {
> +  let Spellings = [GCC<"amdgpu_num_sgpr">];

Same comment here as above.

> +  let Args = [UnsignedArgument<"NumSGPR">];
> +  let Documentation = [AMDGPURegisterDocs];

This will render the same documentation a second time. The usual
approach is to create a category in the docs, and then generate
different docs in the same category. See the calling convention
attributes as an example.

> +  let Subjects = SubjectList<[KernelFunction], ErrorDiag,
> +                              "ExpectedKernelFunction">;
> +}
> +
>  def NoSplitStack : InheritableAttr {
>    let Spellings = [GCC<"no_split_stack">];
>    let Subjects = SubjectList<[Function], ErrorDiag>;
> Index: include/clang/Basic/AttrDocs.td
> ===================================================================
> --- include/clang/Basic/AttrDocs.td
> +++ include/clang/Basic/AttrDocs.td
> @@ -673,6 +673,31 @@
>    }];
>  }
>
> +def AMDGPURegisterDocs : Documentation {
> +  let Category = DocCatFunction;
> +  let Content = [{
> +Clang supports the
> +``__attribute__((amdgpu_num_vgpr(<num_registers>)))`` and
> +``__attribute__((amdgpu_num_sgpr(<num_registers>)))`` attributes on
> +AMD GPU targets. This attribute may be attached to a kernel function
> +definition and is an optimization hint to the backend for the maximum
> +number of registers to use. This is useful in cases where register
> +limited occupancy is known to be an important factor for the performance
> +for the kernel.
> +
> +The semantics are as follows:
> +
> +- The backend will attempt to limit the number of used registers to
> +  the specified values,

"specified value" instead, since there's only one value specified?

> but the exact number used is not
> +  guaranteed. The number used may be rounded up

Rounded up to what?

> to satisfy the
> +  allocation requirements or ABI constraints of the subtarget. The
> +  backend may also use fewer registers than requested whenever
> +  possible.

Is there a problem if you specify more registers than are available?

> +
> +- 0 implies the default no limit on register usage.
> +  }];
> +}
> +
>  def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
>    let Content = [{
>  Clang supports several different calling conventions, depending on the target
> Index: include/clang/Basic/DiagnosticSemaKinds.td
> ===================================================================
> --- include/clang/Basic/DiagnosticSemaKinds.td
> +++ include/clang/Basic/DiagnosticSemaKinds.td
> @@ -2237,7 +2237,7 @@
>    "Objective-C instance methods|init methods of interface or class extension declarations|"
>    "variables, functions and classes|Objective-C protocols|"
>    "functions and global variables|structs or typedefs|"
> -  "interface or protocol declarations}1">,
> +  "interface or protocol declarations|kernel functions}1">,
>    InGroup<IgnoredAttributes>;
>  def err_attribute_wrong_decl_type : Error<warn_attribute_wrong_decl_type.Text>;
>  def warn_type_attribute_wrong_type : Warning<
> Index: include/clang/Sema/AttributeList.h
> ===================================================================
> --- include/clang/Sema/AttributeList.h
> +++ include/clang/Sema/AttributeList.h
> @@ -843,7 +843,8 @@
>    ExpectedObjectiveCProtocol,
>    ExpectedFunctionGlobalVarMethodOrProperty,
>    ExpectedStructOrTypedef,
> -  ExpectedObjectiveCInterfaceOrProtocol
> +  ExpectedObjectiveCInterfaceOrProtocol,
> +  ExpectedKernelFunction
>  };
>
>  }  // end namespace clang
> Index: lib/CodeGen/TargetInfo.cpp
> ===================================================================
> --- lib/CodeGen/TargetInfo.cpp
> +++ lib/CodeGen/TargetInfo.cpp
> @@ -20,6 +20,7 @@
>  #include "clang/AST/RecordLayout.h"
>  #include "clang/CodeGen/CGFunctionInfo.h"
>  #include "clang/Frontend/CodeGenOptions.h"
> +#include "llvm/ADT/StringExtras.h"
>  #include "llvm/ADT/Triple.h"
>  #include "llvm/IR/DataLayout.h"
>  #include "llvm/IR/Type.h"
> @@ -6172,6 +6173,40 @@
>    return AddrTyped;
>  }
>
> +//===----------------------------------------------------------------------===//
> +// AMDGPU ABI Implementation
> +//===----------------------------------------------------------------------===//
> +
> +namespace {
> +
> +class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
> +public:
> +  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
> +    : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {}
> +  void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
> +                           CodeGen::CodeGenModule &M) const override;
> +};
> +
> +}
> +
> +void AMDGPUTargetCodeGenInfo::SetTargetAttributes(const Decl *D,
> +                                                  llvm::GlobalValue *GV,
> +                                                  CodeGen::CodeGenModule &M) const {

80-col limit.

> +  const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
> +  if (!FD)
> +    return;
> +
> +  if (const AMDGPUNumVGPRAttr *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
> +    llvm::Function *F = cast<llvm::Function>(GV);
> +    F->addFnAttr("amdgpu_num_vgpr", llvm::utostr(Attr->getNumVGPR()));
> +  }
> +
> +  if (const AMDGPUNumSGPRAttr *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
> +    llvm::Function *F = cast<llvm::Function>(GV);
> +    F->addFnAttr("amdgpu_num_sgpr", llvm::utostr(Attr->getNumSGPR()));
> +  }
> +}

Can make better use of auto in here, if you'd like. ;-)

> +
>
>  //===----------------------------------------------------------------------===//
>  // SPARC v9 ABI Implementation.
> @@ -7233,6 +7268,8 @@
>    }
>    case llvm::Triple::hexagon:
>      return *(TheTargetCodeGenInfo = new HexagonTargetCodeGenInfo(Types));
> +  case llvm::Triple::r600:
> +    return *(TheTargetCodeGenInfo = new AMDGPUTargetCodeGenInfo(Types));
>    case llvm::Triple::sparcv9:
>      return *(TheTargetCodeGenInfo = new SparcV9TargetCodeGenInfo(Types));
>    case llvm::Triple::xcore:
> Index: lib/Sema/SemaDeclAttr.cpp
> ===================================================================
> --- lib/Sema/SemaDeclAttr.cpp
> +++ lib/Sema/SemaDeclAttr.cpp
> @@ -3940,6 +3940,38 @@
>      handleARMInterruptAttr(S, D, Attr);
>  }
>
> +static void handleAMDGPUNumRegAttr(Sema &S, Decl *D,
> +                                   const AttributeList &Attr,
> +                                   bool IsVGPR) {

The signature for this function should match the signatures for all
the other handle* functions. The usual pattern is to make this a
helper function that's called from a handle*Attr function. This allows
us to replace that giant switch statement at some point in the future
(which has been an ongoing project).

> +  if (!checkAttributeNumArgs(S, Attr, 1))
> +    return;

No need for this check, it's already handled by the common handler.

> +
> +  Expr *NumRegsExpr = static_cast<Expr *>(Attr.getArgAsExpr(0));
> +  llvm::APSInt NumRegs(32);
> +
> +  if (!NumRegsExpr->isIntegerConstantExpr(NumRegs, S.Context)) {
> +    S.Diag(Attr.getLoc(), diag::err_attribute_argument_type)
> +      << Attr.getName() << AANT_ArgumentIntegerConstant
> +      << NumRegsExpr->getSourceRange();
> +    return;
> +  }

Instead, you should use checkUInt32Argument().

> +
> +  if (IsVGPR) {
> +    D->addAttr(::new (S.Context)
> +               AMDGPUNumVGPRAttr(Attr.getLoc(), S.Context,
> +                                 NumRegs.getZExtValue(),
> +                                 Attr.getAttributeSpellingListIndex()));
> +  } else {
> +    D->addAttr(::new (S.Context)
> +               AMDGPUNumSGPRAttr(Attr.getLoc(), S.Context,
> +                                 NumRegs.getZExtValue(),
> +                                 Attr.getAttributeSpellingListIndex()));
> +  }
> +
> +
> +  D->addAttr(UsedAttr::CreateImplicit(S.Context));
> +}
> +
>  static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
>                                                const AttributeList& Attr) {
>    // If we try to apply it to a function pointer, don't warn, but don't
> @@ -4247,6 +4279,12 @@
>    case AttributeList::AT_NoMips16:
>      handleSimpleAttribute<NoMips16Attr>(S, D, Attr);
>      break;
> +  case AttributeList::AT_AMDGPUNumVGPR:
> +    handleAMDGPUNumRegAttr(S, D, Attr, true);
> +    break;
> +  case AttributeList::AT_AMDGPUNumSGPR:
> +    handleAMDGPUNumRegAttr(S, D, Attr, false);
> +    break;
>    case AttributeList::AT_IBAction:
>      handleSimpleAttribute<IBActionAttr>(S, D, Attr);
>      break;
> Index: test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
> ===================================================================
> --- /dev/null
> +++ test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
> @@ -0,0 +1,28 @@
> +// RUN: %clang_cc1 -triple r600-- -target-cpu tahiti -O0 -emit-llvm -o - %s | FileCheck %s
> +
> +__attribute__((amdgpu_num_vgpr(64)))
> +kernel void test_num_vgpr64() {
> +// CHECK: define void @test_num_vgpr64() [[ATTR_VGPR64:#[0-9]+]]
> +}
> +
> +__attribute__((amdgpu_num_sgpr(32)))
> +kernel void test_num_sgpr32() {
> +// CHECK: define void @test_num_sgpr32() [[ATTR_SGPR32:#[0-9]+]]
> +}
> +
> +__attribute__((amdgpu_num_vgpr(64), amdgpu_num_sgpr(32)))
> +kernel void test_num_vgpr64_sgpr32() {
> +// CHECK: define void @test_num_vgpr64_sgpr32() [[ATTR_VGPR64_SGPR32:#[0-9]+]]
> +
> +}
> +
> +__attribute__((amdgpu_num_sgpr(20), amdgpu_num_vgpr(40)))
> +kernel void test_num_sgpr20_vgpr40() {
> +// CHECK: define void @test_num_sgpr20_vgpr40() [[ATTR_SGPR20_VGPR40:#[0-9]+]]

I'd like to see a test when 0 is passed to the attribute.

> +}
> +
> +
> +// CHECK-DAG: attributes [[ATTR_VGPR64]] = { nounwind "amdgpu_num_vgpr"="64"
> +// CHECK-DAG: attributes [[ATTR_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32"
> +// CHECK-DAG: attributes [[ATTR_VGPR64_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32" "amdgpu_num_vgpr"="64"
> +// CHECK-DAG: attributes [[ATTR_SGPR20_VGPR40]] = { nounwind "amdgpu_num_sgpr"="20" "amdgpu_num_vgpr"="40"
> Index: test/SemaOpenCL/amdgpu-num-register-attrs.cl
> ===================================================================
> --- /dev/null
> +++ test/SemaOpenCL/amdgpu-num-register-attrs.cl
> @@ -0,0 +1,20 @@
> +// RUN: %clang_cc1 -triple r600-- -verify -fsyntax-only %s
> +
> +typedef __attribute__((amdgpu_num_vgpr(128))) struct FooStruct { // expected-error {{'amdgpu_num_vgpr' attribute only applies to kernel functions}}
> +  int x;
> +  float y;
> +} FooStruct;
> +
> +
> +__attribute__((amdgpu_num_vgpr("ABC"))) kernel void foo2() {} // expected-error {{'amdgpu_num_vgpr' attribute requires an integer constant}}
> +__attribute__((amdgpu_num_sgpr("ABC"))) kernel void foo3() {} // expected-error {{'amdgpu_num_sgpr' attribute requires an integer constant}}
> +
> +
> +__attribute__((amdgpu_num_vgpr(40))) void foo4() {} // expected-error {{'amdgpu_num_vgpr' attribute only applies to kernel functions}}
> +__attribute__((amdgpu_num_sgpr(64))) void foo5() {} // expected-error {{'amdgpu_num_sgpr' attribute only applies to kernel functions}}
> +
> +__attribute__((amdgpu_num_vgpr(40))) kernel void foo7() {}
> +__attribute__((amdgpu_num_sgpr(64))) kernel void foo8() {}
> +__attribute__((amdgpu_num_vgpr(40), amdgpu_num_sgpr(64))) kernel void foo9() {}

I'd like to see tests that accept 0 as an argument and that properly
handle invalid values (like values that cannot be stored in a 32-bit
number).

> +
> +
>

~Aaron