[PATCH] Add attributes for AMDGPU register limits.

Thu Dec 4 08:10:39 PST 2014

On Wed, Dec 3, 2014 at 4:39 PM, Matt Arsenault
<Matthew.Arsenault at amd.com> wrote:
> Address review comments. Don't bother emitting the attribute if it's 0
>
> http://reviews.llvm.org/D6288
>
> Files:
>   include/clang/Basic/Attr.td
>   include/clang/Basic/AttrDocs.td
>   include/clang/Basic/DiagnosticSemaKinds.td
>   include/clang/Sema/AttributeList.h
>   lib/CodeGen/TargetInfo.cpp
>   lib/Sema/SemaDeclAttr.cpp
>   test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
>   test/SemaOpenCL/amdgpu-num-register-attrs.cl

Some comments and questions below.

> Index: include/clang/Basic/Attr.td
> ===================================================================
> --- include/clang/Basic/Attr.td
> +++ include/clang/Basic/Attr.td
> @@ -115,6 +115,10 @@
>  def FunctionLike : SubsetSubject<DeclBase,
>                                    [{S->getFunctionType(false) != NULL}]>;
>
> +def KernelFunction : SubsetSubject<Function, [{
> +  S->hasAttr<OpenCLKernelAttr>()
> +}]>;

Still not quite in love with this -- CUDA has kernel functions as
well, so the name is misleading.

> +
>  // HasFunctionProto is a more strict version of FunctionLike, so it should
>  // never be specified in a Subjects list along with FunctionLike (due to the
>  // inclusive nature of subject testing).
> @@ -243,6 +247,7 @@
>    let OSes = ["Win32"];
>  }
>  def TargetMips : TargetArch<["mips", "mipsel"]>;
> +def TargetAMDGPU : TargetArch<["r600"]>;
>
>  class Attr {
>    // The various ways in which an attribute can be spelled in source
> @@ -885,6 +890,22 @@
>    let Documentation = [Undocumented];
>  }
>
> +def AMDGPUNumVGPR : InheritableAttr, TargetSpecificAttr<TargetAMDGPU> {
> +  let Spellings = [GNU<"amdgpu_num_vgpr">];
> +  let Args = [UnsignedArgument<"NumVGPR">];
> +  let Documentation = [AMDGPUNumVGPRDocs];
> +  let Subjects = SubjectList<[KernelFunction], ErrorDiag,
> +                             "ExpectedKernelFunction">;
> +}
> +
> +def AMDGPUNumSGPR : InheritableAttr, TargetSpecificAttr<TargetAMDGPU> {
> +  let Spellings = [GNU<"amdgpu_num_sgpr">];
> +  let Args = [UnsignedArgument<"NumSGPR">];
> +  let Documentation = [AMDGPUNumSGPRDocs];
> +  let Subjects = SubjectList<[KernelFunction], ErrorDiag,
> +                              "ExpectedKernelFunction">;
> +}
> +
>  def NoSplitStack : InheritableAttr {
>    let Spellings = [GCC<"no_split_stack">];
>    let Subjects = SubjectList<[Function], ErrorDiag>;
> Index: include/clang/Basic/AttrDocs.td
> ===================================================================
> --- include/clang/Basic/AttrDocs.td
> +++ include/clang/Basic/AttrDocs.td
> @@ -673,6 +673,64 @@
>    }];
>  }
>
> +def DocCatAMDGPURegisterAttributes :
> +  DocumentationCategory<"AMD GPU Register Attributes"> {
> +  let Content = [{
> +Clang supports attributes for controlling register usage on AMD GPU
> +targets. These attributes may be attached to a kernel function
> +definition and is an optimization hint to the backend for the maximum
> +number of registers to use. This is useful in cases where register
> +limited occupancy is known to be an important factor for the
> +performance for the kernel.
> +
> +The semantics are as follows:
> +
> +- The backend will attempt to limit the number of used registers to
> +  the specified value, but the exact number used is not
> +  guaranteed. The number used may be rounded up to satisfy the
> +  allocation requirements or ABI constraints of the subtarget. For
> +  example, on Southern Islands VGPRs may only be allocated in
> +  increments of 4, so requesting a limit of 39 VGPRs will really
> +  attempt to use up to 40. Requesting more registers than the
> +  subtarget supports will truncate to the maximum allowed The backend

Missing a period between "allowed" and "The".

> +  may also use fewer registers than requested whenever possible.
> +
> +- 0 implies the default no limit on register usage.
> +
> +- Ignored on older subtargets.

What's an "older" subtarget? Or is this left purposefully vague so
that current subtargets may become "older" in the future and behavior
changes?

> +}];
> +}
> +
> +
> +def AMDGPUNumVGPRDocs : Documentation {
> +  let Category = DocCatAMDGPURegisterAttributes;
> +  let Content = [{
> +Clang supports the
> +``__attribute__((amdgpu_num_vgpr(<num_registers>)))`` attribute on AMD
> +Southern Islands GPUs and later for controlling the number of vector
> +registers. A typical value would be between 4 and 256 in increments
> +of 4.
> +}];
> +}
> +
> +def AMDGPUNumSGPRDocs : Documentation {
> +  let Category = DocCatAMDGPURegisterAttributes;
> +  let Content = [{
> +
> +Clang supports the
> +``__attribute__((amdgpu_num_sgpr(<num_registers>)))`` attribute on AMD
> +Southern Islands GPUs and later for controlling the number of scalar
> +registers. A typical value would be between 8 and 104 in increments of
> +8.
> +
> +Due to common instruction constraints, an additional 2-4 SGPRs are
> +typically required for internal use depending on features used. This
> +value is a hint for the total number of SGPRs to use, and not the
> +number of user SGPRs, so no special consideration needs to be given
> +for these.
> +}];
> +}
> +
>  def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
>    let Content = [{
>  Clang supports several different calling conventions, depending on the target
> Index: include/clang/Basic/DiagnosticSemaKinds.td
> ===================================================================
> --- include/clang/Basic/DiagnosticSemaKinds.td
> +++ include/clang/Basic/DiagnosticSemaKinds.td
> @@ -2244,7 +2244,7 @@
>    "Objective-C instance methods|init methods of interface or class extension declarations|"
>    "variables, functions and classes|Objective-C protocols|"
>    "functions and global variables|structs or typedefs|"
> -  "interface or protocol declarations}1">,
> +  "interface or protocol declarations|kernel functions}1">,
>    InGroup<IgnoredAttributes>;
>  def err_attribute_wrong_decl_type : Error<warn_attribute_wrong_decl_type.Text>;
>  def warn_type_attribute_wrong_type : Warning<
> Index: include/clang/Sema/AttributeList.h
> ===================================================================
> --- include/clang/Sema/AttributeList.h
> +++ include/clang/Sema/AttributeList.h
> @@ -844,7 +844,8 @@
>    ExpectedObjectiveCProtocol,
>    ExpectedFunctionGlobalVarMethodOrProperty,
>    ExpectedStructOrTypedef,
> -  ExpectedObjectiveCInterfaceOrProtocol
> +  ExpectedObjectiveCInterfaceOrProtocol,
> +  ExpectedKernelFunction
>  };
>
>  }  // end namespace clang
> Index: lib/CodeGen/TargetInfo.cpp
> ===================================================================
> --- lib/CodeGen/TargetInfo.cpp
> +++ lib/CodeGen/TargetInfo.cpp
> @@ -20,6 +20,7 @@
>  #include "clang/AST/RecordLayout.h"
>  #include "clang/CodeGen/CGFunctionInfo.h"
>  #include "clang/Frontend/CodeGenOptions.h"
> +#include "llvm/ADT/StringExtras.h"
>  #include "llvm/ADT/Triple.h"
>  #include "llvm/IR/DataLayout.h"
>  #include "llvm/IR/Type.h"
> @@ -6082,6 +6083,45 @@
>    return AddrTyped;
>  }
>
> +//===----------------------------------------------------------------------===//
> +// AMDGPU ABI Implementation
> +//===----------------------------------------------------------------------===//
> +
> +namespace {
> +
> +class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
> +public:
> +  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
> +    : TargetCodeGenInfo(new DefaultABIInfo(CGT)) {}
> +  void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
> +                           CodeGen::CodeGenModule &M) const override;
> +};
> +
> +}
> +
> +void AMDGPUTargetCodeGenInfo::SetTargetAttributes(
> +  const Decl *D,
> +  llvm::GlobalValue *GV,
> +  CodeGen::CodeGenModule &M) const {
> +  const FunctionDecl *FD = dyn_cast<FunctionDecl>(D);
> +  if (!FD)
> +    return;
> +
> +  if (const auto Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
> +    llvm::Function *F = cast<llvm::Function>(GV);
> +    uint32_t NumVGPR = Attr->getNumVGPR();
> +    if (NumVGPR != 0)
> +      F->addFnAttr("amdgpu_num_vgpr", llvm::utostr(NumVGPR));
> +  }
> +
> +  if (const auto Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
> +    llvm::Function *F = cast<llvm::Function>(GV);
> +    unsigned NumSGPR = Attr->getNumSGPR();
> +    if (NumSGPR != 0)
> +      F->addFnAttr("amdgpu_num_sgpr", llvm::utostr(NumSGPR));
> +  }
> +}
> +
>
>  //===----------------------------------------------------------------------===//
>  // SPARC v9 ABI Implementation.
> @@ -7143,6 +7183,8 @@
>    }
>    case llvm::Triple::hexagon:
>      return *(TheTargetCodeGenInfo = new HexagonTargetCodeGenInfo(Types));
> +  case llvm::Triple::r600:
> +    return *(TheTargetCodeGenInfo = new AMDGPUTargetCodeGenInfo(Types));
>    case llvm::Triple::sparcv9:
>      return *(TheTargetCodeGenInfo = new SparcV9TargetCodeGenInfo(Types));
>    case llvm::Triple::xcore:
> Index: lib/Sema/SemaDeclAttr.cpp
> ===================================================================
> --- lib/Sema/SemaDeclAttr.cpp
> +++ lib/Sema/SemaDeclAttr.cpp
> @@ -3940,6 +3940,42 @@
>      handleARMInterruptAttr(S, D, Attr);
>  }
>
> +static int32_t handleAMDGPUNumRegAttrCommon(Sema &S, Decl *D,
> +                                            const AttributeList &Attr) {
> +  Expr *NumRegsExpr = static_cast<Expr *>(Attr.getArgAsExpr(0));
> +
> +  uint32_t NumRegs = -1;
> +  if (!checkUInt32Argument(S, Attr, NumRegsExpr, NumRegs))
> +    return -1;
> +
> +  D->addAttr(UsedAttr::CreateImplicit(S.Context));

Why is this automatically adding a Used attribute? (I know the
original code was as well, but I'm not certain I understand it.)

> +  return NumRegs;

Implicit conversion from unsigned to signed. The return value being
signed also causes implicit conversions from signed to unsigned in the
caller when creating the attributes. Why not return a bool and return
the number of registers by reference?

> +}
> +
> +static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D,
> +                                    const AttributeList &Attr) {
> +  int32_t NumRegs = handleAMDGPUNumRegAttrCommon(S, D, Attr);
> +  if (NumRegs == -1)
> +    return;
> +
> +  D->addAttr(::new (S.Context)
> +             AMDGPUNumVGPRAttr(Attr.getLoc(), S.Context,
> +                               NumRegs,
> +                               Attr.getAttributeSpellingListIndex()));
> +}
> +
> +static void handleAMDGPUNumSGPRAttr(Sema &S, Decl *D,
> +                                    const AttributeList &Attr) {
> +  int32_t NumRegs = handleAMDGPUNumRegAttrCommon(S, D, Attr);
> +  if (NumRegs == -1)
> +    return;
> +
> +  D->addAttr(::new (S.Context)
> +             AMDGPUNumSGPRAttr(Attr.getLoc(), S.Context,
> +                               NumRegs,
> +                               Attr.getAttributeSpellingListIndex()));
> +}
> +
>  static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
>                                                const AttributeList& Attr) {
>    // If we try to apply it to a function pointer, don't warn, but don't
> @@ -4247,6 +4283,12 @@
>    case AttributeList::AT_NoMips16:
>      handleSimpleAttribute<NoMips16Attr>(S, D, Attr);
>      break;
> +  case AttributeList::AT_AMDGPUNumVGPR:
> +    handleAMDGPUNumVGPRAttr(S, D, Attr);
> +    break;
> +  case AttributeList::AT_AMDGPUNumSGPR:
> +    handleAMDGPUNumSGPRAttr(S, D, Attr);
> +    break;
>    case AttributeList::AT_IBAction:
>      handleSimpleAttribute<IBActionAttr>(S, D, Attr);
>      break;
> Index: test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
> ===================================================================
> --- /dev/null
> +++ test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
> @@ -0,0 +1,41 @@
> +// RUN: %clang_cc1 -triple r600-- -target-cpu tahiti -O0 -emit-llvm -o - %s | FileCheck %s
> +
> +__attribute__((amdgpu_num_vgpr(64)))
> +kernel void test_num_vgpr64() {
> +// CHECK: define void @test_num_vgpr64() [[ATTR_VGPR64:#[0-9]+]]
> +}
> +
> +__attribute__((amdgpu_num_sgpr(32)))
> +kernel void test_num_sgpr32() {
> +// CHECK: define void @test_num_sgpr32() [[ATTR_SGPR32:#[0-9]+]]
> +}
> +
> +__attribute__((amdgpu_num_vgpr(64), amdgpu_num_sgpr(32)))
> +kernel void test_num_vgpr64_sgpr32() {
> +// CHECK: define void @test_num_vgpr64_sgpr32() [[ATTR_VGPR64_SGPR32:#[0-9]+]]
> +
> +}
> +
> +__attribute__((amdgpu_num_sgpr(20), amdgpu_num_vgpr(40)))
> +kernel void test_num_sgpr20_vgpr40() {
> +// CHECK: define void @test_num_sgpr20_vgpr40() [[ATTR_SGPR20_VGPR40:#[0-9]+]]
> +}
> +
> +__attribute__((amdgpu_num_vgpr(0)))
> +kernel void test_num_vgpr0() {
> +}
> +
> +__attribute__((amdgpu_num_sgpr(0)))
> +kernel void test_num_sgpr0() {
> +}
> +
> +__attribute__((amdgpu_num_vgpr(0), amdgpu_num_sgpr(0)))
> +kernel void test_num_vgpr0_sgpr0() {
> +}
> +
> +// CHECK-DAG-NOT: "amdgpu_num_vgpr"="0"
> +// CHECK-DAG-NOT: "amdgpu_num_sgpr"="0"
> +// CHECK-DAG: attributes [[ATTR_VGPR64]] = { nounwind "amdgpu_num_vgpr"="64"
> +// CHECK-DAG: attributes [[ATTR_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32"
> +// CHECK-DAG: attributes [[ATTR_VGPR64_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32" "amdgpu_num_vgpr"="64"
> +// CHECK-DAG: attributes [[ATTR_SGPR20_VGPR40]] = { nounwind "amdgpu_num_sgpr"="20" "amdgpu_num_vgpr"="40"
> Index: test/SemaOpenCL/amdgpu-num-register-attrs.cl
> ===================================================================
> --- /dev/null
> +++ test/SemaOpenCL/amdgpu-num-register-attrs.cl
> @@ -0,0 +1,34 @@
> +// RUN: %clang_cc1 -triple r600-- -verify -fsyntax-only %s
> +
> +typedef __attribute__((amdgpu_num_vgpr(128))) struct FooStruct { // expected-error {{'amdgpu_num_vgpr' attribute only applies to kernel functions}}
> +  int x;
> +  float y;
> +} FooStruct;
> +
> +
> +__attribute__((amdgpu_num_vgpr("ABC"))) kernel void foo2() {} // expected-error {{'amdgpu_num_vgpr' attribute requires an integer constant}}
> +__attribute__((amdgpu_num_sgpr("ABC"))) kernel void foo3() {} // expected-error {{'amdgpu_num_sgpr' attribute requires an integer constant}}
> +
> +
> +__attribute__((amdgpu_num_vgpr(40))) void foo4() {} // expected-error {{'amdgpu_num_vgpr' attribute only applies to kernel functions}}
> +__attribute__((amdgpu_num_sgpr(64))) void foo5() {} // expected-error {{'amdgpu_num_sgpr' attribute only applies to kernel functions}}
> +
> +__attribute__((amdgpu_num_vgpr(40))) kernel void foo7() {}
> +__attribute__((amdgpu_num_sgpr(64))) kernel void foo8() {}
> +__attribute__((amdgpu_num_vgpr(40), amdgpu_num_sgpr(64))) kernel void foo9() {}
> +
> +// Check 0 VGPR is accepted.
> +__attribute__((amdgpu_num_vgpr(0))) kernel void foo10() {}
> +
> +// Check 0 SGPR is accepted.
> +__attribute__((amdgpu_num_sgpr(0))) kernel void foo11() {}
> +
> +// Check both 0 SGPR and VGPR is accepted.
> +__attribute__((amdgpu_num_vgpr(0), amdgpu_num_sgpr(0))) kernel void foo12() {}
> +
> +// Too large VGPR value.
> +__attribute__((amdgpu_num_vgpr(4294967296))) kernel void foo13() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
> +
> +__attribute__((amdgpu_num_sgpr(4294967296))) kernel void foo14() {} // expected-error {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
> +
> +__attribute__((amdgpu_num_sgpr(4294967296), amdgpu_num_vgpr(4294967296))) kernel void foo15() {} // expected-error 2 {{integer constant expression evaluates to value 4294967296 that cannot be represented in a 32-bit unsigned integer type}}
>

Thanks!

~Aaron