[clang] 30eeb74 - clang: Use byref for aggregate kernel arguments

Matt Arsenault via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 6 12:52:33 PDT 2020


Author: Matt Arsenault
Date: 2020-08-06T15:52:26-04:00
New Revision: 30eeb742f1d11d7a7036e3b8a3bffc1dfd252082

URL: https://github.com/llvm/llvm-project/commit/30eeb742f1d11d7a7036e3b8a3bffc1dfd252082
DIFF: https://github.com/llvm/llvm-project/commit/30eeb742f1d11d7a7036e3b8a3bffc1dfd252082.diff

LOG: clang: Use byref for aggregate kernel arguments

Add address space to indirect abi info and use it for kernels.

Previously, indirect arguments assumed assumed a stack passed object
in the alloca address space using byval. A stack pointer is unsuitable
for kernel arguments, which are passed in a separate, constant buffer
with a different address space.

Start using the new byref for aggregate kernel arguments. Previously
these were emitted as raw struct arguments, and turned into loads in
the backend. These will lower identically, although with byref you now
have the option of applying an explicit alignment. In the future, a
reasonable implementation would use byref for all kernel arguments
(this would be a practical problem at the moment due to losing things
like noalias on pointer arguments).

This is mostly to avoid fighting the optimizer's treatment of
aggregate load/store. SROA and instcombine both turn aggregate loads
and stores into a long sequence of element loads and stores, rather
than the optimizable memcpy I would expect in this situation. Now an
explicit memcpy will be introduced up-front which is better understood
and helps eliminate the alloca in more situations.

This skips using byref in the case where HIP kernel pointer arguments
in structs are promoted to global pointers. At minimum an additional
patch is needed to allow coercion with indirect arguments. This also
skips using it for OpenCL due to the current workaround used to
support kernels calling kernels. Distinct function bodies would need
to be generated up front instead of emitting an illegal call.

Added: 
    

Modified: 
    clang/include/clang/CodeGen/CGFunctionInfo.h
    clang/lib/CodeGen/CGCall.cpp
    clang/lib/CodeGen/TargetInfo.cpp
    clang/test/CodeGenCUDA/kernel-args.cu
    clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
index eaf5a3d5aad7..253ef946ce15 100644
--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
+++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
@@ -44,10 +44,23 @@ class ABIArgInfo {
     /// but also emit a zero/sign extension attribute.
     Extend,
 
-    /// Indirect - Pass the argument indirectly via a hidden pointer
-    /// with the specified alignment (0 indicates default alignment).
+    /// Indirect - Pass the argument indirectly via a hidden pointer with the
+    /// specified alignment (0 indicates default alignment) and address space.
     Indirect,
 
+    /// IndirectAliased - Similar to Indirect, but the pointer may be to an
+    /// object that is otherwise referenced.  The object is known to not be
+    /// modified through any other references for the duration of the call, and
+    /// the callee must not itself modify the object.  Because C allows
+    /// parameter variables to be modified and guarantees that they have unique
+    /// addresses, the callee must defensively copy the object into a local
+    /// variable if it might be modified or its address might be compared.
+    /// Since those are uncommon, in principle this convention allows programs
+    /// to avoid copies in more situations.  However, it may introduce *extra*
+    /// copies if the callee fails to prove that a copy is unnecessary and the
+    /// caller naturally produces an unaliased object for the argument.
+    IndirectAliased,
+
     /// Ignore - Ignore the argument (treat as void). Useful for void and
     /// empty structs.
     Ignore,
@@ -86,6 +99,7 @@ class ABIArgInfo {
     unsigned AllocaFieldIndex; // isInAlloca()
   };
   Kind TheKind;
+  unsigned IndirectAddrSpace : 24; // isIndirect()
   bool PaddingInReg : 1;
   bool InAllocaSRet : 1;    // isInAlloca()
   bool InAllocaIndirect : 1;// isInAlloca()
@@ -97,7 +111,8 @@ class ABIArgInfo {
   bool SignExt : 1;         // isExtend()
 
   bool canHavePaddingType() const {
-    return isDirect() || isExtend() || isIndirect() || isExpand();
+    return isDirect() || isExtend() || isIndirect() || isIndirectAliased() ||
+           isExpand();
   }
   void setPaddingType(llvm::Type *T) {
     assert(canHavePaddingType());
@@ -112,9 +127,10 @@ class ABIArgInfo {
 public:
   ABIArgInfo(Kind K = Direct)
       : TypeData(nullptr), PaddingType(nullptr), DirectOffset(0), TheKind(K),
-        PaddingInReg(false), InAllocaSRet(false), InAllocaIndirect(false),
-        IndirectByVal(false), IndirectRealign(false), SRetAfterThis(false),
-        InReg(false), CanBeFlattened(false), SignExt(false) {}
+        IndirectAddrSpace(0), PaddingInReg(false), InAllocaSRet(false),
+        InAllocaIndirect(false), IndirectByVal(false), IndirectRealign(false),
+        SRetAfterThis(false), InReg(false), CanBeFlattened(false),
+        SignExt(false) {}
 
   static ABIArgInfo getDirect(llvm::Type *T = nullptr, unsigned Offset = 0,
                               llvm::Type *Padding = nullptr,
@@ -180,6 +196,19 @@ class ABIArgInfo {
     AI.setPaddingType(Padding);
     return AI;
   }
+
+  /// Pass this in memory using the IR byref attribute.
+  static ABIArgInfo getIndirectAliased(CharUnits Alignment, unsigned AddrSpace,
+                                       bool Realign = false,
+                                       llvm::Type *Padding = nullptr) {
+    auto AI = ABIArgInfo(IndirectAliased);
+    AI.setIndirectAlign(Alignment);
+    AI.setIndirectRealign(Realign);
+    AI.setPaddingType(Padding);
+    AI.setIndirectAddrSpace(AddrSpace);
+    return AI;
+  }
+
   static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true,
                                      bool Realign = false) {
     auto AI = getIndirect(Alignment, ByVal, Realign);
@@ -259,6 +288,7 @@ class ABIArgInfo {
   bool isExtend() const { return TheKind == Extend; }
   bool isIgnore() const { return TheKind == Ignore; }
   bool isIndirect() const { return TheKind == Indirect; }
+  bool isIndirectAliased() const { return TheKind == IndirectAliased; }
   bool isExpand() const { return TheKind == Expand; }
   bool isCoerceAndExpand() const { return TheKind == CoerceAndExpand; }
 
@@ -338,11 +368,11 @@ class ABIArgInfo {
 
   // Indirect accessors
   CharUnits getIndirectAlign() const {
-    assert(isIndirect() && "Invalid kind!");
+    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
     return CharUnits::fromQuantity(IndirectAlign);
   }
   void setIndirectAlign(CharUnits IA) {
-    assert(isIndirect() && "Invalid kind!");
+    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
     IndirectAlign = IA.getQuantity();
   }
 
@@ -355,12 +385,22 @@ class ABIArgInfo {
     IndirectByVal = IBV;
   }
 
+  unsigned getIndirectAddrSpace() const {
+    assert(isIndirectAliased() && "Invalid kind!");
+    return IndirectAddrSpace;
+  }
+
+  void setIndirectAddrSpace(unsigned AddrSpace) {
+    assert(isIndirectAliased() && "Invalid kind!");
+    IndirectAddrSpace = AddrSpace;
+  }
+
   bool getIndirectRealign() const {
-    assert(isIndirect() && "Invalid kind!");
+    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
     return IndirectRealign;
   }
   void setIndirectRealign(bool IR) {
-    assert(isIndirect() && "Invalid kind!");
+    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
     IndirectRealign = IR;
   }
 

diff  --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 3f27e1bb8955..9d225b23e3c3 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1470,6 +1470,7 @@ void ClangToLLVMArgMapping::construct(const ASTContext &Context,
       break;
     }
     case ABIArgInfo::Indirect:
+    case ABIArgInfo::IndirectAliased:
       IRArgs.NumberOfArgs = 1;
       break;
     case ABIArgInfo::Ignore:
@@ -1560,6 +1561,7 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
   const ABIArgInfo &retAI = FI.getReturnInfo();
   switch (retAI.getKind()) {
   case ABIArgInfo::Expand:
+  case ABIArgInfo::IndirectAliased:
     llvm_unreachable("Invalid ABI kind for return argument");
 
   case ABIArgInfo::Extend:
@@ -1637,7 +1639,12 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
           CGM.getDataLayout().getAllocaAddrSpace());
       break;
     }
-
+    case ABIArgInfo::IndirectAliased: {
+      assert(NumIRArgs == 1);
+      llvm::Type *LTy = ConvertTypeForMem(it->type);
+      ArgTypes[FirstIRArg] = LTy->getPointerTo(ArgInfo.getIndirectAddrSpace());
+      break;
+    }
     case ABIArgInfo::Extend:
     case ABIArgInfo::Direct: {
       // Fast-isel and the optimizer generally like scalar values better than
@@ -2101,6 +2108,7 @@ void CodeGenModule::ConstructAttributeList(
     break;
 
   case ABIArgInfo::Expand:
+  case ABIArgInfo::IndirectAliased:
     llvm_unreachable("Invalid ABI kind for return argument");
   }
 
@@ -2184,6 +2192,9 @@ void CodeGenModule::ConstructAttributeList(
       if (AI.getIndirectByVal())
         Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType));
 
+      // TODO: We could add the byref attribute if not byval, but it would
+      // require updating many testcases.
+
       CharUnits Align = AI.getIndirectAlign();
 
       // In a byval argument, it is important that the required
@@ -2206,6 +2217,13 @@ void CodeGenModule::ConstructAttributeList(
       // byval disables readnone and readonly.
       FuncAttrs.removeAttribute(llvm::Attribute::ReadOnly)
         .removeAttribute(llvm::Attribute::ReadNone);
+
+      break;
+    }
+    case ABIArgInfo::IndirectAliased: {
+      CharUnits Align = AI.getIndirectAlign();
+      Attrs.addByRefAttr(getTypes().ConvertTypeForMem(ParamType));
+      Attrs.addAlignmentAttr(Align.getQuantity());
       break;
     }
     case ABIArgInfo::Ignore:
@@ -2434,16 +2452,19 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
       break;
     }
 
-    case ABIArgInfo::Indirect: {
+    case ABIArgInfo::Indirect:
+    case ABIArgInfo::IndirectAliased: {
       assert(NumIRArgs == 1);
       Address ParamAddr =
           Address(Fn->getArg(FirstIRArg), ArgI.getIndirectAlign());
 
       if (!hasScalarEvaluationKind(Ty)) {
-        // Aggregates and complex variables are accessed by reference.  All we
-        // need to do is realign the value, if requested.
+        // Aggregates and complex variables are accessed by reference. All we
+        // need to do is realign the value, if requested. Also, if the address
+        // may be aliased, copy it to ensure that the parameter variable is
+        // mutable and has a unique adress, as C requires.
         Address V = ParamAddr;
-        if (ArgI.getIndirectRealign()) {
+        if (ArgI.getIndirectRealign() || ArgI.isIndirectAliased()) {
           Address AlignedTemp = CreateMemTemp(Ty, "coerce");
 
           // Copy from the incoming argument pointer to the temporary with the
@@ -3285,8 +3306,8 @@ void CodeGenFunction::EmitFunctionEpilog(const CGFunctionInfo &FI,
     }
     break;
   }
-
   case ABIArgInfo::Expand:
+  case ABIArgInfo::IndirectAliased:
     llvm_unreachable("Invalid ABI kind for return argument");
   }
 
@@ -4413,7 +4434,8 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       break;
     }
 
-    case ABIArgInfo::Indirect: {
+    case ABIArgInfo::Indirect:
+    case ABIArgInfo::IndirectAliased: {
       assert(NumIRArgs == 1);
       if (!I->isAggregate()) {
         // Make a temporary alloca to pass the argument.
@@ -4668,12 +4690,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
       break;
     }
 
-    case ABIArgInfo::Expand:
+    case ABIArgInfo::Expand: {
       unsigned IRArgPos = FirstIRArg;
       ExpandTypeToArgs(I->Ty, *I, IRFuncTy, IRCallArgs, IRArgPos);
       assert(IRArgPos == FirstIRArg + NumIRArgs);
       break;
     }
+    }
   }
 
   const CGCallee &ConcreteCallee = Callee.prepareConcreteCallee(*this);
@@ -5084,6 +5107,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     }
 
     case ABIArgInfo::Expand:
+    case ABIArgInfo::IndirectAliased:
       llvm_unreachable("Invalid ABI kind for return argument");
     }
 

diff  --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 8c89e6bc2a64..e94945592123 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -257,6 +257,11 @@ LLVM_DUMP_METHOD void ABIArgInfo::dump() const {
        << " ByVal=" << getIndirectByVal()
        << " Realign=" << getIndirectRealign();
     break;
+  case IndirectAliased:
+    OS << "Indirect Align=" << getIndirectAlign().getQuantity()
+       << " AadrSpace=" << getIndirectAddrSpace()
+       << " Realign=" << getIndirectRealign();
+    break;
   case Expand:
     OS << "Expand";
     break;
@@ -1989,6 +1994,7 @@ static bool isArgInAlloca(const ABIArgInfo &Info) {
   case ABIArgInfo::InAlloca:
     return true;
   case ABIArgInfo::Ignore:
+  case ABIArgInfo::IndirectAliased:
     return false;
   case ABIArgInfo::Indirect:
   case ABIArgInfo::Direct:
@@ -8790,18 +8796,31 @@ ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
 
   // TODO: Can we omit empty structs?
 
-  llvm::Type *LTy = nullptr;
   if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
-    LTy = CGT.ConvertType(QualType(SeltTy, 0));
+    Ty = QualType(SeltTy, 0);
 
+  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
+  llvm::Type *LTy = OrigLTy;
   if (getContext().getLangOpts().HIP) {
-    if (!LTy)
-      LTy = CGT.ConvertType(Ty);
     LTy = coerceKernelArgumentType(
-        LTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
+        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
         /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
   }
 
+  // FIXME: Should also use this for OpenCL, but it requires addressing the
+  // problem of kernels being called.
+  //
+  // FIXME: This doesn't apply the optimization of coercing pointers in structs
+  // to global address space when using byref. This would require implementing a
+  // new kind of coercion of the in-memory type when for indirect arguments.
+  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
+      isAggregateTypeForABI(Ty)) {
+    return ABIArgInfo::getIndirectAliased(
+        getContext().getTypeAlignInChars(Ty),
+        getContext().getTargetAddressSpace(LangAS::opencl_constant),
+        false /*Realign*/, nullptr /*Padding*/);
+  }
+
   // If we set CanBeFlattened to true, CodeGen will expand the struct to its
   // individual elements, which confuses the Clover OpenCL backend; therefore we
   // have to set it to false here. Other args of getDirect() are just defaults.
@@ -9377,6 +9396,7 @@ Address SparcV9ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
   }
 
   case ABIArgInfo::Indirect:
+  case ABIArgInfo::IndirectAliased:
     Stride = SlotSize;
     ArgAddr = Builder.CreateElementBitCast(Addr, ArgPtrTy, "indirect");
     ArgAddr = Address(Builder.CreateLoad(ArgAddr, "indirect.arg"),
@@ -9742,6 +9762,7 @@ Address XCoreABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
     ArgSize = ArgSize.alignTo(SlotSize);
     break;
   case ABIArgInfo::Indirect:
+  case ABIArgInfo::IndirectAliased:
     Val = Builder.CreateElementBitCast(AP, ArgPtrTy);
     Val = Address(Builder.CreateLoad(Val), TypeAlign);
     ArgSize = SlotSize;

diff  --git a/clang/test/CodeGenCUDA/kernel-args.cu b/clang/test/CodeGenCUDA/kernel-args.cu
index 74d91b4d57f3..1bc613c699f2 100644
--- a/clang/test/CodeGenCUDA/kernel-args.cu
+++ b/clang/test/CodeGenCUDA/kernel-args.cu
@@ -8,14 +8,14 @@ struct A {
   int a[32];
 };
 
-// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A %x.coerce)
+// AMDGCN: define amdgpu_kernel void @_Z6kernel1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
 // NVPTX: define void @_Z6kernel1A(%struct.A* byval(%struct.A) align 4 %x)
 __global__ void kernel(A x) {
 }
 
 class Kernel {
 public:
-  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A %x.coerce)
+  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel12memberKernelE1A(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}})
   // NVPTX: define void @_ZN6Kernel12memberKernelE1A(%struct.A* byval(%struct.A) align 4 %x)
   static __global__ void memberKernel(A x){}
   template<typename T> static __global__ void templateMemberKernel(T x) {}
@@ -29,11 +29,11 @@ void launch(void*);
 
 void test() {
   Kernel K;
-  // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A %x.coerce)
+  // AMDGCN: define amdgpu_kernel void @_Z14templateKernelI1AEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
   // NVPTX: define void @_Z14templateKernelI1AEvT_(%struct.A* byval(%struct.A) align 4 %x)
   launch((void*)templateKernel<A>);
 
-  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A %x.coerce)
+  // AMDGCN: define amdgpu_kernel void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A addrspace(4)* byref(%struct.A) align 4 %{{.+}}
   // NVPTX: define void @_ZN6Kernel20templateMemberKernelI1AEEvT_(%struct.A* byval(%struct.A) align 4 %x)
   launch((void*)Kernel::templateMemberKernel<A>);
 }

diff  --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
index fd46d3cce22e..8cae24db4540 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -67,7 +67,6 @@ typedef struct struct_of_structs_arg
     int i2;
 } struct_of_structs_arg_t;
 
-// CHECK: %union.transparent_u = type { i32 }
 typedef union
 {
   int b1;
@@ -237,7 +236,7 @@ __kernel void kernel_test_struct_of_arrays_arg(struct_of_arrays_arg_t arg1) { }
 // CHECK: void @kernel_struct_of_structs_arg(%struct.struct_of_structs_arg %arg1.coerce)
 __kernel void kernel_struct_of_structs_arg(struct_of_structs_arg_t arg1) { }
 
-// CHECK: void @test_kernel_transparent_union_arg(%union.transparent_u %u.coerce)
+// CHECK: void @test_kernel_transparent_union_arg(i32 %u.coerce)
 __kernel void test_kernel_transparent_union_arg(transparent_u u) { }
 
 // CHECK: void @kernel_single_array_element_struct_arg(%struct.single_array_element_struct_arg %arg1.coerce)


        


More information about the cfe-commits mailing list