[PATCH] R600: Parse OpenCL metadata

Mon Jun 30 15:51:42 PDT 2014

On 06/23/2014 08:52 AM, Tom Stellard wrote:
> On Sun, Jun 22, 2014 at 06:03:13AM +0000, Matt Arsenault wrote:
>> http://reviews.llvm.org/D4243
>>
>> Files:
>>    lib/Target/R600/AMDGPUAsmPrinter.cpp
>>    lib/Target/R600/AMDGPUMachineFunction.cpp
>>    lib/Target/R600/AMDGPUMachineFunction.h
>>    lib/Target/R600/SIDefines.h
>>    test/CodeGen/R600/reqd_work_group_size.ll
>> Index: lib/Target/R600/AMDGPUAsmPrinter.cpp
>> ===================================================================
>> --- lib/Target/R600/AMDGPUAsmPrinter.cpp
>> +++ lib/Target/R600/AMDGPUAsmPrinter.cpp
>> @@ -78,6 +78,7 @@
>>     EmitFunctionBody();
>>   
>>     if (isVerbose()) {
>> +    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
>>       const MCSectionELF *CommentSection
>>         = Context.getELFSection(".AMDGPU.csdata",
>>                                 ELF::SHT_PROGBITS, 0,
>> @@ -92,6 +93,15 @@
>>                                    false);
>>         OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
>>                                    false);
>> +
>> +      if (MFI->hasReqdWorkGroupSize()) {
>> +        OutStreamer.emitRawComment(" NumThreadX: " +
>> +                                   Twine(MFI->getReqdWorkGroupSize(0)), false);
>> +        OutStreamer.emitRawComment(" NumThreadY: " +
>> +                                   Twine(MFI->getReqdWorkGroupSize(1)), false);
>> +        OutStreamer.emitRawComment(" NumThreadZ: " +
>> +                                   Twine(MFI->getReqdWorkGroupSize(2)), false);
>> +      }
>>       } else {
>>         R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
>>         OutStreamer.emitRawComment(
>> @@ -288,7 +298,7 @@
>>                                            const SIProgramInfo &KernelInfo) {
>>     const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
>>   
>> -  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
>> +  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
>>     unsigned RsrcReg;
>>     switch (MFI->getShaderType()) {
>>     default: // Fall through
>> @@ -316,7 +326,22 @@
>>     if (MFI->getShaderType() == ShaderType::COMPUTE) {
>>       OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
>>       OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
>> +
>> +    if (MFI->hasReqdWorkGroupSize()) {
>> +      OutStreamer.EmitIntValue(R_00B81C_COMPUTE_NUM_THREAD_X, 4);
>> +      OutStreamer.EmitIntValue(
>> +        S_00B81C_NUM_THREAD_FULL(MFI->getReqdWorkGroupSize(0)), 4);
>> +
>> +      OutStreamer.EmitIntValue(R_00B820_COMPUTE_NUM_THREAD_Y, 4);
>> +      OutStreamer.EmitIntValue(
>> +        S_00B820_NUM_THREAD_FULL(MFI->getReqdWorkGroupSize(1)), 4);
>> +
>> +      OutStreamer.EmitIntValue(R_00B824_COMPUTE_NUM_THREAD_Z, 4);
>> +      OutStreamer.EmitIntValue(
>> +        S_00B824_NUM_THREAD_FULL(MFI->getReqdWorkGroupSize(2)), 4);
>> +    }
>>     }
>> +
>>     if (MFI->getShaderType() == ShaderType::PIXEL) {
>>       OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
>>       OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
>> Index: lib/Target/R600/AMDGPUMachineFunction.cpp
>> ===================================================================
>> --- lib/Target/R600/AMDGPUMachineFunction.cpp
>> +++ lib/Target/R600/AMDGPUMachineFunction.cpp
>> @@ -1,7 +1,11 @@
>>   #include "AMDGPUMachineFunction.h"
>>   #include "AMDGPU.h"
>> +#include "llvm/CodeGen/MachineModuleInfo.h"
>>   #include "llvm/IR/Attributes.h"
>> +#include "llvm/IR/Constants.h"
>>   #include "llvm/IR/Function.h"
>> +#include "llvm/IR/Module.h"
>> +
>>   using namespace llvm;
>>   
>>   static const char *const ShaderTypeAttribute = "ShaderType";
>> @@ -12,8 +16,14 @@
>>   AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
>>     MachineFunctionInfo(),
>>     ShaderType(ShaderType::COMPUTE),
>> +  IsKernel(false),
>> +  ReqdWorkGroupSize{0},
>> +  WorkGroupSizeHint{0},
>> +  LocalMemoryObjects(),
>>     LDSSize(0) {
>> -  AttributeSet Set = MF.getFunction()->getAttributes();
>> +  const Function *F = MF.getFunction();
>> +
>> +  AttributeSet Set = F->getAttributes();
>>     Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
>>                                    ShaderTypeAttribute);
>>   
>> @@ -22,4 +32,68 @@
>>       if (Str.getAsInteger(0, ShaderType))
>>         llvm_unreachable("Can't parse shader type!");
>>     }
>> +
>> +  const MachineModuleInfo &MMI = MF.getMMI();
>> +  const Module *M = MMI.getModule();
>> +
>> +  const NamedMDNode *Kernels = M->getNamedMetadata("opencl.kernels");
>> +  if (!Kernels)
>> +    return;
>> +
>> +  for (const MDNode *K : Kernels->operands()) {
>> +    unsigned N = K->getNumOperands();
>> +    if (N == 0)
>> +      continue;
>> +
>> +    // We expect the first operand to be the function.
>> +    const Value *First = K->getOperand(0);
>> +    if (First == F) {
>> +      IsKernel = true;
>> +      findOpenCLKernelAttributes(K);
>> +      break;
>> +    }
>> +  }
>> +}
>> +
>> +static void parseWorkgroupSize(uint32_t Size[3], const MDNode *Node) {
>> +  unsigned N = Node->getNumOperands();
>> +
>> +  for (unsigned I = 0; I < std::min(N - 1, 3u); ++I) {
>> +    const ConstantInt *C = dyn_cast<ConstantInt>(Node->getOperand(I + 1));
>> +    if (!C) {
>> +      // This is malformed, just give up.
>> +      Size[0] = 0;
>> +      Size[1] = 0;
>> +      Size[2] = 0;
>> +      return;
>> +    }
>> +
>> +    Size[I] = C->getZExtValue();
>> +  }
>> +}
>> +
>> +void AMDGPUMachineFunction::findOpenCLKernelAttributes(const MDNode *Node) {
>> +  for (unsigned I = 1, E = Node->getNumOperands(); I != E; ++I) {
>> +    const MDNode *Op = dyn_cast<MDNode>(Node->getOperand(I));
>> +    if (!Op)
>> +      continue;
>> +
>> +    unsigned N = Op->getNumOperands();
>> +    if (N == 0)
>> +      continue;
>> +
>> +    const MDString *NameNode = dyn_cast<MDString>(Op->getOperand(0));
>> +    if (!NameNode)
>> +      continue;
>> +
>> +    StringRef Name = NameNode->getName();
>> +
>> +    if (N == 4 && Name == "reqd_work_group_size")
>> +      parseWorkgroupSize(ReqdWorkGroupSize, Op);
>> +    else if (N == 4 && Name == "work_group_size_hint")
>> +      parseWorkgroupSize(WorkGroupSizeHint, Op);
>> +    else if (Name == "vec_type_hint") {
>> +      // TODO: Do we care about this at all?
>> +    }
>> +  }
> Could you move the attribute-parsing code to a util file so it can also be
> used by the AMDGPUPromoteAllocas pass?
>
> -Tom

I was considering making it an analysis pass. There are a variety of
places that might want to check for this.

>>   }
>> Index: lib/Target/R600/AMDGPUMachineFunction.h
>> ===================================================================
>> --- lib/Target/R600/AMDGPUMachineFunction.h
>> +++ lib/Target/R600/AMDGPUMachineFunction.h
>> @@ -19,8 +19,13 @@
>>   namespace llvm {
>>   
>>   class AMDGPUMachineFunction : public MachineFunctionInfo {
>> -  virtual void anchor();
>>     unsigned ShaderType;
>> +  bool IsKernel;
>> +  uint32_t ReqdWorkGroupSize[3];
>> +  uint32_t WorkGroupSizeHint[3];
>> +
>> +  virtual void anchor();
>> +  void findOpenCLKernelAttributes(const MDNode *);
>>   
>>   public:
>>     AMDGPUMachineFunction(const MachineFunction &MF);
>> @@ -33,6 +38,26 @@
>>     unsigned getShaderType() const {
>>       return ShaderType;
>>     }
>> +
>> +  bool isKernel() const {
>> +    return IsKernel;
>> +  }
>> +
>> +  uint32_t getReqdWorkGroupSize(unsigned I) const {
>> +    return ReqdWorkGroupSize[I];
>> +  }
>> +
>> +  uint32_t getWorkGroupSizeHint(unsigned I) const {
>> +    return WorkGroupSizeHint[I];
>> +  }
>> +
>> +  bool hasReqdWorkGroupSize() const {
>> +    return ReqdWorkGroupSize[0] != 0;
>> +  }
>> +
>> +  uint32_t getReqdWorkGroupSizeFlat() const {
>> +    return ReqdWorkGroupSize[0] * ReqdWorkGroupSize[1] * ReqdWorkGroupSize[2];
>> +  }
>>   };
>>   
>>   }
>> Index: lib/Target/R600/SIDefines.h
>> ===================================================================
>> --- lib/Target/R600/SIDefines.h
>> +++ lib/Target/R600/SIDefines.h
>> @@ -35,4 +35,33 @@
>>   #define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
>>   #define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
>>   
>> +
>> +#define R_00B804_COMPUTE_DIM_X                                          0x00B804
>> +#define R_00B808_COMPUTE_DIM_Y                                          0x00B808
>> +#define R_00B80C_COMPUTE_DIM_Z                                          0x00B80C
>> +#define R_00B810_COMPUTE_START_X                                        0x00B810
>> +#define R_00B814_COMPUTE_START_Y                                        0x00B814
>> +#define R_00B818_COMPUTE_START_Z                                        0x00B818
>> +#define R_00B81C_COMPUTE_NUM_THREAD_X                                   0x00B81C
>> +#define   S_00B81C_NUM_THREAD_FULL(x)                                 (((x) & 0xFFFF) << 0)
>> +#define   G_00B81C_NUM_THREAD_FULL(x)                                 (((x) >> 0) & 0xFFFF)
>> +#define   C_00B81C_NUM_THREAD_FULL                                    0xFFFF0000
>> +#define   S_00B81C_NUM_THREAD_PARTIAL(x)                              (((x) & 0xFFFF) << 16)
>> +#define   G_00B81C_NUM_THREAD_PARTIAL(x)                              (((x) >> 16) & 0xFFFF)
>> +#define   C_00B81C_NUM_THREAD_PARTIAL                                 0x0000FFFF
>> +#define R_00B820_COMPUTE_NUM_THREAD_Y                                   0x00B820
>> +#define   S_00B820_NUM_THREAD_FULL(x)                                 (((x) & 0xFFFF) << 0)
>> +#define   G_00B820_NUM_THREAD_FULL(x)                                 (((x) >> 0) & 0xFFFF)
>> +#define   C_00B820_NUM_THREAD_FULL                                    0xFFFF0000
>> +#define   S_00B820_NUM_THREAD_PARTIAL(x)                              (((x) & 0xFFFF) << 16)
>> +#define   G_00B820_NUM_THREAD_PARTIAL(x)                              (((x) >> 16) & 0xFFFF)
>> +#define   C_00B820_NUM_THREAD_PARTIAL                                 0x0000FFFF
>> +#define R_00B824_COMPUTE_NUM_THREAD_Z                                   0x00B824
>> +#define   S_00B824_NUM_THREAD_FULL(x)                                 (((x) & 0xFFFF) << 0)
>> +#define   G_00B824_NUM_THREAD_FULL(x)                                 (((x) >> 0) & 0xFFFF)
>> +#define   C_00B824_NUM_THREAD_FULL                                    0xFFFF0000
>> +#define   S_00B824_NUM_THREAD_PARTIAL(x)                              (((x) & 0xFFFF) << 16)
>> +#define   G_00B824_NUM_THREAD_PARTIAL(x)                              (((x) >> 16) & 0xFFFF)
>> +#define   C_00B824_NUM_THREAD_PARTIAL                                 0x0000FFFF
>> +
>>   #endif // SIDEFINES_H_
>> Index: test/CodeGen/R600/reqd_work_group_size.ll
>> ===================================================================
>> --- /dev/null
>> +++ test/CodeGen/R600/reqd_work_group_size.ll
>> @@ -0,0 +1,17 @@
>> +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
>> +
>> +; SI: NumThreadX: 32
>> +; SI: NumThreadY: 2
>> +; SI: NumThreadZ: 4
>> +define void @has_reqd_work_group_size(i32 addrspace(1)* nocapture %out) #0 {
>> +entry:
>> +  store i32 0, i32 addrspace(1)* %out, align 4
>> +  ret void
>> +}
>> +
>> +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
>> +
>> +!opencl.kernels = !{!0}
>> +
>> +!0 = metadata !{void (i32 addrspace(1)*)* @has_reqd_work_group_size, metadata !1}
>> +!1 = metadata !{metadata !"reqd_work_group_size", i32 32, i32 2, i32 4}
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits