PATCHES: R600/SI: CodeGen patches for HSA Runtime

Thu Nov 6 15:01:53 PST 2014

On 11/06/2014 02:23 PM, Tom Stellard wrote:
> Hi,
>
> Attached are patches to enable the R600 backend to emit code for the HSA runtime.
> Please review.
>
> -Tom
>
> 0001-Triple-Add-AMDGPU-evironment-type.patch
>
>
>  From 01af45d2e2f79add5047e53738e6e4eff77354b0 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 22 Oct 2014 20:07:50 -0400
> Subject: [PATCH 1/5] Triple: Add AMDGPU environment type
>
> This will be used to tell the R600 backend how to emit program data
> in its object files.
> ---
>   include/llvm/ADT/Triple.h | 1 +
>   lib/Support/Triple.cpp    | 2 ++
>   2 files changed, 3 insertions(+)
>
> diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
> index 4432390..c1b7882 100644
> --- a/include/llvm/ADT/Triple.h
> +++ b/include/llvm/ADT/Triple.h
> @@ -155,6 +155,7 @@ public:
>       MSVC,
>       Itanium,
>       Cygnus,
> +    AMDGPU
>     };
>     enum ObjectFormatType {
>       UnknownObjectFormat,
> diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
> index 7a9dc39..c9fc785 100644
> --- a/lib/Support/Triple.cpp
> +++ b/lib/Support/Triple.cpp
> @@ -176,6 +176,7 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) {
>     case MSVC: return "msvc";
>     case Itanium: return "itanium";
>     case Cygnus: return "cygnus";
> +  case AMDGPU: return "amdgpu";
>     }
>   
>     llvm_unreachable("Invalid EnvironmentType!");
> @@ -326,6 +327,7 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
>       .StartsWith("msvc", Triple::MSVC)
>       .StartsWith("itanium", Triple::Itanium)
>       .StartsWith("cygnus", Triple::Cygnus)
> +    .StartsWith("amdgpu", Triple::AMDGPU)
>       .Default(Triple::UnknownEnvironment);
>   }
>   
> -- 1.8.5.5
>
> 0002-Triple-Add-AMDHSA-operating-system-type.patch
>
>
>  From 7e11a4bf46b839936124c4a24defcf557c51c64e Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 5 Nov 2014 11:50:40 -0500
> Subject: [PATCH 2/5] Triple: Add AMDHSA operating system type
>
> This operating system type represents the AMD HSA runtime,
> and will be required by the R600 backend in order to generate
> correct code for this runtime.
> ---
>   include/llvm/ADT/Triple.h | 3 ++-
>   lib/Support/Triple.cpp    | 2 ++
>   2 files changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
> index c1b7882..b46bce8 100644
> --- a/include/llvm/ADT/Triple.h
> +++ b/include/llvm/ADT/Triple.h
> @@ -138,7 +138,8 @@ public:
>       Bitrig,
>       AIX,
>       CUDA,       // NVIDIA CUDA
> -    NVCL        // NVIDIA OpenCL
> +    NVCL,       // NVIDIA OpenCL
> +    AMDHSA      // AMD HSA Runtime
>     };
>     enum EnvironmentType {
>       UnknownEnvironment,
> diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
> index c9fc785..dbcebe8 100644
> --- a/lib/Support/Triple.cpp
> +++ b/lib/Support/Triple.cpp
> @@ -157,6 +157,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
>     case AIX: return "aix";
>     case CUDA: return "cuda";
>     case NVCL: return "nvcl";
> +  case AMDHSA: return "amdhsa";
>     }
>   
>     llvm_unreachable("Invalid OSType");
> @@ -311,6 +312,7 @@ static Triple::OSType parseOS(StringRef OSName) {
>       .StartsWith("aix", Triple::AIX)
>       .StartsWith("cuda", Triple::CUDA)
>       .StartsWith("nvcl", Triple::NVCL)
> +    .StartsWith("amdhsa", Triple::AMDHSA)
>       .Default(Triple::UnknownOS);
>   }
>   
> -- 1.8.5.5
>
> 0003-R600-SI-Set-the-ATC-bit-on-all-resource-descriptors-.patch
>
>
>  From 2128345551c8830a1782536df28225ae2c9bb506 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Mon, 3 Nov 2014 15:40:13 -0500
> Subject: [PATCH 3/5] R600/SI: Set the ATC bit on all resource descriptors for
>   the HSA runtime
>
> ---
>   lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 7 +++++++
>   lib/Target/R600/AMDGPUSubtarget.cpp    | 3 ++-
>   lib/Target/R600/AMDGPUSubtarget.h      | 4 ++++
>   3 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> index aa9062d..a680507 100644
> --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> @@ -952,6 +952,13 @@ static SDValue buildSMovImm32(SelectionDAG *DAG, SDLoc DL, uint64_t Val) {
>   static SDValue buildRSRC(SelectionDAG *DAG, SDLoc DL, SDValue Ptr,
>                            uint32_t RsrcDword1, uint64_t RsrcDword2And3) {
>   
> +  const AMDGPUSubtarget &ST = DAG->getTarget().getSubtarget<AMDGPUSubtarget>();
> +
> +  if (ST.isAmdHsaOS()) {
> +    // HSA requires the ATC bit to be set for all resource descriptors.
> +    RsrcDword2And3 |= (1ULL << 56);
> +  }
> +
Should this constant (1ULL << 56, the ATC bit) be given a name and moved to an HSADefines header?

>     SDValue PtrLo = DAG->getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
>     SDValue PtrHi = DAG->getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
>     if (RsrcDword1) {
> diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
> index 9d09a19..0d693c8 100644
> --- a/lib/Target/R600/AMDGPUSubtarget.cpp
> +++ b/lib/Target/R600/AMDGPUSubtarget.cpp
> @@ -84,7 +84,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef GPU, StringRef FS,
>         FrameLowering(TargetFrameLowering::StackGrowsUp,
>                       64 * 16, // Maximum stack alignment (long16)
>                       0),
> -      InstrItins(getInstrItineraryForCPU(GPU)) {
> +      InstrItins(getInstrItineraryForCPU(GPU)),
> +      TargetTriple(TT) {
>     if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
>       InstrInfo.reset(new R600InstrInfo(*this));
>       TLInfo.reset(new R600TargetLowering(TM));
> diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
> index 55a0c58..ff37932 100644
> --- a/lib/Target/R600/AMDGPUSubtarget.h
> +++ b/lib/Target/R600/AMDGPUSubtarget.h
> @@ -70,6 +70,7 @@ private:
>     std::unique_ptr<AMDGPUTargetLowering> TLInfo;
>     std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
>     InstrItineraryData InstrItins;
> +  Triple TargetTriple;
>   
>   public:
>     AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS, TargetMachine &TM);
> @@ -219,6 +220,9 @@ public:
>     bool r600ALUEncoding() const {
>       return R600ALUInst;
>     }
> +  bool isAmdHsaOS() const {
> +    return TargetTriple.getOS() == Triple::AMDHSA;
> +  }
>   };
>   
>   } // End namespace llvm
> -- 1.8.5.5
>
> 0004-R600-SI-Move-more-information-into-SIProgramInfo-str.patch
>
>
>  From d67699d6c50a2eb2159239d2e431e11cc5d3e817 Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 22 Oct 2014 20:10:30 -0400
> Subject: [PATCH 4/5] R600/SI: Move more information into SIProgramInfo struct
>
> ---
>   lib/Target/R600/AMDGPUAsmPrinter.cpp | 85 ++++++++++++++++++------------------
>   lib/Target/R600/AMDGPUAsmPrinter.h   | 25 +++++++++--
>   2 files changed, 64 insertions(+), 46 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> index 5511d7c..aa29997 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> @@ -240,6 +240,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
>   
>   void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
>                                           const MachineFunction &MF) const {
> +  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
> +  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
>     uint64_t CodeSize = 0;
>     unsigned MaxSGPR = 0;
>     unsigned MaxVGPR = 0;
> @@ -340,6 +342,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
>     ProgInfo.NumVGPR = MaxVGPR + 1;
>     ProgInfo.NumSGPR = MaxSGPR + 1;
>   
> +  ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
> +  ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
>     // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
>     // register.
>     ProgInfo.FloatMode = getFPMode(MF);
> @@ -356,21 +360,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
>     ProgInfo.FlatUsed = FlatUsed;
>     ProgInfo.VCCUsed = VCCUsed;
>     ProgInfo.CodeLen = CodeSize;
> -}
>   
> -void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
> -                                         const SIProgramInfo &KernelInfo) {
> -  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
> -  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
> -
> -  unsigned RsrcReg;
> -  switch (MFI->getShaderType()) {
> -  default: // Fall through
> -  case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
> -  case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
> -  case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
> -  case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
> -  }
> +  ProgInfo.ComputePGMRSrc1 =
> +      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
> +      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
> +      S_00B848_PRIORITY(ProgInfo.Priority) |
> +      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
> +      S_00B848_PRIV(ProgInfo.Priv) |
> +      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
> +      S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
> +      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
>   
>     unsigned LDSAlignShift;
>     if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
> @@ -384,58 +383,60 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
>     unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
>                             MFI->getMaximumWorkGroupSize(MF);
>   
> -  unsigned LDSBlocks =
> -     RoundUpToAlignment(MFI->LDSSize + LDSSpillSize,
> -	                      1 << LDSAlignShift) >> LDSAlignShift;
> +  ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
> +  ProgInfo.LDSBlocks =
> +     RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
>   
>     // Scratch is allocated in 256 dword blocks.
>     unsigned ScratchAlignShift = 10;
>     // We need to program the hardware with the amount of scratch memory that
> -  // is used by the entire wave.  KernelInfo.ScratchSize is the amount of
> +  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
>     // scratch memory used per thread.
> -  unsigned ScratchBlocks =
> -    RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(),
> +  ProgInfo.ScratchBlocks =
> +    RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(),
>                          1 << ScratchAlignShift) >> ScratchAlignShift;
>   
> -  unsigned VGPRBlocks = (KernelInfo.NumVGPR - 1) / 4;
> -  unsigned SGPRBlocks = (KernelInfo.NumSGPR - 1) / 8;
> +  ProgInfo.ComputePGMRSrc2 =
> +      S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
> +      S_00B02C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0);
> +}
> +
> +void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
> +                                         const SIProgramInfo &KernelInfo) {
> +  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
> +
> +  unsigned RsrcReg;
> +  switch (MFI->getShaderType()) {
> +  default: // Fall through
> +  case ShaderType::COMPUTE:  RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break;
> +  case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break;
> +  case ShaderType::PIXEL:    RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break;
> +  case ShaderType::VERTEX:   RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break;
> +  }
> +
>   
>     if (MFI->getShaderType() == ShaderType::COMPUTE) {
>       OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
>   
> -    const uint32_t ComputePGMRSrc1 =
> -      S_00B848_VGPRS(VGPRBlocks) |
> -      S_00B848_SGPRS(SGPRBlocks) |
> -      S_00B848_PRIORITY(KernelInfo.Priority) |
> -      S_00B848_FLOAT_MODE(KernelInfo.FloatMode) |
> -      S_00B848_PRIV(KernelInfo.Priv) |
> -      S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) |
> -      S_00B848_IEEE_MODE(KernelInfo.DebugMode) |
> -      S_00B848_IEEE_MODE(KernelInfo.IEEEMode);
> -
> -    OutStreamer.EmitIntValue(ComputePGMRSrc1, 4);
> +    OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
>   
>       OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
> -    const uint32_t ComputePGMRSrc2 =
> -      S_00B84C_LDS_SIZE(LDSBlocks) |
> -      S_00B02C_SCRATCH_EN(ScratchBlocks > 0);
> -
> -    OutStreamer.EmitIntValue(ComputePGMRSrc2, 4);
> +    OutStreamer.EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
>   
>       OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
> -    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4);
> +    OutStreamer.EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
>   
>       // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
>       // 0" comment but I don't see a corresponding field in the register spec.
>     } else {
>       OutStreamer.EmitIntValue(RsrcReg, 4);
> -    OutStreamer.EmitIntValue(S_00B028_VGPRS(VGPRBlocks) |
> -                             S_00B028_SGPRS(SGPRBlocks), 4);
> +    OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
> +                             S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
>     }
>   
>     if (MFI->getShaderType() == ShaderType::PIXEL) {
>       OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
> -    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
> +    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
>       OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
>       OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
>     }
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
> index b9a0767..61f86d6 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.h
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.h
> @@ -24,8 +24,8 @@ class AMDGPUAsmPrinter : public AsmPrinter {
>   private:
>     struct SIProgramInfo {
>       SIProgramInfo() :
> -      NumVGPR(0),
> -      NumSGPR(0),
> +      VGPRBlocks(0),
> +      SGPRBlocks(0),
>         Priority(0),
>         FloatMode(0),
>         Priv(0),
> @@ -33,13 +33,19 @@ private:
>         DebugMode(0),
>         IEEEMode(0),
>         ScratchSize(0),
> +      ComputePGMRSrc1(0),
> +      LDSBlocks(0),
> +      ScratchBlocks(0),
> +      ComputePGMRSrc2(0),
> +      NumVGPR(0),
> +      NumSGPR(0),
>         FlatUsed(false),
>         VCCUsed(false),
>         CodeLen(0) {}
>   
>       // Fields set in PGM_RSRC1 pm4 packet.
> -    uint32_t NumVGPR;
> -    uint32_t NumSGPR;
> +    uint32_t VGPRBlocks;
> +    uint32_t SGPRBlocks;
>       uint32_t Priority;
>       uint32_t FloatMode;
>       uint32_t Priv;
> @@ -48,6 +54,17 @@ private:
>       uint32_t IEEEMode;
>       uint32_t ScratchSize;
>   
> +    uint32_t ComputePGMRSrc1;
> +
> +    // Fields set in PGM_RSRC2 pm4 packet.
> +    uint32_t LDSBlocks;
> +    uint32_t ScratchBlocks;
> +
> +    uint32_t ComputePGMRSrc2;
> +
> +    uint32_t NumVGPR;
> +    uint32_t NumSGPR;
> +    uint32_t LDSSize;
>       bool FlatUsed;
>   
>       // Bonus information for debugging.
> -- 1.8.5.5
>
> 0005-R600-SI-Emit-amd_kernel_code_t-header-for-AMDGPU-env.patch
>
>
>  From b5947e39f6d07f7202cfdc6899fdbc3528f0e4cf Mon Sep 17 00:00:00 2001
> From: Tom Stellard<thomas.stellard at amd.com>
> Date: Wed, 22 Oct 2014 20:11:40 -0400
> Subject: [PATCH 5/5] R600/SI: Emit amd_kernel_code_t header for AMDGPU
>   environment
>
> ---
>   lib/Target/R600/AMDGPUAsmPrinter.cpp |  65 +++-
>   lib/Target/R600/AMDGPUAsmPrinter.h   |   2 +
>   lib/Target/R600/AMDGPUSubtarget.cpp  |   7 +
>   lib/Target/R600/AMDGPUSubtarget.h    |   6 +
>   lib/Target/R600/AMDKernelCodeT.h     | 692 +++++++++++++++++++++++++++++++++++
>   5 files changed, 771 insertions(+), 1 deletion(-)
>   create mode 100644 lib/Target/R600/AMDKernelCodeT.h
>
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> index aa29997..b78c5029 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> @@ -18,6 +18,7 @@
>   
>   #include "AMDGPUAsmPrinter.h"
>   #include "AMDGPU.h"
> +#include "AMDKernelCodeT.h"
>   #include "AMDGPUSubtarget.h"
>   #include "R600Defines.h"
>   #include "R600MachineFunctionInfo.h"
> @@ -109,11 +110,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
>     const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
>                                                 ELF::SHT_PROGBITS, 0,
>                                                 SectionKind::getReadOnly());
> +
>     OutStreamer.SwitchSection(ConfigSection);
>   
>     const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
>     SIProgramInfo KernelInfo;
> -  if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
> +  if (STM.isAMDGPUEnv()) {
> +    OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
> +    getSIProgramInfo(KernelInfo, MF);
> +    EmitAmdKernelCodeT(MF, KernelInfo);
> +    OutStreamer.EmitCodeAlignment(2 << (MF.getAlignment() - 1));
> +  } else if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
Can you change this to >= AMDGPUSubtarget::SOUTHERN_ISLANDS to make it clearer?
>       getSIProgramInfo(KernelInfo, MF);
>       EmitProgramInfoSI(MF, KernelInfo);
>     } else {
> @@ -441,3 +448,59 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
>       OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
>     }
>   }
> +
> +void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
> +                                        const SIProgramInfo &KernelInfo) const {
> +  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
> +  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
> +  amd_kernel_code_t header;
> +
> +  memset(&header, 0, sizeof(header));
> +
> +  header.amd_code_version_major = AMD_CODE_VERSION_MAJOR;
> +  header.amd_code_version_minor = AMD_CODE_VERSION_MINOR;
> +
> +  header.struct_byte_size = sizeof(amd_kernel_code_t);
> +
> +  header.target_chip = STM.getAmdKernelCodeChipID();
> +
> +  header.kernel_code_entry_byte_offset = (1 << MF.getAlignment());
> +
> +  header.compute_pgm_resource_registers =
> +      KernelInfo.ComputePGMRSrc1 |
> +      (((uint64_t)KernelInfo.ComputePGMRSrc2) << 32);
> +
> +  // Code Properties:
> +  header.code_properties = AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
> +                           AMD_CODE_PROPERTY_IS_PTR64;
> +
> +  if (KernelInfo.FlatUsed)
> +    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
> +
> +  if (KernelInfo.ScratchBlocks)
> +    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
> +
> +  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
> +  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
> +
> +  // MFI->ABIArgOffset is the number of bytes for the kernel arguments
> +  // plus 36.  36 is the number of bytes reserved at the beginning of the
> +  // input buffer to store work-group size information.
> +  // FIXME: We should be adding the size of the implicit arguments
> +  // to this value.
> +  header.kernarg_segment_byte_size = MFI->ABIArgOffset;
> +
> +  header.wavefront_sgpr_count = KernelInfo.NumVGPR;
> +  header.workitem_vgpr_count = KernelInfo.NumSGPR;
> +
> +  // FIXME: What values do I put for these alignments
> +  header.kernarg_segment_alignment = 0;
> +  header.group_segment_alignment = 0;
> +  header.private_segment_alignment = 0;
According to the output of SC on a random kernel, these alignments are:
kernarg_segment_alignment                      = 3
group_segment_alignment                        = 3
private_segment_alignment                      = 4

We should probably also print the same metadata in the text output.
> +
> +  header.code_type = 1; // HSA_EXT_CODE_KERNEL
> +
> +  header.wavefront_size = STM.getWavefrontSize();
> +
> +  OutStreamer.EmitBytes(StringRef((char*)&header, sizeof(header)));
> +}
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
> index 61f86d6..5bfbf73 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.h
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.h
> @@ -81,6 +81,8 @@ private:
>     /// can correctly setup the GPU state.
>     void EmitProgramInfoR600(const MachineFunction &MF);
>     void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
> +  void EmitAmdKernelCodeT(const MachineFunction &MF,
> +                          const SIProgramInfo &KernelInfo) const;
>   
>   public:
>     explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
> diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
> index 0d693c8..a805188 100644
> --- a/lib/Target/R600/AMDGPUSubtarget.cpp
> +++ b/lib/Target/R600/AMDGPUSubtarget.cpp
> @@ -108,3 +108,10 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const {
>       llvm_unreachable("Illegal wavefront size.");
>     }
>   }
> +
> +unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
> +  switch(getGeneration()) {
> +  default: llvm_unreachable("ChipID unknown");
> +  case SEA_ISLANDS: return 12;
> +  }
> +}
> diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
> index ff37932..9fe8e3d 100644
> --- a/lib/Target/R600/AMDGPUSubtarget.h
> +++ b/lib/Target/R600/AMDGPUSubtarget.h
> @@ -201,6 +201,12 @@ public:
>       return LocalMemorySize;
>     }
>   
> +  bool isAMDGPUEnv() const {
> +    return TargetTriple.getEnvironment() == Triple::AMDGPU;
> +  }
> +
> +  unsigned getAmdKernelCodeChipID() const;
> +
>     bool enableMachineScheduler() const override {
>       return getGeneration() <= NORTHERN_ISLANDS;
>     }
>

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20141106/31bbe86f/attachment.html>