[llvm] r301938 - AMDGPU: Refactor AsmPrinter

Tue May 2 10:14:00 PDT 2017

Author: arsenm
Date: Tue May  2 12:14:00 2017
New Revision: 301938

URL: http://llvm.org/viewvc/llvm-project?rev=301938&view=rev
Log:
AMDGPU: Refactor AsmPrinter

Avoid analyzing functions multiple times. This allows
asserting that each function is only analyzed once.

Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=301938&r1=301937&r2=301938&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp Tue May  2 12:14:00 2017
@@ -149,11 +149,9 @@ void AMDGPUAsmPrinter::EmitFunctionBodyS
     return;
 
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
-  SIProgramInfo KernelInfo;
   amd_kernel_code_t KernelCode;
   if (STM.isAmdCodeObjectV2(*MF)) {
-    getSIProgramInfo(KernelInfo, *MF);
-    getAmdKernelCode(KernelCode, KernelInfo, *MF);
+    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
 
     OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
     getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
@@ -187,7 +185,26 @@ void AMDGPUAsmPrinter::EmitGlobalVariabl
   AsmPrinter::EmitGlobalVariable(GV);
 }
 
+bool AMDGPUAsmPrinter::doFinalization(Module &M) {
+  CallGraphResourceInfo.clear();
+  return AsmPrinter::doFinalization(M);
+}
+
+// Print comments that apply to both callable functions and entry points.
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+  uint32_t NumVGPR,
+  uint32_t NumSGPR,
+  uint32_t ScratchSize,
+  uint64_t CodeSize) {
+  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
+  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+}
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  CurrentProgramInfo = SIProgramInfo();
+
   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
 
   // The starting address of all shader programs must be 256 bytes aligned.
@@ -204,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunct
     OutStreamer->SwitchSection(ConfigSection);
   }
 
-  SIProgramInfo KernelInfo;
   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-    getSIProgramInfo(KernelInfo, MF);
+    if (MFI->isEntryFunction()) {
+      getSIProgramInfo(CurrentProgramInfo, MF);
+    } else {
+      auto I = CallGraphResourceInfo.insert(
+        std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
+      SIFunctionResourceInfo &Info = I.first->second;
+      assert(I.second && "should only be called once per function");
+      Info = analyzeResourceUsage(MF);
+    }
+
     if (!STM.isAmdHsaOS()) {
-      EmitProgramInfoSI(MF, KernelInfo);
+      EmitProgramInfoSI(MF, CurrentProgramInfo);
     }
   } else {
     EmitProgramInfoR600(MF);
@@ -226,72 +251,87 @@ bool AMDGPUAsmPrinter::runOnMachineFunct
     OutStreamer->SwitchSection(CommentSection);
 
     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
-      if (MFI->isEntryFunction()) {
-        OutStreamer->emitRawComment(" Kernel info:", false);
-      } else {
+      if (!MFI->isEntryFunction()) {
         OutStreamer->emitRawComment(" Function info:", false);
+        SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
+        emitCommonFunctionComments(
+          Info.NumVGPR,
+          Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
+          Info.PrivateSegmentSize,
+          getFunctionCodeSize(MF));
+        return false;
       }
 
+      OutStreamer->emitRawComment(" Kernel info:", false);
+      emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+                                 CurrentProgramInfo.NumSGPR,
+                                 CurrentProgramInfo.ScratchSize,
+                                 getFunctionCodeSize(MF));
+
       OutStreamer->emitRawComment(" codeLenInByte = " +
                                   Twine(getFunctionCodeSize(MF)), false);
-      OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
-                                  false);
-      OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
-                                  false);
-
-      OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
-                                  false);
-      OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
-                                  false);
-      OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
-                                  false);
-      OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
-                                  " bytes/workgroup (compile time only)", false);
+      OutStreamer->emitRawComment(
+        " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);
+      OutStreamer->emitRawComment(
+        " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);
 
-      if (!MFI->isEntryFunction())
-        return false;
+      OutStreamer->emitRawComment(
+        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+      OutStreamer->emitRawComment(
+        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+      OutStreamer->emitRawComment(
+        " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);
+      OutStreamer->emitRawComment(
+        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+        " bytes/workgroup (compile time only)", false);
 
-      OutStreamer->emitRawComment(" SGPRBlocks: " +
-                                  Twine(KernelInfo.SGPRBlocks), false);
-      OutStreamer->emitRawComment(" VGPRBlocks: " +
-                                  Twine(KernelInfo.VGPRBlocks), false);
-
-      OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
-                                  Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
-      OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
-                                  Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
-
-      OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
-                                  false);
-      OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
-                                  false);
+      OutStreamer->emitRawComment(
+        " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+      OutStreamer->emitRawComment(
+        " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+
+      OutStreamer->emitRawComment(
+        " NumSGPRsForWavesPerEU: " +
+        Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+      OutStreamer->emitRawComment(
+        " NumVGPRsForWavesPerEU: " +
+        Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+
+      OutStreamer->emitRawComment(
+        " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
+        false);
+      OutStreamer->emitRawComment(
+        " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
+        false);
 
       if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
-        OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
-                                    Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
-        OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
-                                    Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+        OutStreamer->emitRawComment(
+          " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+          Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+        OutStreamer->emitRawComment(
+          " DebuggerPrivateSegmentBufferSGPR: s" +
+          Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
       }
 
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
-                                  Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
-                                  Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
-                                  Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
-                                  Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
-                                  Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
-                                  Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
-                                  false);
-
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+        Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+        Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+        Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+        Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+        Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+      OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+        Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+        false);
     } else {
       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       OutStreamer->emitRawComment(
@@ -407,71 +447,117 @@ static bool hasAnyNonFlatUseOfReg(const
   return false;
 }
 
-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
-                                        const MachineFunction &MF) const {
-  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const SIInstrInfo *TII = STM.getInstrInfo();
-  const SIRegisterInfo *RI = &TII->getRegisterInfo();
+static unsigned getNumExtraSGPRs(const SISubtarget &ST,
+                                 bool VCCUsed,
+                                 bool FlatScrUsed) {
+  unsigned ExtraSGPRs = 0;
+  if (VCCUsed)
+    ExtraSGPRs = 2;
 
+  if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
+    if (FlatScrUsed)
+      ExtraSGPRs = 4;
+  } else {
+    if (ST.isXNACKEnabled())
+      ExtraSGPRs = 4;
 
-  MCPhysReg NumVGPRReg = AMDGPU::NoRegister;
-  for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
-    if (MRI.isPhysRegUsed(Reg)) {
-      NumVGPRReg = Reg;
-      break;
-    }
+    if (FlatScrUsed)
+      ExtraSGPRs = 6;
   }
 
-  MCPhysReg NumSGPRReg = AMDGPU::NoRegister;
-  for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
-    if (MRI.isPhysRegUsed(Reg)) {
-      NumSGPRReg = Reg;
-      break;
-    }
-  }
+  return ExtraSGPRs;
+}
 
-  // We found the maximum register index. They start at 0, so add one to get the
-  // number of registers.
-  ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 :
-    RI->getHWRegIndex(NumVGPRReg) + 1;
-  ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 :
-    RI->getHWRegIndex(NumSGPRReg) + 1;
-  unsigned ExtraSGPRs = 0;
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
+  const SISubtarget &ST) const {
+  return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+}
 
-  ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
-                     MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-  if (ProgInfo.VCCUsed)
-    ExtraSGPRs = 2;
+AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
+  const MachineFunction &MF) const {
+  SIFunctionResourceInfo Info;
 
-  ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
-                      MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
 
   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
-  // instructions aren't used to access the scratch buffer. Inline assembly
-  // may need it though.
+  // instructions aren't used to access the scratch buffer. Inline assembly may
+  // need it though.
   //
   // If we only have implicit uses of flat_scr on flat instructions, it is not
   // really needed.
-  if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() &&
+  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
-    ProgInfo.FlatUsed = false;
+    Info.UsesFlatScratch = false;
   }
 
-  if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
-    if (ProgInfo.FlatUsed)
-      ExtraSGPRs = 4;
-  } else {
-    if (STM.isXNACKEnabled())
-      ExtraSGPRs = 4;
+  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+  Info.PrivateSegmentSize = FrameInfo.getStackSize();
 
-    if (ProgInfo.FlatUsed)
-      ExtraSGPRs = 6;
+  if (!FrameInfo.hasCalls()) {
+    Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+                   MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+    // If there are no calls, MachineRegisterInfo can tell us the used register
+    // count easily.
+
+    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+      if (MRI.isPhysRegUsed(Reg)) {
+        HighestVGPRReg = Reg;
+        break;
+      }
+    }
+
+    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+      if (MRI.isPhysRegUsed(Reg)) {
+        HighestSGPRReg = Reg;
+        break;
+      }
+    }
+
+    // We found the maximum register index. They start at 0, so add one to get the
+    // number of registers.
+    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+      TRI.getHWRegIndex(HighestVGPRReg) + 1;
+    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+      TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+    return Info;
   }
 
+  llvm_unreachable("calls not implemented");
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+                                        const MachineFunction &MF) {
+  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+
+  ProgInfo.NumVGPR = Info.NumVGPR;
+  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
+  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
+  ProgInfo.VCCUsed = Info.UsesVCC;
+  ProgInfo.FlatUsed = Info.UsesFlatScratch;
+  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+
+  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const SIInstrInfo *TII = STM.getInstrInfo();
+  const SIRegisterInfo *RI = &TII->getRegisterInfo();
+
+  unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
+                                         ProgInfo.VCCUsed,
+                                         ProgInfo.FlatUsed);
   unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
 
   // Check the addressable register limit before we add ExtraSGPRs.
@@ -574,9 +660,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(
   // Make clamp modifier on NaN input returns 0.
   ProgInfo.DX10Clamp = STM.enableDX10Clamp();
 
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
-  ProgInfo.ScratchSize = FrameInfo.getStackSize();
-
   unsigned LDSAlignShift;
   if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
     // LDS is allocated in 64 dword blocks.
@@ -646,7 +729,7 @@ static unsigned getRsrcReg(CallingConv::
 }
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
-                                         const SIProgramInfo &KernelInfo) {
+                                         const SIProgramInfo &CurrentProgramInfo) {
   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
@@ -654,29 +737,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI
   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
     OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 
-    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
 
     OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
-    OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
 
     OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
-    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
   } else {
     OutStreamer->EmitIntValue(RsrcReg, 4);
-    OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
-                              S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
     if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
-      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
     }
   }
 
   if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
-    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
     OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
@@ -704,7 +787,7 @@ static amd_element_byte_size_t getElemen
 }
 
 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
-                                        const SIProgramInfo &KernelInfo,
+                                        const SIProgramInfo &CurrentProgramInfo,
                                         const MachineFunction &MF) const {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
@@ -712,10 +795,13 @@ void AMDGPUAsmPrinter::getAmdKernelCode(
   AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
 
   Out.compute_pgm_resource_registers =
-      KernelInfo.ComputePGMRSrc1 |
-      (KernelInfo.ComputePGMRSrc2 << 32);
+      CurrentProgramInfo.ComputePGMRSrc1 |
+      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
   Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
 
+  if (CurrentProgramInfo.DynamicCallStack)
+    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
+
   AMD_HSA_BITS_SET(Out.code_properties,
                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
@@ -767,12 +853,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(
   // FIXME: Should use getKernArgSize
   Out.kernarg_segment_byte_size =
     STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
-  Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
-  Out.workitem_vgpr_count = KernelInfo.NumVGPR;
-  Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
-  Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
-  Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
-  Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
+  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
+  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
+  Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
+  Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
 
   // These alignment values are specified in powers of two, so alignment =
   // 2^n.  The minimum alignment is 2^4 = 16.
@@ -781,9 +867,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(
 
   if (STM.debuggerEmitPrologue()) {
     Out.debug_wavefront_private_segment_offset_sgpr =
-      KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+      CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
     Out.debug_private_segment_buffer_sgpr =
-      KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+      CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
   }
 }
 

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h?rev=301938&r1=301937&r2=301938&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h Tue May  2 12:14:00 2017
@@ -30,9 +30,26 @@ namespace llvm {
 
 class AMDGPUTargetStreamer;
 class MCOperand;
+class SISubtarget;
 
 class AMDGPUAsmPrinter final : public AsmPrinter {
 private:
+  // Track resource usage for callee functions.
+  struct SIFunctionResourceInfo {
+    // Track the number of explicitly used VGPRs. Special registers reserved at
+    // the end are tracked separately.
+    int32_t NumVGPR = 0;
+    int32_t NumExplicitSGPR = 0;
+    uint32_t PrivateSegmentSize = 0;
+    bool UsesVCC = false;
+    bool UsesFlatScratch = false;
+    bool HasDynamicallySizedStack = false;
+    bool HasRecursion = false;
+
+    int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
+  };
+
+  // Track resource usage for kernels / entry functions.
   struct SIProgramInfo {
     // Fields set in PGM_RSRC1 pm4 packet.
     uint32_t VGPRBlocks = 0;
@@ -83,14 +100,23 @@ private:
     uint16_t DebuggerPrivateSegmentBufferSGPR =
         std::numeric_limits<uint16_t>::max();
 
+    // Whether there is recursion, dynamic allocas, indirect calls or some other
+    // reason there may be statically unknown stack usage.
+    bool DynamicCallStack = false;
+
     // Bonus information for debugging.
     bool VCCUsed = false;
 
     SIProgramInfo() = default;
   };
 
+  SIProgramInfo CurrentProgramInfo;
+  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
-  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
+
+  void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
   void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
                         const MachineFunction &MF) const;
   void findNumUsedRegistersSI(const MachineFunction &MF,
@@ -101,6 +127,10 @@ private:
   /// can correctly setup the GPU state.
   void EmitProgramInfoR600(const MachineFunction &MF);
   void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+  void emitCommonFunctionComments(uint32_t NumVGPR,
+                                  uint32_t NumSGPR,
+                                  uint32_t ScratchSize,
+                                  uint64_t CodeSize);
 
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
@@ -112,6 +142,7 @@ public:
 
   AMDGPUTargetStreamer& getTargetStreamer() const;
 
+  bool doFinalization(Module &M) override;
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated