[llvm] 935abab - AMDGPU: Use module level register maximums for unknown callees

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 4 12:56:08 PST 2022


Author: Matt Arsenault
Date: 2022-02-04T15:56:03-05:00
New Revision: 935abab65cafb509f60e76bd7255dfe03befde85

URL: https://github.com/llvm/llvm-project/commit/935abab65cafb509f60e76bd7255dfe03befde85
DIFF: https://github.com/llvm/llvm-project/commit/935abab65cafb509f60e76bd7255dfe03befde85.diff

LOG: AMDGPU: Use module level register maximums for unknown callees

Compute the theoretical register budget based on the IR function
signature/attributes, and use the global maximum register budgets for
unknown callees.

This should fix the kernel's reported register usage in the presence of
indirect calls. The previous fix in
2b08f6af62afbf32e89a6a392dbafa92c62f7bdf was incorrect because it only
took the maximum over the known call graph, and could miss a callee
that was either outside of it or codegened later.

This also fixes a second case I discovered, where calls to aliases did
not work as expected. CallGraphAnalysis misses these, so functions
called through aliases were not codegened ahead of their callers.
CallGraphAnalysis should probably be fixed to understand this case, and
there is likely a related bug with IPRA here. This fixes numerous
failures in the conformance test at -O0.
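
For reference, a minimal sketch of the alias pattern that
CallGraphAnalysis misses, adapted from the added
call-alias-register-usage0.ll test (attributes and RUN lines omitted
for brevity; the symbol names and the clobbered register are only
illustrative):

    @alias0 = hidden alias void (), void ()* @aliasee_default_vgpr64_sgpr102

    ; CallGraphAnalysis does not look through @alias0, so the aliasee may
    ; be codegened after the kernel. The kernel therefore has to assume
    ; the module-wide worst-case register budget for this call.
    define amdgpu_kernel void @kernel0() {
    bb:
      call void @alias0()
      ret void
    }

    define internal void @aliasee_default_vgpr64_sgpr102() {
    bb:
      call void asm sideeffect "; clobber v52 ", "~{v52}"()
      ret void
    }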

Added: 
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
    llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
    llvm/test/CodeGen/AMDGPU/indirect-call.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index cb511e5e34839..71f022079fa09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -25,6 +25,7 @@
 
 #include "AMDGPUResourceUsageAnalysis.h"
 #include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/CallGraph.h"
@@ -102,8 +103,7 @@ bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
   if (!TPC)
     return false;
 
-  const TargetMachine &TM = TPC->getTM<TargetMachine>();
-  bool HasIndirectCall = false;
+  TM = static_cast<const GCNTargetMachine *>(&TPC->getTM<TargetMachine>());
 
   for (CallGraphNode *I : SCC) {
     Function *F = I->getFunction();
@@ -118,19 +118,14 @@ bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
         std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
     SIFunctionResourceInfo &Info = CI.first->second;
     assert(CI.second && "should only be called once per function");
-    Info = analyzeResourceUsage(MF, TM);
-    HasIndirectCall |= Info.HasIndirectCall;
+    Info = analyzeResourceUsage(MF);
   }
 
-  if (HasIndirectCall)
-    propagateIndirectCallRegisterUsage();
-
   return false;
 }
 
 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
-AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
-    const MachineFunction &MF, const TargetMachine &TM) const {
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(const MachineFunction &MF) {
   SIFunctionResourceInfo Info;
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -476,9 +471,16 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
               std::max(CalleeFrameSize,
                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
 
+          const SIFunctionResourceInfo &WorstCase =
+              getWorstCaseResourceInfo(*MF.getFunction().getParent());
+          MaxSGPR = std::max(WorstCase.NumExplicitSGPR - 1, MaxSGPR);
+          MaxVGPR = std::max(WorstCase.NumVGPR - 1, MaxVGPR);
+          MaxAGPR = std::max(WorstCase.NumAGPR - 1, MaxAGPR);
+
           // Register usage of indirect calls gets handled later
           Info.UsesVCC = true;
-          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+          Info.UsesFlatScratch |=
+              WorstCase.UsesFlatScratch && ST.hasFlatAddressSpace();
           Info.HasDynamicallySizedStack = true;
           Info.HasIndirectCall = true;
         } else {
@@ -507,31 +509,49 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   return Info;
 }
 
-void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
-  // Collect the maximum number of registers from non-hardware-entrypoints.
-  // All these functions are potential targets for indirect calls.
-  int32_t NonKernelMaxSGPRs = 0;
-  int32_t NonKernelMaxVGPRs = 0;
-  int32_t NonKernelMaxAGPRs = 0;
-
-  for (const auto &I : CallGraphResourceInfo) {
-    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
-      auto &Info = I.getSecond();
-      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
-      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
-      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
-    }
-  }
+const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &
+AMDGPUResourceUsageAnalysis::getWorstCaseResourceInfo(const Module &M) {
+  if (ModuleWorstCaseInfo)
+    return *ModuleWorstCaseInfo;
+
+  computeWorstCaseModuleRegisterUsage(M);
+  return *ModuleWorstCaseInfo;
+}
+
+/// Find the worst case register usage for all callable functions in the module,
+/// assuming all reachable functions are defined in the current module.
+void AMDGPUResourceUsageAnalysis::computeWorstCaseModuleRegisterUsage(
+    const Module &M) {
+  assert(!ModuleWorstCaseInfo);
+  ModuleWorstCaseInfo = SIFunctionResourceInfo();
+  ModuleWorstCaseInfo->UsesVCC = true;
+  ModuleWorstCaseInfo->HasDynamicallySizedStack = true;
+  ModuleWorstCaseInfo->HasRecursion = true;
+  ModuleWorstCaseInfo->HasIndirectCall = true;
+
+  for (const Function &F : M) {
+    if (F.isIntrinsic())
+      continue;
 
-  // Add register usage for functions with indirect calls.
-  // For calls to unknown functions, we assume the maximum register usage of
-  // all non-hardware-entrypoints in the current module.
-  for (auto &I : CallGraphResourceInfo) {
-    auto &Info = I.getSecond();
-    if (Info.HasIndirectCall) {
-      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
-      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
-      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+    if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      continue;
+
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    const int32_t MaxVGPR = ST.getMaxNumVGPRs(F);
+    const int32_t MaxSGPR = ST.getMaxNumSGPRs(F);
+
+    ModuleWorstCaseInfo->NumVGPR =
+        std::max(ModuleWorstCaseInfo->NumVGPR, MaxVGPR);
+
+    if (ST.hasMAIInsts()) {
+      const int32_t MaxAGPR = ST.getMaxNumAGPRs(F);
+      ModuleWorstCaseInfo->NumAGPR =
+          std::max(ModuleWorstCaseInfo->NumAGPR, MaxAGPR);
     }
+
+    ModuleWorstCaseInfo->NumExplicitSGPR =
+        std::max(ModuleWorstCaseInfo->NumExplicitSGPR, MaxSGPR);
+
+    ModuleWorstCaseInfo->UsesFlatScratch |= ST.hasFlatAddressSpace();
   }
 }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index b0a2d3bffc62e..95e9abc65d3b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -20,6 +20,7 @@
 
 namespace llvm {
 
+class GCNTargetMachine;
 class GCNSubtarget;
 class MachineFunction;
 class TargetMachine;
@@ -71,12 +72,16 @@ struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass {
     return Info->getSecond();
   }
 
+  const SIFunctionResourceInfo &getWorstCaseResourceInfo(const Module &M);
+
 private:
-  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF,
-                                              const TargetMachine &TM) const;
-  void propagateIndirectCallRegisterUsage();
+  void computeWorstCaseModuleRegisterUsage(const Module &M);
+
+  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF);
 
+  const GCNTargetMachine *TM = nullptr;
   DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+  Optional<SIFunctionResourceInfo> ModuleWorstCaseInfo;
 };
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 0cd2cfa2f0e7b..f34e1051da808 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1105,6 +1105,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// unit requirement.
   unsigned getMaxNumVGPRs(const Function &F) const;
 
+  unsigned getMaxNumAGPRs(const Function &F) const {
+    return getMaxNumVGPRs(F);
+  }
+
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p MF, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.

diff  --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index d248429028516..0002b0358750e 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -154,22 +154,23 @@ bb:
 declare void @undef_func()
 
 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 64
-; GFX90A: .amdhsa_accum_offset 32
+; GFX908: .amdhsa_next_free_vgpr 128
+; GFX90A: .amdhsa_next_free_vgpr 512
+; GFX90A: .amdhsa_accum_offset 256
 ; GCN908: NumVgprs: 128
+; GCN908: NumAgprs: 128
 ; GCN90A: NumVgprs: 256
-; GCN:    NumAgprs: 32
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GCN90A: NumAgprs: 256
+; GFX908: TotalNumVgprs: 128
+; GFX90A: TotalNumVgprs: 512
+; GFX908: VGPRBlocks: 31
+; GFX90A: VGPRBlocks: 63
+; GFX908: NumVGPRsForWavesPerEU: 128
+; GFX90A: NumVGPRsForWavesPerEU: 512
+; GFX90A: AccumOffset: 256
+; GFX908: Occupancy: 2
+; GFX90A: Occupancy: 1
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index 3c32a431bc01d..261fae0a837d0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -144,7 +144,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; GCN-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
+; SDAG-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
+; GISEL-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
 ; GCN-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -177,22 +178,24 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      no_stack_extern_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x68{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x6c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT:        .vgpr_count:     0x29{{$}}
+; GCN-NEXT:        .vgpr_count:     0x40{{$}}
 ; GCN-NEXT:      no_stack_extern_call_many_args:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x68{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x6c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x90{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
+; SDAG-NEXT:        .vgpr_count:     0x40{{$}}
+; GISEL-NEXT:        .vgpr_count:     0x40{{$}}
 ; GCN-NEXT:      no_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x68{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x6c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
+; SDAG-NEXT:        .vgpr_count:     0x40{{$}}
+; GISEL-NEXT:        .vgpr_count:     0x40{{$}}
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x20{{$}}
@@ -215,16 +218,17 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      simple_stack_extern_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x68{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x6c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2a{{$}}
+; GCN-NEXT:        .vgpr_count:     0x40{{$}}
 ; GCN-NEXT:      simple_stack_indirect_call:
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GFX8-NEXT:        .sgpr_count:     0x28{{$}}
-; GFX9-NEXT:        .sgpr_count:     0x2c{{$}}
+; GFX8-NEXT:        .sgpr_count:     0x68{{$}}
+; GFX9-NEXT:        .sgpr_count:     0x6c{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT:        .vgpr_count:     0x2b{{$}}
+; SDAG-NEXT:        .vgpr_count:     0x40{{$}}
+; GISEL-NEXT:        .vgpr_count:     0x40{{$}}
 ; GCN-NEXT:      simple_stack_recurse:
 ; GCN-NEXT:        .lds_size:       0{{$}}
 ; GCN-NEXT:        .sgpr_count:     0x26{{$}}

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index dde186cbd9ca0..83a2682e7ea85 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -556,9 +556,9 @@ attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
 
 ; GCN-LABEL: {{^}}f1024:
 ; GFX9: NumVgprs: 64
-; GFX90A: NumVgprs: 64
-; GFX90A: NumAgprs: 64
-; GFX90A: TotalNumVgprs: 128
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
 ; GFX10WGP-WAVE32: NumVgprs: 128
 ; GFX10WGP-WAVE64: NumVgprs: 128
 ; GFX10CU-WAVE32: NumVgprs: 64

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
new file mode 100644
index 0000000000000..d99a13bff2eee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -0,0 +1,31 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls,
+
+ at alias = hidden alias void (), void ()* @aliasee_default
+
+; ALL-LABEL: {{^}}kernel:
+; GFX908: .amdhsa_next_free_vgpr 64
+; GFX908-NEXT: .amdhsa_next_free_sgpr 102
+
+; GFX90A: .amdhsa_next_free_vgpr 256
+; GFX90A-NEXT: .amdhsa_next_free_sgpr 102
+; GFX90A-NEXT: .amdhsa_accum_offset 128
+define amdgpu_kernel void @kernel() #0 {
+bb:
+  call void @alias() #2
+  ret void
+}
+
+define internal void @aliasee_default() #1 {
+bb:
+  call void asm sideeffect "; clobber a26 ", "~{a26}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn }
+attributes #2 = { nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
new file mode 100644
index 0000000000000..db51b5fc5c8ee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -0,0 +1,26 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls,
+
+ at alias0 = hidden alias void (), void ()* @aliasee_default_vgpr64_sgpr102
+
+; CHECK-LABEL: {{^}}kernel0:
+; CHECK: .amdhsa_next_free_vgpr 64
+; CHECK-NEXT: .amdhsa_next_free_sgpr 102
+define amdgpu_kernel void @kernel0() #0 {
+bb:
+  call void @alias0() #2
+  ret void
+}
+
+define internal void @aliasee_default_vgpr64_sgpr102() #1 {
+bb:
+  call void asm sideeffect "; clobber v52 ", "~{v52}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn }
+attributes #2 = { nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
new file mode 100644
index 0000000000000..28ec550b7ae6a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -0,0 +1,29 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls,
+
+ at alias1 = hidden alias void (), void ()* @aliasee_vgpr32_sgpr76
+
+; The parent kernel has a higher VGPR usage than the possible callees.
+
+; CHECK-LABEL: {{^}}kernel1:
+; CHECK: .amdhsa_next_free_vgpr 42
+; CHECK-NEXT: .amdhsa_next_free_sgpr 74
+define amdgpu_kernel void @kernel1() #0 {
+bb:
+  call void asm sideeffect "; clobber v40 ", "~{v40}"()
+  call void @alias1() #2
+  ret void
+}
+
+define internal void @aliasee_vgpr32_sgpr76() #1 {
+bb:
+  call void asm sideeffect "; clobber v26 ", "~{v26}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-waves-per-eu"="8,10" }
+attributes #2 = { nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
new file mode 100644
index 0000000000000..edb69dfe1c491
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -0,0 +1,26 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls,
+
+ at alias2 = hidden alias void (), void()* @aliasee_vgpr64_sgpr102
+
+; CHECK-LABEL: {{^}}kernel2:
+; CHECK: .amdhsa_next_free_vgpr 64
+; CHECK-NEXT: .amdhsa_next_free_sgpr 102
+define amdgpu_kernel void @kernel2() #0 {
+bb:
+  call void @alias2() #2
+  ret void
+}
+
+define internal void @aliasee_vgpr64_sgpr102() #1 {
+bb:
+  call void asm sideeffect "; clobber v52 ", "~{v52}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-waves-per-eu"="4,10" }
+attributes #2 = { nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
new file mode 100644
index 0000000000000..f5093493f87b0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -0,0 +1,26 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls,
+
+ at alias3 = hidden alias void (), void ()* @aliasee_vgpr256_sgpr102
+
+; CHECK-LABEL: {{^}}kernel3:
+; CHECK: .amdhsa_next_free_vgpr 256
+; CHECK-NEXT: .amdhsa_next_free_sgpr 102
+define amdgpu_kernel void @kernel3() #0 {
+bb:
+  call void @alias3() #2
+  ret void
+}
+
+define internal void @aliasee_vgpr256_sgpr102() #1 {
+bb:
+  call void asm sideeffect "; clobber v252 ", "~{v252}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,1" }
+attributes #2 = { nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index e91d62c4c3f2e..9a2f055f11116 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -227,10 +227,10 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; CI: NumSgprs: 104
+; VI-NOBUG: NumSgprs: 108
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr96_external_call()  {
 entry:
   tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@@ -241,10 +241,10 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; CI: NumSgprs: 104
+; VI-NOBUG: NumSgprs: 108
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr160_external_call()  {
 entry:
   tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -255,10 +255,10 @@ entry:
 ; Make sure there's no assert when a vgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_vgpr160_external_call
 ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; CI: NumSgprs: 104
+; VI-NOBUG: NumSgprs: 108
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_vgpr160_external_call()  {
 entry:
   tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 73ccb58c1906b..a66c54cde282e 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -16,8 +16,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-NEXT:     amd_machine_version_stepping = 0
 ; GCN-NEXT:     kernel_code_entry_byte_offset = 256
 ; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
-; GCN-NEXT:     granulated_workitem_vgpr_count = 7
-; GCN-NEXT:     granulated_wavefront_sgpr_count = 4
+; GCN-NEXT:     granulated_workitem_vgpr_count = 15
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT:     priority = 0
 ; GCN-NEXT:     float_mode = 240
 ; GCN-NEXT:     priv = 0
@@ -60,8 +60,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GCN-NEXT:     gds_segment_byte_size = 0
 ; GCN-NEXT:     kernarg_segment_byte_size = 64
 ; GCN-NEXT:     workgroup_fbarrier_count = 0
-; GCN-NEXT:     wavefront_sgpr_count = 37
-; GCN-NEXT:     workitem_vgpr_count = 32
+; GCN-NEXT:     wavefront_sgpr_count = 104
+; GCN-NEXT:     workitem_vgpr_count = 64
 ; GCN-NEXT:     reserved_vgpr_first = 0
 ; GCN-NEXT:     reserved_vgpr_count = 0
 ; GCN-NEXT:     reserved_sgpr_first = 0
@@ -109,8 +109,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GISEL-NEXT:     amd_machine_version_stepping = 0
 ; GISEL-NEXT:     kernel_code_entry_byte_offset = 256
 ; GISEL-NEXT:     kernel_code_prefetch_byte_size = 0
-; GISEL-NEXT:     granulated_workitem_vgpr_count = 7
-; GISEL-NEXT:     granulated_wavefront_sgpr_count = 4
+; GISEL-NEXT:     granulated_workitem_vgpr_count = 15
+; GISEL-NEXT:     granulated_wavefront_sgpr_count = 12
 ; GISEL-NEXT:     priority = 0
 ; GISEL-NEXT:     float_mode = 240
 ; GISEL-NEXT:     priv = 0
@@ -153,8 +153,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr(i8) {
 ; GISEL-NEXT:     gds_segment_byte_size = 0
 ; GISEL-NEXT:     kernarg_segment_byte_size = 64
 ; GISEL-NEXT:     workgroup_fbarrier_count = 0
-; GISEL-NEXT:     wavefront_sgpr_count = 37
-; GISEL-NEXT:     workitem_vgpr_count = 32
+; GISEL-NEXT:     wavefront_sgpr_count = 104
+; GISEL-NEXT:     workitem_vgpr_count = 64
 ; GISEL-NEXT:     reserved_vgpr_first = 0
 ; GISEL-NEXT:     reserved_vgpr_count = 0
 ; GISEL-NEXT:     reserved_sgpr_first = 0
@@ -207,8 +207,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GCN-NEXT:     amd_machine_version_stepping = 0
 ; GCN-NEXT:     kernel_code_entry_byte_offset = 256
 ; GCN-NEXT:     kernel_code_prefetch_byte_size = 0
-; GCN-NEXT:     granulated_workitem_vgpr_count = 7
-; GCN-NEXT:     granulated_wavefront_sgpr_count = 4
+; GCN-NEXT:     granulated_workitem_vgpr_count = 15
+; GCN-NEXT:     granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT:     priority = 0
 ; GCN-NEXT:     float_mode = 240
 ; GCN-NEXT:     priv = 0
@@ -251,8 +251,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GCN-NEXT:     gds_segment_byte_size = 0
 ; GCN-NEXT:     kernarg_segment_byte_size = 64
 ; GCN-NEXT:     workgroup_fbarrier_count = 0
-; GCN-NEXT:     wavefront_sgpr_count = 37
-; GCN-NEXT:     workitem_vgpr_count = 32
+; GCN-NEXT:     wavefront_sgpr_count = 104
+; GCN-NEXT:     workitem_vgpr_count = 64
 ; GCN-NEXT:     reserved_vgpr_first = 0
 ; GCN-NEXT:     reserved_vgpr_count = 0
 ; GCN-NEXT:     reserved_sgpr_first = 0
@@ -301,8 +301,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GISEL-NEXT:     amd_machine_version_stepping = 0
 ; GISEL-NEXT:     kernel_code_entry_byte_offset = 256
 ; GISEL-NEXT:     kernel_code_prefetch_byte_size = 0
-; GISEL-NEXT:     granulated_workitem_vgpr_count = 7
-; GISEL-NEXT:     granulated_wavefront_sgpr_count = 4
+; GISEL-NEXT:     granulated_workitem_vgpr_count = 15
+; GISEL-NEXT:     granulated_wavefront_sgpr_count = 12
 ; GISEL-NEXT:     priority = 0
 ; GISEL-NEXT:     float_mode = 240
 ; GISEL-NEXT:     priv = 0
@@ -345,8 +345,8 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg(i8) {
 ; GISEL-NEXT:     gds_segment_byte_size = 0
 ; GISEL-NEXT:     kernarg_segment_byte_size = 64
 ; GISEL-NEXT:     workgroup_fbarrier_count = 0
-; GISEL-NEXT:     wavefront_sgpr_count = 37
-; GISEL-NEXT:     workitem_vgpr_count = 32
+; GISEL-NEXT:     wavefront_sgpr_count = 104
+; GISEL-NEXT:     workitem_vgpr_count = 64
 ; GISEL-NEXT:     reserved_vgpr_first = 0
 ; GISEL-NEXT:     reserved_vgpr_count = 0
 ; GISEL-NEXT:     reserved_sgpr_first = 0


        

