[llvm] r363586 - [AMDGPU] Pass to propagate ABI attributes from kernels to the functions

Mon Jun 17 10:47:28 PDT 2019

Author: rampitec
Date: Mon Jun 17 10:47:28 2019
New Revision: 363586

URL: http://llvm.org/viewvc/llvm-project?rev=363586&view=rev
Log:
[AMDGPU] Pass to propagate ABI attributes from kernels to the functions

The pass works in two modes:

Mode 1: Just set attributes starting from kernels. This can work at
the very beginning of the opt and llc pipelines, but cannot clone functions
because it must be a function pass.

Mode 2: Actually clone functions for new attributes. This can only work
after all function passes in the opt pipeline because it has to be a
module pass.

Differential Revision: https://reviews.llvm.org/D63208

Added:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
    llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-clone.ll
    llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=363586&r1=363585&r2=363586&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Mon Jun 17 10:47:28 2019
@@ -57,6 +57,8 @@ FunctionPass *createAMDGPUSimplifyLibCal
 FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
+ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
 FunctionPass *createSIModeRegisterPass();
 
@@ -91,6 +93,12 @@ ModulePass *createAMDGPULowerKernelAttri
 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
 extern char &AMDGPULowerKernelAttributesID;
 
+void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &);
+extern char &AMDGPUPropagateAttributesEarlyID;
+
+void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &);
+extern char &AMDGPUPropagateAttributesLateID;
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 

Added: llvm/trunk/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp?rev=363586&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp Mon Jun 17 10:47:28 2019
@@ -0,0 +1,336 @@
+//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass propagates attributes from kernels to the non-entry
+/// functions. Most library functions were not compiled for a specific ABI,
+/// yet will be compiled correctly if proper attributes are propagated from
+/// the caller.
+///
+/// The pass analyzes call graph and propagates ABI target features through the
+/// call graph.
+///
+/// It can run in two modes: as a function or module pass. A function pass
+/// simply propagates attributes. A module pass clones functions if there are
+/// callers with a different ABI. If a function is cloned, all call sites
+/// will be updated to use the correct clone.
+///
+/// A function pass is limited in functionality but can run early in the
+/// pipeline. A module pass is more powerful but has to run late, so misses
+/// library folding opportunities.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "amdgpu-propagate-attributes"
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <string>
+
+using namespace llvm;
+
+namespace llvm { // Feature key/value table; declared here, defined elsewhere.
+extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
+} // end namespace llvm
+
+namespace {
+
+class AMDGPUPropagateAttributes { // Shared by the early and late passes.
+  const FeatureBitset TargetFeatures = { // The only features propagated.
+    AMDGPU::FeatureWavefrontSize16,
+    AMDGPU::FeatureWavefrontSize32,
+    AMDGPU::FeatureWavefrontSize64
+  };
+
+  class Clone{ // Record tying a cloned function to its original.
+  public:
+    Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
+      FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}
+
+    FeatureBitset FeatureMask; // Caller feature bits the clone was made for.
+    Function *OrigF;           // Function that was cloned.
+    Function *NewF;            // The resulting clone.
+  };
+
+  const TargetMachine *TM; // Queried for each function's subtarget features.
+
+  // Clone functions as needed (true) or just set attributes in place (false).
+  bool AllowClone;
+
+  // Functions whose features are propagated to the functions they call.
+  SmallSet<Function *, 32> Roots;
+
+  // Clones of functions with their attributes.
+  SmallVector<Clone, 32> Clones;
+
+  // Find an existing clone of OrigF made for FeaturesNeeded, or nullptr.
+  Function *findFunction(const FeatureBitset &FeaturesNeeded,
+                         Function *OrigF);
+
+  // Clone function F and set NewFeatures on the clone.
+  // The clone takes the name of the original function.
+  Function *cloneWithFeatures(Function &F,
+                              const FeatureBitset &NewFeatures);
+
+  // Replace the "target-features" attribute of F in place.
+  void setFeatures(Function &F, const FeatureBitset &NewFeatures);
+
+  std::string getFeatureString(const FeatureBitset &Features) const;
+
+  // Propagate attributes from Roots. Returns true if anything was changed.
+  bool process();
+
+public:
+  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
+    TM(TM), AllowClone(AllowClone) {}
+
+  // Use F as a root and propagate its attributes.
+  bool process(Function &F);
+
+  // Propagate attributes starting from kernel functions.
+  bool process(Module &M);
+};
+
+// Early variant: it has to be a function pass in order to run before any
+// optimizations, which means it may only set attributes and never clone.
+// TODO: A single instance of a module pass should be enough, but it would
+// need to be in the linker pipeline, which is currently not possible.
+class AMDGPUPropagateAttributesEarly : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID; // Pass identification
+
+  AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeAMDGPUPropagateAttributesEarlyPass(Registry);
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+// Late variant: a module pass, so it is able to clone functions, but it only
+// gets to do so late in the pipeline.
+class AMDGPUPropagateAttributesLate : public ModulePass {
+  const TargetMachine *TM;
+
+public:
+  static char ID; // Pass identification
+
+  AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr)
+      : ModulePass(ID), TM(TM) {
+    PassRegistry &Registry = *PassRegistry::getPassRegistry();
+    initializeAMDGPUPropagateAttributesLatePass(Registry);
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+}  // end anonymous namespace.
+
+char AMDGPUPropagateAttributesEarly::ID = 0; // Pass identification
+char AMDGPUPropagateAttributesLate::ID = 0;  // Pass identification
+
+INITIALIZE_PASS(AMDGPUPropagateAttributesEarly,
+                "amdgpu-propagate-attributes-early",
+                "Early propagate attributes from kernels to functions",
+                false, false) // presumably (CFG-only, is-analysis) -- confirm
+INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
+                "amdgpu-propagate-attributes-late",
+                "Late propagate attributes from kernels to functions",
+                false, false) // presumably (CFG-only, is-analysis) -- confirm
+
+Function *
+AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
+                                        Function *OrigF) {
+  // Linear scan of the recorded clones; the first one made from OrigF for
+  // exactly FeaturesNeeded wins. TODO: search for clone's clones.
+  for (const Clone &Candidate : Clones)
+    if (Candidate.FeatureMask == FeaturesNeeded && Candidate.OrigF == OrigF)
+      return Candidate.NewF;
+  return nullptr;
+}
+
+bool AMDGPUPropagateAttributes::process(Module &M) {
+  // Seed the roots with every function that has an entry calling convention.
+  for (Function &Fn : M.functions())
+    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
+      Roots.insert(&Fn);
+  return process();
+}
+
+bool AMDGPUPropagateAttributes::process(Function &F) {
+  Roots.insert(&F); // F becomes a propagation root.
+  return process(); // Propagate from the roots collected so far.
+}
+
+// Propagate TargetFeatures from every root to the functions it calls. Each
+// callee that ends up matching its caller becomes a root for the next round,
+// so the loop runs until no new roots appear. Returns true on any change.
+bool AMDGPUPropagateAttributes::process() {
+  bool Changed = false;
+  SmallSet<Function *, 32> NewRoots;
+  SmallSet<Function *, 32> Replaced;
+
+  if (Roots.empty())
+    return false;
+  Module &M = *(*Roots.begin())->getParent();
+
+  do {
+    Roots.insert(NewRoots.begin(), NewRoots.end());
+    NewRoots.clear();
+
+    for (auto &F : M.functions()) {
+      if (F.isDeclaration() || Roots.count(&F))
+        continue;
+
+      const FeatureBitset &CalleeBits =
+        TM->getSubtargetImpl(F)->getFeatureBits();
+      SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
+
+      for (User *U : F.users()) {
+        // Only direct calls of F matter. A call that merely passes F as an
+        // argument must be skipped, or its callee would be overwritten below.
+        CallBase *CI = dyn_cast<CallBase>(U);
+        if (!CI || CI->getCalledFunction() != &F)
+          continue;
+        Function *Caller = CI->getCaller();
+        if (!Caller || !Roots.count(Caller))
+          continue;
+
+        const FeatureBitset CallerBits =
+          TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;
+
+        if (CallerBits == (CalleeBits & TargetFeatures)) {
+          NewRoots.insert(&F);
+          continue;
+        }
+
+        Function *NewF = findFunction(CallerBits, &F);
+        if (!NewF) {
+          FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
+                                    CallerBits);
+          if (!AllowClone) {
+            // This may set different features on different iterations if
+            // there is a contradiction in callers' attributes. In this case
+            // we rely on a second pass running on Module, which is allowed
+            // to clone.
+            setFeatures(F, NewFeatures);
+            NewRoots.insert(&F);
+            Changed = true;
+            break;
+          }
+
+          NewF = cloneWithFeatures(F, NewFeatures);
+          Clones.push_back(Clone(CallerBits, &F, NewF));
+          NewRoots.insert(NewF);
+        }
+
+        ToReplace.push_back(std::make_pair(CI, NewF));
+        Replaced.insert(&F);
+        Changed = true;
+      }
+
+      // Call sites are retargeted only after the walk over F's users is done.
+      while (!ToReplace.empty()) {
+        auto R = ToReplace.pop_back_val();
+        R.first->setCalledFunction(R.second);
+      }
+    }
+  } while (!NewRoots.empty());
+
+  // Originals that lost all their uses to clones are removed from the module.
+  for (Function *F : Replaced)
+    if (F->use_empty())
+      F->eraseFromParent();
+
+  return Changed;
+}
+
+Function *
+AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
+                                             const FeatureBitset &NewFeatures) {
+  LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
+
+  ValueToValueMapTy VMap; // Only needed as an argument to CloneFunction.
+  Function *Copy = CloneFunction(&F, VMap);
+  setFeatures(*Copy, NewFeatures);
+  if (!F.hasName())
+    return Copy;
+
+  // Exchange the names: the clone carries the original name from now on and
+  // F keeps the name that was generated for the clone.
+  std::string GeneratedName = Copy->getName();
+  Copy->takeName(&F);
+  F.setName(GeneratedName);
+
+  // F no longer has its original name, so it does not need an external symbol.
+  F.setVisibility(GlobalValue::DefaultVisibility);
+  F.setLinkage(GlobalValue::InternalLinkage);
+
+  return Copy;
+}
+
+void AMDGPUPropagateAttributes::setFeatures(Function &F,
+                                            const FeatureBitset &NewFeatures) {
+  // The debug output is limited to the TargetFeatures subset, whereas the
+  // attribute written below is built from all of NewFeatures.
+  LLVM_DEBUG(dbgs() << "Set features "
+                    << getFeatureString(NewFeatures & TargetFeatures)
+                    << " on " << F.getName() << '\n');
+  const std::string FeatureAttr = getFeatureString(NewFeatures);
+  F.removeFnAttr("target-features");
+  F.addFnAttr("target-features", FeatureAttr);
+}
+
+std::string
+AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
+{
+  std::string Ret; // "+name" when set, "-name" for unset TargetFeatures.
+  for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV)
+    if (Features[KV.Value])
+      Ret += (StringRef("+") + KV.Key + ",").str();
+    else if (TargetFeatures[KV.Value])
+      Ret += (StringRef("-") + KV.Key + ",").str();
+
+  if (!Ret.empty()) // pop_back() on an empty string is undefined behavior.
+    Ret.pop_back(); // Remove last comma.
+  return Ret;
+}
+
+bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
+  // Without a target machine, or for a non-entry function, there is no work.
+  if (TM && AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+    return AMDGPUPropagateAttributes(TM, /*AllowClone=*/false).process(F);
+  return false;
+}
+
+bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
+  // The module-level run is the one that is allowed to clone functions.
+  if (TM)
+    return AMDGPUPropagateAttributes(TM, /*AllowClone=*/true).process(M);
+  return false;
+}
+
+FunctionPass
+*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) {
+  return new AMDGPUPropagateAttributesEarly(TM); // Sets attributes only.
+}
+
+ModulePass
+*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
+  return new AMDGPUPropagateAttributesLate(TM); // May clone functions.
+}

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=363586&r1=363585&r2=363586&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Mon Jun 17 10:47:28 2019
@@ -217,6 +217,8 @@ extern "C" void LLVMInitializeAMDGPUTarg
   initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
+  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
+  initializeAMDGPUPropagateAttributesLatePass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
@@ -402,13 +404,14 @@ void AMDGPUTargetMachine::adjustPassMana
 
   Builder.addExtension(
     PassManagerBuilder::EP_ModuleOptimizerEarly,
-    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
-                                         legacy::PassManagerBase &PM) {
+    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
+                                               legacy::PassManagerBase &PM) {
       if (AMDGPUAA) {
         PM.add(createAMDGPUAAWrapperPass());
         PM.add(createAMDGPUExternalAAWrapperPass());
       }
       PM.add(createAMDGPUUnifyMetadataPass());
+      PM.add(createAMDGPUPropagateAttributesLatePass(this));
       if (Internalize) {
         PM.add(createInternalizePass(mustPreserveGV));
         PM.add(createGlobalDCEPass());
@@ -420,12 +423,13 @@ void AMDGPUTargetMachine::adjustPassMana
   const auto &Opt = Options;
   Builder.addExtension(
     PassManagerBuilder::EP_EarlyAsPossible,
-    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
-                                      legacy::PassManagerBase &PM) {
+    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
+                                            legacy::PassManagerBase &PM) {
       if (AMDGPUAA) {
         PM.add(createAMDGPUAAWrapperPass());
         PM.add(createAMDGPUExternalAAWrapperPass());
       }
+      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
       PM.add(llvm::createAMDGPUUseNativeCallsPass());
       if (LibCallSimplify)
         PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
@@ -654,6 +658,9 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  // A call to propagate attributes pass in the backend in case opt was not run.
+  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
+
   addPass(createAtomicExpandPass());
 
   // This must occur before inlining, as the inliner will not look through

Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=363586&r1=363585&r2=363586&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Mon Jun 17 10:47:28 2019
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPromoteAlloca.cpp
+  AMDGPUPropagateAttributes.cpp
   AMDGPURegAsmNames.inc.cpp
   AMDGPURegisterBankInfo.cpp
   AMDGPURegisterInfo.cpp

Added: llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-clone.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-clone.ll?rev=363586&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-clone.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-clone.ll Mon Jun 17 10:47:28 2019
@@ -0,0 +1,87 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s
+
+; OPT: declare void @foo4() local_unnamed_addr #0
+; OPT: define internal fastcc void @foo3.2() unnamed_addr #1
+; OPT: define void @foo2() local_unnamed_addr #1
+; OPT: define internal fastcc void @foo1.1() unnamed_addr #1
+; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2
+; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #3
+; OPT: define amdgpu_kernel void @kernel3() local_unnamed_addr #3
+; OPT: define void @foo1() local_unnamed_addr #4
+; OPT: define void @foo3() local_unnamed_addr #4
+; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" }
+; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,-wavefrontsize32,+wavefrontsize64{{.*}}" }
+; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32" }
+; OPT: attributes #3 = { {{.*}} "target-features"="+wavefrontsize64" }
+; OPT: attributes #4 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,+wavefrontsize32,-wavefrontsize64{{.*}}" }
+
+; LLC: foo3:
+; LLC: sample asm
+; LLC: foo2:
+; LLC: sample asm
+; LLC: foo1:
+; LLC: foo4@gotpcrel32@lo+4
+; LLC: foo4@gotpcrel32@hi+4
+; LLC: foo3@gotpcrel32@lo+4
+; LLC: foo3@gotpcrel32@hi+4
+; LLC: foo2@gotpcrel32@lo+4
+; LLC: foo2@gotpcrel32@hi+4
+; LLC: foo1@gotpcrel32@lo+4
+; LLC: foo1@gotpcrel32@hi+4
+; LLC: kernel1:
+; LLC: foo1@gotpcrel32@lo+4
+; LLC: foo1@gotpcrel32@hi+4
+; LLC: kernel2:
+; LLC: foo2@gotpcrel32@lo+4
+; LLC: foo2@gotpcrel32@hi+4
+; LLC: kernel3:
+; LLC: foo1@gotpcrel32@lo+4
+; LLC: foo1@gotpcrel32@hi+4
+
+declare void @foo4() #1
+
+define void @foo3() #1 {
+entry:
+  call void asm sideeffect "; sample asm", ""()
+  ret void
+}
+
+define void @foo2() #1 {
+entry:
+  call void asm sideeffect "; sample asm", ""()
+  ret void
+}
+
+define void @foo1() #1 {
+entry:
+  tail call void @foo4()
+  tail call void @foo3()
+  tail call void @foo2()
+  tail call void @foo2()
+  tail call void @foo1()
+  ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+entry:
+  tail call void @foo1()
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() #2 {
+entry:
+  tail call void @foo2()
+  ret void
+}
+
+define amdgpu_kernel void @kernel3() #3 {
+entry:
+  tail call void @foo1()
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+wavefrontsize32" }
+attributes #1 = { noinline nounwind "target-features"="+wavefrontsize64" }
+attributes #2 = { nounwind "target-features"="+wavefrontsize64" }
+attributes #3 = { nounwind "target-features"="+wavefrontsize64" }

Added: llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll?rev=363586&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/propagate-attributes-single-set.ll Mon Jun 17 10:47:28 2019
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=LLC %s
+
+; OPT: declare void @foo4() local_unnamed_addr #0
+; OPT: define void @foo3() local_unnamed_addr #1
+; OPT: define void @foo2() local_unnamed_addr #1
+; OPT: define void @foo1() local_unnamed_addr #1
+; OPT: define amdgpu_kernel void @kernel1() local_unnamed_addr #2
+; OPT: define amdgpu_kernel void @kernel2() local_unnamed_addr #2
+; OPT: attributes #0 = { {{.*}} "target-features"="+wavefrontsize64" }
+; OPT: attributes #1 = { {{.*}} "target-features"="{{.*}},-wavefrontsize16,+wavefrontsize32,-wavefrontsize64
+; OPT: attributes #2 = { {{.*}} "target-features"="+wavefrontsize32
+; OPT: attributes #3 = { nounwind }
+
+; LLC: foo3:
+; LLC: sample asm
+; LLC: foo2:
+; LLC: sample asm
+; LLC: foo1:
+; LLC: foo4@gotpcrel32@lo+4
+; LLC: foo4@gotpcrel32@hi+4
+; LLC: foo3@gotpcrel32@lo+4
+; LLC: foo3@gotpcrel32@hi+4
+; LLC: foo2@gotpcrel32@lo+4
+; LLC: foo2@gotpcrel32@hi+4
+; LLC: foo1@gotpcrel32@lo+4
+; LLC: foo1@gotpcrel32@hi+4
+; LLC: kernel1:
+; LLC: foo1@gotpcrel32@lo+4
+; LLC: foo1@gotpcrel32@hi+4
+; LLC: kernel2:
+; LLC: foo2@gotpcrel32@lo+4
+; LLC: foo2@gotpcrel32@hi+4
+
+declare void @foo4() #1
+
+define void @foo3() #1 {
+entry:
+  call void asm sideeffect "; sample asm", ""()
+  ret void
+}
+
+define void @foo2() #1 {
+entry:
+  call void asm sideeffect "; sample asm", ""()
+  ret void
+}
+
+define void @foo1() #1 {
+entry:
+  tail call void @foo4()
+  tail call void @foo3()
+  tail call void @foo2()
+  tail call void @foo2()
+  tail call void @foo1()
+  ret void
+}
+
+define amdgpu_kernel void @kernel1() #0 {
+entry:
+  tail call void @foo1()
+  ret void
+}
+
+define amdgpu_kernel void @kernel2() #0 {
+entry:
+  tail call void @foo2()
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+wavefrontsize32" }
+attributes #1 = { noinline nounwind "target-features"="+wavefrontsize64" }