[llvm] 6a3beb1 - [AMDGPU] [IndirectCalls] Don't propagate attributes to address taken functions and their callees

Thu Jun 3 23:07:13 PDT 2021

Author: madhur13490
Date: 2021-06-04T11:36:56+05:30
New Revision: 6a3beb1f68d6791a4cd0190f68b48510f754a00a

URL: https://github.com/llvm/llvm-project/commit/6a3beb1f68d6791a4cd0190f68b48510f754a00a
DIFF: https://github.com/llvm/llvm-project/commit/6a3beb1f68d6791a4cd0190f68b48510f754a00a.diff

LOG: [AMDGPU] [IndirectCalls] Don't propagate attributes to address taken functions and their callees

Don't propagate launch bound related attributes to
address taken functions and their callees. The idea
is to do a traversal over the call graph starting at
address taken functions and erase the attributes
set by previous logic i.e. process().

This two phase approach makes sure that we don't
miss out on deep nested callees from address taken
functions as a function might be called directly as
well as indirectly.

This patch is also reattempt to D94585 as latent issues
are fixed in hasAddressTaken function in the recent
past.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D103138

Added: 
    llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
    llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
    llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
    llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
    llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
    llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index 0e4c26170a8fa..fbad7d9fdcf20 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -30,11 +30,13 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include <queue>
 
 #define DEBUG_TYPE "amdgpu-propagate-attributes"
 
@@ -113,12 +115,17 @@ class AMDGPUPropagateAttributes {
   // Clone functions as needed or just set attributes.
   bool AllowClone;
 
+  CallGraph *ModuleCG = nullptr;
+
   // Option propagation roots.
   SmallSet<Function *, 32> Roots;
 
   // Clones of functions with their attributes.
   SmallVector<Clone, 32> Clones;
 
+  // To memoize address taken functions.
+  SmallSet<Function *, 32> AddressTakenFunctions;
+
   // Find a clone with required features.
   Function *findFunction(const FnProperties &PropsNeeded,
                          Function *OrigF);
@@ -139,14 +146,23 @@ class AMDGPUPropagateAttributes {
   bool process();
 
 public:
-  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
-    TM(TM), AllowClone(AllowClone) {}
+  AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone)
+      : TM(TM), AllowClone(AllowClone) {}
 
   // Use F as a root and propagate its attributes.
   bool process(Function &F);
 
   // Propagate attributes starting from kernel functions.
-  bool process(Module &M);
+  bool process(Module &M, CallGraph *CG);
+
+  // Remove attributes from F.
+  // This is used in presence of address taken functions.
+  bool removeAttributes(Function *F);
+
+  // Handle call graph rooted at address taken functions.
+  // This function will erase all attributes present
+  // on all functions called from address taken functions transitively.
+  bool handleAddressTakenFunctions(CallGraph *CG);
 };
 
 // Allows to propagate attributes early, but no clonning is allowed as it must
@@ -182,6 +198,7 @@ class AMDGPUPropagateAttributesLate : public ModulePass {
       *PassRegistry::getPassRegistry());
   }
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
   bool runOnModule(Module &M) override;
 };
 
@@ -199,6 +216,49 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
                 "Late propagate attributes from kernels to functions",
                 false, false)
 
+bool AMDGPUPropagateAttributes::removeAttributes(Function *F) {
+  bool Changed = false;
+  if (!F)
+    return Changed;
+  LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n');
+  for (unsigned I = 0; I < NumAttr; ++I) {
+    if (F->hasFnAttribute(AttributeNames[I])) {
+      F->removeFnAttr(AttributeNames[I]);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) {
+  assert(ModuleCG && "Call graph not present");
+
+  bool Changed = false;
+  SmallSet<CallGraphNode *, 32> Visited;
+
+  for (Function *F : AddressTakenFunctions) {
+    CallGraphNode *CGN = (*CG)[F];
+    if (!Visited.count(CGN)) {
+      Changed |= removeAttributes(F);
+      Visited.insert(CGN);
+    }
+
+    std::queue<CallGraphNode *> SubGraph;
+    SubGraph.push(CGN);
+    while (!SubGraph.empty()) {
+      CallGraphNode *CGN = SubGraph.front();
+      SubGraph.pop();
+      if (!Visited.count(CGN)) {
+        Changed |= removeAttributes(CGN->getFunction());
+        Visited.insert(CGN);
+      }
+      for (auto N : *CGN)
+        SubGraph.push(N.second);
+    }
+  }
+  return Changed;
+}
+
 Function *
 AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
                                         Function *OrigF) {
@@ -210,11 +270,12 @@ AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
   return nullptr;
 }
 
-bool AMDGPUPropagateAttributes::process(Module &M) {
+bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) {
   for (auto &F : M.functions())
     if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
       Roots.insert(&F);
 
+  ModuleCG = CG;
   return process();
 }
 
@@ -240,6 +301,9 @@ bool AMDGPUPropagateAttributes::process() {
       if (F.isDeclaration())
         continue;
 
+      if (F.hasAddressTaken(nullptr, true, true, true))
+        AddressTakenFunctions.insert(&F);
+
       const FnProperties CalleeProps(*TM, F);
       SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
       SmallSet<CallBase *, 32> Visited;
@@ -310,6 +374,12 @@ bool AMDGPUPropagateAttributes::process() {
   Roots.clear();
   Clones.clear();
 
+  // Keep the post processing related to indirect
+  // calls separate to handle them gracefully.
+  // The core traversal need not be affected by this.
+  if (AllowClone)
+    Changed |= handleAddressTakenFunctions(ModuleCG);
+
   return Changed;
 }
 
@@ -389,6 +459,10 @@ bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
   return AMDGPUPropagateAttributes(TM, false).process(F);
 }
 
+void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<CallGraphWrapperPass>();
+}
+
 bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
   if (!TM) {
     auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
@@ -397,8 +471,8 @@ bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
 
     TM = &TPC->getTM<TargetMachine>();
   }
-
-  return AMDGPUPropagateAttributes(TM, true).process(M);
+  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+  return AMDGPUPropagateAttributes(TM, true).process(M, &CG);
 }
 
 FunctionPass
@@ -423,8 +497,9 @@ AMDGPUPropagateAttributesEarlyPass::run(Function &F,
 }
 
 PreservedAnalyses
-AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
-  return AMDGPUPropagateAttributes(&TM, true).process(M)
-             ? PreservedAnalyses::none()
-             : PreservedAnalyses::all();
+AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) {
+  AMDGPUPropagateAttributes APA(&TM, true);
+  CallGraph &CG = MAM.getResult<CallGraphAnalysis>(M);
+  const bool Changed = APA.process(M, &CG);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
index c18898164e25c..25051b885f9d3 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -65,6 +65,7 @@
 ; GCN-O1-NEXT:     AMDGPU Printf lowering
 ; GCN-O1-NEXT:       FunctionPass Manager
 ; GCN-O1-NEXT:         Dominator Tree Construction
+; GCN-O1-NEXT:     CallGraph Construction
 ; GCN-O1-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O1-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; GCN-O1-NEXT:       FunctionPass Manager
@@ -374,6 +375,7 @@
 ; GCN-O2-NEXT:     AMDGPU Printf lowering
 ; GCN-O2-NEXT:       FunctionPass Manager
 ; GCN-O2-NEXT:         Dominator Tree Construction
+; GCN-O2-NEXT:     CallGraph Construction
 ; GCN-O2-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O2-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; GCN-O2-NEXT:       FunctionPass Manager
@@ -728,6 +730,7 @@
 ; GCN-O3-NEXT:     AMDGPU Printf lowering
 ; GCN-O3-NEXT:       FunctionPass Manager
 ; GCN-O3-NEXT:         Dominator Tree Construction
+; GCN-O3-NEXT:     CallGraph Construction
 ; GCN-O3-NEXT:     Late propagate attributes from kernels to functions
 ; GCN-O3-NEXT:     FunctionPass Manager
 ; GCN-O3-NEXT:       Dominator Tree Construction

diff  --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
new file mode 100644
index 0000000000000..394a390980aee
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-common-callees.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; CHECK-LABEL: define float @common() {
+define float @common() {
+   ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+  %direct_call = call contract float @common()
+  ret float %direct_call
+}
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+   ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+   ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float ()* @foo, float ()** %fn
+  br label %sw.epilog
+
+sw.bb2:
+  store float ()* @bar, float ()** %fn
+  br label %sw.epilog
+
+sw.bb3:
+  store float ()* @baz, float ()** %fn
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float ()*, float ()** %fn
+  %direct_call = call contract float @common()
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
new file mode 100644
index 0000000000000..f3e88ea129003
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-direct-indirect-common-callee.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test to check if we skip propgating attributes even if
+; a function is called directly as well as
+; indirectly. "baz" is called directly as well indirectly.
+
+; CHECK-LABEL: define float @foo()
+define float @foo() #1 {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @bar()
+define float @bar() #1 {
+ ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz()
+define float @baz() #1 {
+ ret float 0.0
+}
+
+define amdgpu_kernel void @switch_indirect_kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  switch i32 %type, label %sw.default [
+    i32 1, label %sw.bb
+    i32 2, label %sw.bb2
+    i32 3, label %sw.bb3
+  ]
+
+sw.bb:
+  store float ()* @foo, float ()** %fn
+  br label %sw.epilog
+
+sw.bb2:
+  store float ()* @bar, float ()** %fn
+  br label %sw.epilog
+
+sw.bb3:
+  store float ()* @baz, float ()** %fn
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  %fp = load float ()*, float ()** %fn
+  %direct_call = call contract float @baz()
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
index 370c5cc17e91f..44d52fb934f67 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-function-pointer-argument.ll
@@ -30,11 +30,14 @@ define private void @f(void ()* nocapture %0) #0 {
 ; In order to expose this bug, it is necessary that `g` have one of the
 ; propagated attributes, so that a clone and substitution would take place if g
 ; were actually the function being called.
-; CHECK-DAG: define private void @g.1() #1
-; CHECK-DAG: define internal void @g() #2
+; CHECK-DAG: define private void @g.1() #0
+; CHECK-DAG: define internal void @g() #1
 define private void @g() #1 {
     ret void
 }
 
 attributes #0 = { noinline }
 attributes #1 = { noinline "amdgpu-waves-per-eu"="1,10" }
+
+; CHECK: attributes #0 = { noinline }
+; CHECK-NEXT: attributes #1 = { noinline "target-features"="+enable-ds128,+enable-prt-strict-null,+flat-address-space,+flat-for-global,+load-store-opt,+promote-alloca,+trap-handler,+unaligned-access-mode,-wavefrontsize16,-wavefrontsize32,+wavefrontsize64" }

diff  --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
new file mode 100644
index 0000000000000..2401f4c81a753
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-indirect.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test to check if we skip attributes on address
+; taken functions and its callees.
+
+; CHECK-LABEL: define float @bar() {
+define float @bar() {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @baz() {
+define float @baz() {
+  ret float 0.0
+}
+
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+  %v1 = call contract float @bar()
+  %v2 = call contract float @baz()
+  %v3 = fadd float %v1, %v2
+  ret float %v3
+}
+
+; CHECK-LABEL: define amdgpu_kernel void @kernel(float* %result, i32 %type) #0 {
+define amdgpu_kernel void @kernel(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }

diff  --git a/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll b/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll
new file mode 100644
index 0000000000000..362c40293c39d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/propagate-attributes-launch-bounds.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late  %s | FileCheck %s
+
+; Test attributes on a function which
+; is called indirectly from two kernels
+; having 
diff erent launch bounds.
+
+; This function should not have any attributes on it.
+; CHECK-LABEL: define float @foo() {
+define float @foo() {
+   ret float 0.0
+}
+
+define amdgpu_kernel void @kernel1(float *%result, i32 %type) #1 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+define amdgpu_kernel void @kernel2(float *%result, i32 %type) #2 {
+  %fn = alloca float ()*
+  store float ()* @foo, float ()** %fn
+  %fp = load float ()*, float ()** %fn
+  %indirect_call = call contract float %fp()
+  store float %indirect_call, float* %result
+  ret void
+}
+
+attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #2 = { "amdgpu-flat-work-group-size"="1,512" }