[llvm] r336587 - AMDGPU: Force inlining if LDS global address is used

Vlad Tsyrklevich via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 9 17:51:33 PDT 2018


This change broke the sanitizer bots with the following failure:
/b/sanitizer-x86_64-linux-fast/build/llvm/tools/clang/test/CodeGen/backend-unsupported-error.ll:6:10:
error: expected string not found in input
; CHECK: error: test.c:2:20: in function bar i32 (): unsupported call to
function foo.2
         ^
<stdin>:1:1: note: scanning from here
'-fp32-denormals' is not a recognized feature for this target (ignoring
feature)
^
<stdin>:4:1: note: possible intended match here
error: test.c:3:20: in function bar i32 (): unsupported call to function bar
^

I have reverted it in r336623.

On Mon, Jul 9, 2018 at 12:27 PM Matt Arsenault via llvm-commits <
llvm-commits at lists.llvm.org> wrote:

> Author: arsenm
> Date: Mon Jul  9 12:22:22 2018
> New Revision: 336587
>
> URL: http://llvm.org/viewvc/llvm-project?rev=336587&view=rev
> Log:
> AMDGPU: Force inlining if LDS global address is used
>
> These won't work for the foreseeable future. These aren't allowed
> from OpenCL, but IPO optimizations can make them appear.
>
> Also directly set the attributes on functions, regardless
> of the linkage rather than cloning functions like before.
>
> Added:
>     llvm/trunk/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
> Modified:
>     llvm/trunk/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
>     llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
>     llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h
>     llvm/trunk/test/CodeGen/AMDGPU/early-inline.ll
>     llvm/trunk/test/CodeGen/AMDGPU/stress-calls.ll
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp?rev=336587&r1=336586&r2=336587&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp Mon Jul  9
> 12:22:22 2018
> @@ -14,6 +14,9 @@
>
>  //===----------------------------------------------------------------------===//
>
>  #include "AMDGPU.h"
> +#include "AMDGPUTargetMachine.h"
> +#include "Utils/AMDGPUBaseInfo.h"
> +#include "llvm/ADT/SmallPtrSet.h"
>  #include "llvm/IR/Module.h"
>  #include "llvm/Transforms/Utils/Cloning.h"
>
> @@ -30,13 +33,18 @@ static cl::opt<bool> StressCalls(
>  class AMDGPUAlwaysInline : public ModulePass {
>    bool GlobalOpt;
>
> +  void recursivelyVisitUsers(GlobalValue &GV,
> +                             SmallPtrSetImpl<Function *>
> &FuncsToAlwaysInline);
>  public:
>    static char ID;
>
>    AMDGPUAlwaysInline(bool GlobalOpt = false) :
>      ModulePass(ID), GlobalOpt(GlobalOpt) { }
>    bool runOnModule(Module &M) override;
> -  StringRef getPassName() const override { return "AMDGPU Always Inline
> Pass"; }
> +
> +  void getAnalysisUsage(AnalysisUsage &AU) const override {
> +    AU.setPreservesAll();
> + }
>  };
>
>  } // End anonymous namespace
> @@ -46,15 +54,53 @@ INITIALIZE_PASS(AMDGPUAlwaysInline, "amd
>
>  char AMDGPUAlwaysInline::ID = 0;
>
> +void AMDGPUAlwaysInline::recursivelyVisitUsers(
> +  GlobalValue &GV,
> +  SmallPtrSetImpl<Function *> &FuncsToAlwaysInline) {
> +  SmallVector<User *, 16> Stack;
> +
> +  SmallPtrSet<const Value *, 8> Visited;
> +
> +  for (User *U : GV.users())
> +    Stack.push_back(U);
> +
> +  while (!Stack.empty()) {
> +    User *U = Stack.pop_back_val();
> +    if (!Visited.insert(U).second)
> +      continue;
> +
> +    if (Instruction *I = dyn_cast<Instruction>(U)) {
> +      Function *F = I->getParent()->getParent();
> +      if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
> +        FuncsToAlwaysInline.insert(F);
> +        Stack.push_back(F);
> +      }
> +
> +      // No need to look at further users, but we do need to inline any
> callers.
> +      continue;
> +    }
> +
> +    for (User *UU : U->users())
> +      Stack.push_back(UU);
> +  }
> +}
> +
>  bool AMDGPUAlwaysInline::runOnModule(Module &M) {
> +  AMDGPUAS AMDGPUAS = AMDGPU::getAMDGPUAS(M);
> +
>    std::vector<GlobalAlias*> AliasesToRemove;
> -  std::vector<Function *> FuncsToClone;
> +
> +  SmallPtrSet<Function *, 8> FuncsToAlwaysInline;
> +  SmallPtrSet<Function *, 8> FuncsToNoInline;
>
>    for (GlobalAlias &A : M.aliases()) {
>      if (Function* F = dyn_cast<Function>(A.getAliasee())) {
>        A.replaceAllUsesWith(F);
>        AliasesToRemove.push_back(&A);
>      }
> +
> +    // FIXME: If the aliasee isn't a function, it's some kind of constant
> expr
> +    // cast that won't be inlined through.
>    }
>
>    if (GlobalOpt) {
> @@ -63,31 +109,51 @@ bool AMDGPUAlwaysInline::runOnModule(Mod
>      }
>    }
>
> -  auto NewAttr = StressCalls ? Attribute::NoInline :
> Attribute::AlwaysInline;
> -  auto IncompatAttr
> -    = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
> -
> -  for (Function &F : M) {
> -    if (!F.hasLocalLinkage() && !F.isDeclaration() && !F.use_empty() &&
> -        !F.hasFnAttribute(IncompatAttr))
> -      FuncsToClone.push_back(&F);
> -  }
> +  // Always force inlining of any function that uses an LDS global
> address. This
> +  // is something of a workaround because we don't have a way of
> supporting LDS
> +  // objects defined in functions. LDS is always allocated by a kernel,
> and it
> +  // is difficult to manage LDS usage if a function may be used by
> multiple
> +  // kernels.
> +  //
> +  // OpenCL doesn't allow declaring LDS in non-kernels, so in practice
> this
> +  // should only appear when IPO passes manage to move LDS defined in a
> kernel
> +  // into a single user function.
> +
> +  for (GlobalVariable &GV : M.globals()) {
> +    // TODO: Region address
> +    unsigned AS = GV.getType()->getAddressSpace();
> +    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS.REGION_ADDRESS)
> +      continue;
>
> -  for (Function *F : FuncsToClone) {
> -    ValueToValueMapTy VMap;
> -    Function *NewFunc = CloneFunction(F, VMap);
> -    NewFunc->setLinkage(GlobalValue::InternalLinkage);
> -    F->replaceAllUsesWith(NewFunc);
> +    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
>    }
>
> -  for (Function &F : M) {
> -    if (F.hasLocalLinkage() && !F.hasFnAttribute(IncompatAttr)) {
> -      F.addFnAttr(NewAttr);
> +  if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
> +    auto IncompatAttr
> +      = StressCalls ? Attribute::AlwaysInline : Attribute::NoInline;
> +
> +    for (Function &F : M) {
> +      if (!F.isDeclaration() && !F.use_empty() &&
> +          !F.hasFnAttribute(IncompatAttr)) {
> +        if (StressCalls) {
> +          if (!FuncsToAlwaysInline.count(&F))
> +            FuncsToNoInline.insert(&F);
> +        } else
> +          FuncsToAlwaysInline.insert(&F);
> +      }
>      }
>    }
> -  return false;
> +
> +  for (Function *F : FuncsToAlwaysInline)
> +    F->addFnAttr(Attribute::AlwaysInline);
> +
> +  for (Function *F : FuncsToNoInline)
> +    F->addFnAttr(Attribute::NoInline);
> +
> +  return !FuncsToAlwaysInline.empty() || !FuncsToNoInline.empty();
>  }
>
>  ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
>    return new AMDGPUAlwaysInline(GlobalOpt);
>  }
> +
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=336587&r1=336586&r2=336587&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Mon Jul  9
> 12:22:22 2018
> @@ -117,11 +117,12 @@ static cl::opt<bool, true> LateCFGStruct
>    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
>    cl::Hidden);
>
> -static cl::opt<bool> EnableAMDGPUFunctionCalls(
> +static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
>    "amdgpu-function-calls",
> -  cl::Hidden,
>    cl::desc("Enable AMDGPU function call support"),
> -  cl::init(false));
> +  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
> +  cl::init(false),
> +  cl::Hidden);
>
>  // Enable lib calls simplifications
>  static cl::opt<bool> EnableLibCallSimplify(
> @@ -311,9 +312,10 @@ AMDGPUTargetMachine::AMDGPUTargetMachine
>    initAsmInfo();
>  }
>
> -AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
> -
>  bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
> +bool AMDGPUTargetMachine::EnableFunctionCalls = false;
> +
> +AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
>
>  StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
>    Attribute GPUAttr = F.getFnAttribute("target-cpu");
>
> Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h?rev=336587&r1=336586&r2=336587&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h (original)
> +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.h Mon Jul  9 12:22:22
> 2018
> @@ -41,6 +41,7 @@ protected:
>
>  public:
>    static bool EnableLateStructurizeCFG;
> +  static bool EnableFunctionCalls;
>
>    AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
>                        StringRef FS, TargetOptions Options,
>
> Modified: llvm/trunk/test/CodeGen/AMDGPU/early-inline.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/early-inline.ll?rev=336587&r1=336586&r2=336587&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/early-inline.ll (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/early-inline.ll Mon Jul  9 12:22:22 2018
> @@ -16,10 +16,18 @@ entry:
>  ; CHECK: mul i32
>  ; CHECK-NOT: call i32
>
> -; CHECK: define i32 @c_alias
>  define amdgpu_kernel void @caller(i32 %x) {
>  entry:
>    %res = call i32 @callee(i32 %x)
>    store volatile i32 %res, i32 addrspace(1)* undef
>    ret void
>  }
> +
> +; CHECK-LABEL: @alias_caller(
> +; CHECK-NOT: call
> +define amdgpu_kernel void @alias_caller(i32 %x) {
> +entry:
> +  %res = call i32 @c_alias(i32 %x)
> +  store volatile i32 %res, i32 addrspace(1)* undef
> +  ret void
> +}
>
> Added:
> llvm/trunk/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll?rev=336587&view=auto
>
> ==============================================================================
> ---
> llvm/trunk/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
> (added)
> +++
> llvm/trunk/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll Mon
> Jul  9 12:22:22 2018
> @@ -0,0 +1,77 @@
> +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls
> -amdgpu-always-inline %s | FileCheck -check-prefixes=CALLS-ENABLED,ALL %s
> +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls
> -amdgpu-stress-function-calls -amdgpu-always-inline %s | FileCheck
> -check-prefixes=STRESS-CALLS,ALL %s
> +
> +target datalayout =
> "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
> +
> +@lds0 = addrspace(3) global i32 undef, align 4
> +@lds1 = addrspace(3) global [512 x i32] undef, align 4
> +@nested.lds.address = addrspace(1) global i32 addrspace(3)* @lds0, align 4
> +@gds0 = addrspace(2) global i32 undef, align 4
> +
> +@alias.lds0 = alias i32, i32 addrspace(3)* @lds0
> +@lds.cycle = addrspace(3) global i32 ptrtoint (i32 addrspace(3)*
> @lds.cycle to i32), align 4
> +
> +
> +; ALL-LABEL: define i32 @load_lds_simple() #0 {
> +define i32 @load_lds_simple() {
> +  %load = load i32, i32 addrspace(3)* @lds0, align 4
> +  ret i32 %load
> +}
> +
> +; ALL-LABEL: define i32 @load_gds_simple() #0 {
> +define i32 @load_gds_simple() {
> +  %load = load i32, i32 addrspace(2)* @gds0, align 4
> +  ret i32 %load
> +}
> +
> +; ALL-LABEL: define i32 @load_lds_const_gep() #0 {
> +define i32 @load_lds_const_gep() {
> +  %load = load i32, i32 addrspace(3)* getelementptr inbounds ([512 x
> i32], [512 x i32] addrspace(3)* @lds1, i64 0, i64 4), align 4
> +  ret i32 %load
> +}
> +
> +; ALL-LABEL: define i32 @load_lds_var_gep(i32 %idx) #0 {
> +define i32 @load_lds_var_gep(i32 %idx) {
> +  %gep = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)*
> @lds1, i32 0, i32 %idx
> +  %load = load i32, i32 addrspace(3)* %gep, align 4
> +  ret i32 %load
> +}
> +
> +; ALL-LABEL: define i32 addrspace(3)* @load_nested_address(i32 %idx) #0 {
> +define i32 addrspace(3)* @load_nested_address(i32 %idx) {
> +  %load = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(1)*
> @nested.lds.address, align 4
> +  ret i32 addrspace(3)* %load
> +}
> +
> +; ALL-LABEL: define i32 @load_lds_alias() #0 {
> +define i32 @load_lds_alias() {
> +  %load = load i32, i32 addrspace(3)* @alias.lds0, align 4
> +  ret i32 %load
> +}
> +
> +; ALL-LABEL: define i32 @load_lds_cycle() #0 {
> +define i32 @load_lds_cycle() {
> +  %load = load i32, i32 addrspace(3)* @lds.cycle, align 4
> +  ret i32 %load
> +}
> +
> +; ALL-LABEL: define i1 @icmp_lds_address() #0 {
> +define i1 @icmp_lds_address() {
> +  ret i1 icmp eq (i32 addrspace(3)* @lds0, i32 addrspace(3)* null)
> +}
> +
> +; ALL-LABEL: define i32 @transitive_call() #0 {
> +define i32 @transitive_call() {
> +  %call = call i32 @load_lds_simple()
> +  ret i32 %call
> +}
> +
> +; ALL-LABEL: define i32 @recursive_call_lds(i32 %arg0) #0 {
> +define i32 @recursive_call_lds(i32 %arg0) {
> +  %load = load i32, i32 addrspace(3)* @lds0, align 4
> +  %add = add i32 %arg0, %load
> +  %call = call i32 @recursive_call_lds(i32 %add)
> +  ret i32 %call
> +}
> +
> +; ALL: attributes #0 = { alwaysinline }
>
> Modified: llvm/trunk/test/CodeGen/AMDGPU/stress-calls.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/stress-calls.ll?rev=336587&r1=336586&r2=336587&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/AMDGPU/stress-calls.ll (original)
> +++ llvm/trunk/test/CodeGen/AMDGPU/stress-calls.ll Mon Jul  9 12:22:22 2018
> @@ -1,4 +1,4 @@
> -; RUN: opt -S -amdgpu-stress-function-calls -amdgpu-always-inline %s |
> FileCheck %s
> +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls
> -amdgpu-always-inline %s | FileCheck %s
>
>  ; CHECK: define internal fastcc i32 @alwaysinline_func(i32 %a) #0 {
>  define internal fastcc i32 @alwaysinline_func(i32 %a) alwaysinline {
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20180709/51498005/attachment.html>


More information about the llvm-commits mailing list