[llvm] r310731 - [AMDGPU] Ported and adopted AMDLibCalls pass

Fri Aug 11 09:42:10 PDT 2017

Author: rampitec
Date: Fri Aug 11 09:42:09 2017
New Revision: 310731

URL: http://llvm.org/viewvc/llvm-project?rev=310731&view=rev
Log:
[AMDGPU] Ported and adopted AMDLibCalls pass

The pass simplifies well-known AMD library calls.
If given the -amdgpu-prelink option, it works in a pre-link mode, which
allows it to reference new library functions that will be linked in
later.

In addition, it is also used to process the traditional AMD option
-fuse-native, which allows replacing some of the functions with
their fast native implementations from the library.

The necessary glue to pass the prelink option and translate
-fuse-native is to be added to the driver.

Differential Revision: https://reviews.llvm.org/D36436

Added:
    llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.h
    llvm/trunk/test/CodeGen/AMDGPU/simplify-libcalls.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=310731&r1=310730&r2=310731&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Fri Aug 11 09:42:09 2017
@@ -52,6 +52,8 @@ FunctionPass *createSIDebuggerInsertNops
 FunctionPass *createSIInsertWaitsPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIFixWWMLivenessPass();
+FunctionPass *createAMDGPUSimplifyLibCallsPass();
+FunctionPass *createAMDGPUUseNativeCallsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
@@ -125,6 +127,12 @@ extern char &SIOptimizeExecMaskingID;
 void initializeSIFixWWMLivenessPass(PassRegistry &);
 extern char &SIFixWWMLivenessID;
 
+void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
+extern char &AMDGPUSimplifyLibCallsID;
+
+void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
+extern char &AMDGPUUseNativeCallsID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);

Added: llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp?rev=310731&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULibCalls.cpp Fri Aug 11 09:42:09 2017
@@ -0,0 +1,1670 @@
+//===- AMDGPULibCalls.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This file does AMD library function optimizations.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "amdgpu-simplifylib"
+
+#include "AMDGPU.h"
+#include "AMDGPULibFunc.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+#include <cmath>
+
+using namespace llvm;
+
+static cl::opt<bool> EnablePreLink("amdgpu-prelink",
+  cl::desc("Enable pre-link mode optimizations"),
+  cl::init(false),
+  cl::Hidden);
+
+static cl::list<std::string> UseNative("amdgpu-use-native",
+  cl::desc("Comma separated list of functions to replace with native, or all"),
+  cl::CommaSeparated, cl::ValueOptional,
+  cl::Hidden);
+
+#define MATH_PI     3.14159265358979323846264338327950288419716939937511
+#define MATH_E      2.71828182845904523536028747135266249775724709369996
+#define MATH_SQRT2  1.41421356237309504880168872420969807856967187537695
+
+#define MATH_LOG2E     1.4426950408889634073599246810018921374266459541529859
+#define MATH_LOG10E    0.4342944819032518276511289189166050822943970058036665
+// Value of log2(10)
+#define MATH_LOG2_10   3.3219280948873623478703194294893901758648313930245806
+// Value of 1 / log2(10)
+#define MATH_RLOG2_10  0.3010299956639811952137388947244930267681898814621085
+// Value of 1 / M_LOG2E_F = 1 / log2(e)
+#define MATH_RLOG2_E   0.6931471805599453094172321214581765680755001343602552
+
+namespace llvm {
+
+class AMDGPULibCalls {
+private:
+
+  typedef llvm::AMDGPULibFunc FuncInfo;
+
+  // -fuse-native: when true, useNativeFunc() accepts every function name.
+  bool AllNative = false;
+
+  bool useNativeFunc(const StringRef F) const;
+
+  // Return a pointer (pointer expr) to the function described by fInfo if it
+  // exists. It may create a new function prototype in pre-link mode.
+  Constant *getFunction(Module *M, const FuncInfo& fInfo);
+
+  // Replace a normal function with its native version.
+  bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
+
+  bool parseFunctionName(const StringRef& FMangledName,
+                         FuncInfo *FInfo=nullptr /*out*/);
+
+  bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
+
+  /* Specialized optimizations */
+
+  // recip (half or native)
+  bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // divide (half or native)
+  bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // pow/powr/pown
+  bool fold_pow(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // rootn
+  bool fold_rootn(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // fma/mad
+  bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // -fuse-native for sincos
+  bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
+
+  // Evaluate calls whose arguments are constants.
+  bool evaluateScalarMathFunc(FuncInfo &FInfo, double& Res0,
+    double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
+  bool evaluateCall(CallInst *aCI, FuncInfo &FInfo);
+
+  // exp
+  bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // exp2
+  bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // exp10
+  bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // log
+  bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // log2
+  bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // log10
+  bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // sqrt
+  bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+
+  // sin/cos
+  bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
+
+  // Get insertion point at entry.
+  BasicBlock::iterator getEntryIns(CallInst * UI);
+  // Insert an alloca instruction.
+  AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
+  // Get a scalar native builtin single-argument FP function.
+  Constant* getNativeFunction(Module* M, const FuncInfo &FInfo);
+
+protected:
+  CallInst *CI; // Call being processed; set by fold() and useNative().
+
+  bool isUnsafeMath(const CallInst *CI) const;
+
+  void replaceCall(Value *With) { // Redirects all uses of CI, then erases it.
+    CI->replaceAllUsesWith(With);
+    CI->eraseFromParent();
+  }
+
+public:
+  bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
+
+  void initNativeFuncs();
+
+  // Replace a normal math function call with its native version.
+  bool useNative(CallInst *CI);
+};
+
+} // end llvm namespace
+
+namespace {
+
+  class AMDGPUSimplifyLibCalls : public FunctionPass {
+
+  AMDGPULibCalls Simplifier;
+
+  public:
+    static char ID; // Pass identification
+
+    AMDGPUSimplifyLibCalls() : FunctionPass(ID) {
+      initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AAResultsWrapperPass>();
+    }
+
+    bool runOnFunction(Function &M) override;
+  };
+
+  class AMDGPUUseNativeCalls : public FunctionPass {
+
+  AMDGPULibCalls Simplifier;
+
+  public:
+    static char ID; // Pass identification
+
+    AMDGPUUseNativeCalls() : FunctionPass(ID) {
+      initializeAMDGPUUseNativeCallsPass(*PassRegistry::getPassRegistry());
+      Simplifier.initNativeFuncs();
+    }
+
+    bool runOnFunction(Function &F) override;
+  };
+
+} // end anonymous namespace.
+
+char AMDGPUSimplifyLibCalls::ID = 0;
+char AMDGPUUseNativeCalls::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
+                      "Simplify well-known AMD library calls", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
+                    "Simplify well-known AMD library calls", false, false)
+
+INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
+                "Replace builtin math calls with that native versions.",
+                false, false)
+
+template <typename IRB>
+CallInst *CreateCallEx(IRB &B, Value *Callee, Value *Arg, const Twine &Name="")
+{
+  CallInst *R = B.CreateCall(Callee, Arg, Name);
+  if (Function* F = dyn_cast<Function>(Callee))
+    R->setCallingConv(F->getCallingConv());
+  return R;
+}
+
+template <typename IRB>
+CallInst *CreateCallEx2(IRB &B, Value *Callee, Value *Arg1, Value *Arg2,
+                        const Twine &Name="") {
+  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
+  if (Function* F = dyn_cast<Function>(Callee))
+    R->setCallingConv(F->getCallingConv());
+  return R;
+}
+
+//  Data structures for table-driven optimizations.
+//  FuncTbl works for both f32 and f64 functions with 1 input argument
+
+struct TableEntry {
+  double   result;
+  double   input;
+};
+
+/* a list of {result, input} */
+static const TableEntry tbl_acos[] = {
+  {MATH_PI/2.0, 0.0},
+  {MATH_PI/2.0, -0.0},
+  {0.0, 1.0},
+  {MATH_PI, -1.0}
+};
+static const TableEntry tbl_acosh[] = {
+  {0.0, 1.0}
+};
+static const TableEntry tbl_acospi[] = {
+  {0.5, 0.0},
+  {0.5, -0.0},
+  {0.0, 1.0},
+  {1.0, -1.0}
+};
+static const TableEntry tbl_asin[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0},
+  {MATH_PI/2.0, 1.0},
+  {-MATH_PI/2.0, -1.0}
+};
+static const TableEntry tbl_asinh[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_asinpi[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0},
+  {0.5, 1.0},
+  {-0.5, -1.0}
+};
+static const TableEntry tbl_atan[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0},
+  {MATH_PI/4.0, 1.0},
+  {-MATH_PI/4.0, -1.0}
+};
+static const TableEntry tbl_atanh[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_atanpi[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0},
+  {0.25, 1.0},
+  {-0.25, -1.0}
+};
+static const TableEntry tbl_cbrt[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0},
+  {1.0, 1.0},
+  {-1.0, -1.0},
+};
+static const TableEntry tbl_cos[] = {
+  {1.0, 0.0},
+  {1.0, -0.0}
+};
+static const TableEntry tbl_cosh[] = {
+  {1.0, 0.0},
+  {1.0, -0.0}
+};
+static const TableEntry tbl_cospi[] = {
+  {1.0, 0.0},
+  {1.0, -0.0}
+};
+static const TableEntry tbl_erfc[] = {
+  {1.0, 0.0},
+  {1.0, -0.0}
+};
+static const TableEntry tbl_erf[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_exp[] = {
+  {1.0, 0.0},
+  {1.0, -0.0},
+  {MATH_E, 1.0}
+};
+static const TableEntry tbl_exp2[] = {
+  {1.0, 0.0},
+  {1.0, -0.0},
+  {2.0, 1.0}
+};
+static const TableEntry tbl_exp10[] = {
+  {1.0, 0.0},
+  {1.0, -0.0},
+  {10.0, 1.0}
+};
+static const TableEntry tbl_expm1[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_log[] = {
+  {0.0, 1.0},
+  {1.0, MATH_E}
+};
+static const TableEntry tbl_log2[] = {
+  {0.0, 1.0},
+  {1.0, 2.0}
+};
+static const TableEntry tbl_log10[] = {
+  {0.0, 1.0},
+  {1.0, 10.0}
+};
+static const TableEntry tbl_rsqrt[] = {
+  {1.0, 1.0},
+  {1.0/MATH_SQRT2, 2.0}
+};
+static const TableEntry tbl_sin[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_sinh[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_sinpi[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_sqrt[] = {
+  {0.0, 0.0},
+  {1.0, 1.0},
+  {MATH_SQRT2, 2.0}
+};
+static const TableEntry tbl_tan[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_tanh[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_tanpi[] = {
+  {0.0, 0.0},
+  {-0.0, -0.0}
+};
+static const TableEntry tbl_tgamma[] = {
+  {1.0, 1.0},
+  {1.0, 2.0},
+  {2.0, 3.0},
+  {6.0, 4.0}
+};
+
+static bool HasNative(AMDGPULibFunc::EFuncId id) {
+  switch(id) {
+  case AMDGPULibFunc::EI_DIVIDE:
+  case AMDGPULibFunc::EI_COS:
+  case AMDGPULibFunc::EI_EXP:
+  case AMDGPULibFunc::EI_EXP2:
+  case AMDGPULibFunc::EI_EXP10:
+  case AMDGPULibFunc::EI_LOG:
+  case AMDGPULibFunc::EI_LOG2:
+  case AMDGPULibFunc::EI_LOG10:
+  case AMDGPULibFunc::EI_POWR:
+  case AMDGPULibFunc::EI_RECIP:
+  case AMDGPULibFunc::EI_RSQRT:
+  case AMDGPULibFunc::EI_SIN:
+  case AMDGPULibFunc::EI_SINCOS:
+  case AMDGPULibFunc::EI_SQRT:
+  case AMDGPULibFunc::EI_TAN:
+    return true;
+  default:;
+  }
+  return false;
+}
+
+struct TableRef {
+  size_t size;
+  const TableEntry *table; // variable size: from 0 to (size - 1)
+
+  TableRef() : size(0), table(nullptr) {}
+
+  template <size_t N>
+  TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {}
+};
+
+static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
+  switch(id) {
+  case AMDGPULibFunc::EI_ACOS:    return TableRef(tbl_acos);
+  case AMDGPULibFunc::EI_ACOSH:   return TableRef(tbl_acosh);
+  case AMDGPULibFunc::EI_ACOSPI:  return TableRef(tbl_acospi);
+  case AMDGPULibFunc::EI_ASIN:    return TableRef(tbl_asin);
+  case AMDGPULibFunc::EI_ASINH:   return TableRef(tbl_asinh);
+  case AMDGPULibFunc::EI_ASINPI:  return TableRef(tbl_asinpi);
+  case AMDGPULibFunc::EI_ATAN:    return TableRef(tbl_atan);
+  case AMDGPULibFunc::EI_ATANH:   return TableRef(tbl_atanh);
+  case AMDGPULibFunc::EI_ATANPI:  return TableRef(tbl_atanpi);
+  case AMDGPULibFunc::EI_CBRT:    return TableRef(tbl_cbrt);
+  case AMDGPULibFunc::EI_NCOS:
+  case AMDGPULibFunc::EI_COS:     return TableRef(tbl_cos);
+  case AMDGPULibFunc::EI_COSH:    return TableRef(tbl_cosh);
+  case AMDGPULibFunc::EI_COSPI:   return TableRef(tbl_cospi);
+  case AMDGPULibFunc::EI_ERFC:    return TableRef(tbl_erfc);
+  case AMDGPULibFunc::EI_ERF:     return TableRef(tbl_erf);
+  case AMDGPULibFunc::EI_EXP:     return TableRef(tbl_exp);
+  case AMDGPULibFunc::EI_NEXP2:
+  case AMDGPULibFunc::EI_EXP2:    return TableRef(tbl_exp2);
+  case AMDGPULibFunc::EI_EXP10:   return TableRef(tbl_exp10);
+  case AMDGPULibFunc::EI_EXPM1:   return TableRef(tbl_expm1);
+  case AMDGPULibFunc::EI_LOG:     return TableRef(tbl_log);
+  case AMDGPULibFunc::EI_NLOG2:
+  case AMDGPULibFunc::EI_LOG2:    return TableRef(tbl_log2);
+  case AMDGPULibFunc::EI_LOG10:   return TableRef(tbl_log10);
+  case AMDGPULibFunc::EI_NRSQRT:
+  case AMDGPULibFunc::EI_RSQRT:   return TableRef(tbl_rsqrt);
+  case AMDGPULibFunc::EI_NSIN:
+  case AMDGPULibFunc::EI_SIN:     return TableRef(tbl_sin);
+  case AMDGPULibFunc::EI_SINH:    return TableRef(tbl_sinh);
+  case AMDGPULibFunc::EI_SINPI:   return TableRef(tbl_sinpi);
+  case AMDGPULibFunc::EI_NSQRT:
+  case AMDGPULibFunc::EI_SQRT:    return TableRef(tbl_sqrt);
+  case AMDGPULibFunc::EI_TAN:     return TableRef(tbl_tan);
+  case AMDGPULibFunc::EI_TANH:    return TableRef(tbl_tanh);
+  case AMDGPULibFunc::EI_TANPI:   return TableRef(tbl_tanpi);
+  case AMDGPULibFunc::EI_TGAMMA:  return TableRef(tbl_tgamma);
+  default:;
+  }
+  return TableRef();
+}
+
+static inline int getVecSize(const AMDGPULibFunc& FInfo) {
+  return FInfo.Leads[0].VectorSize;
+}
+
+static inline AMDGPULibFunc::EType getArgType(const AMDGPULibFunc& FInfo) {
+  return (AMDGPULibFunc::EType)FInfo.Leads[0].ArgType;
+}
+
+Constant *AMDGPULibCalls::getFunction(Module *M, const FuncInfo& fInfo) {
+  // If we are doing PreLinkOpt, the function is external. So it is safe to
+  // use getOrInsertFunction() at this stage.
+
+  return EnablePreLink ? AMDGPULibFunc::getOrInsertFunction(M, fInfo)
+                       : AMDGPULibFunc::getFunction(M, fInfo);
+}
+
+bool AMDGPULibCalls::parseFunctionName(const StringRef& FMangledName,
+                                    FuncInfo *FInfo) {
+  return AMDGPULibFunc::parse(FMangledName, *FInfo);
+}
+
+bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
+  if (auto Op = dyn_cast<FPMathOperator>(CI))
+    if (Op->hasUnsafeAlgebra())
+      return true;
+  const Function *F = CI->getParent()->getParent();
+  Attribute Attr = F->getFnAttribute("unsafe-fp-math");
+  return Attr.getValueAsString() == "true";
+}
+
+bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
+  return AllNative ||
+         std::find(UseNative.begin(), UseNative.end(), F) != UseNative.end();
+}
+
+void AMDGPULibCalls::initNativeFuncs() {
+  AllNative = useNativeFunc("all") ||
+              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
+               UseNative.begin()->empty());
+}
+
+bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
+  bool native_sin = useNativeFunc("sin");
+  bool native_cos = useNativeFunc("cos");
+
+  if (native_sin && native_cos) {
+    Module *M = aCI->getModule();
+    Value *opr0 = aCI->getArgOperand(0);
+
+    AMDGPULibFunc nf;
+    nf.Leads[0].ArgType = FInfo.Leads[0].ArgType;
+    nf.Leads[0].VectorSize = FInfo.Leads[0].VectorSize;
+
+    nf.setPrefix(AMDGPULibFunc::NATIVE);
+    nf.setId(AMDGPULibFunc::EI_SIN);
+    Constant *sinExpr = getFunction(M, nf);
+
+    nf.setPrefix(AMDGPULibFunc::NATIVE);
+    nf.setId(AMDGPULibFunc::EI_COS);
+    Constant *cosExpr = getFunction(M, nf);
+    if (sinExpr && cosExpr) {
+      Value *sinval = CallInst::Create(sinExpr, opr0, "splitsin", aCI);
+      Value *cosval = CallInst::Create(cosExpr, opr0, "splitcos", aCI);
+      new StoreInst(cosval, aCI->getArgOperand(1), aCI);
+
+      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
+                                          << " with native version of sin/cos");
+
+      replaceCall(sinval);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Redirect a library call to its NATIVE-prefixed variant; true if changed.
+bool AMDGPULibCalls::useNative(CallInst *aCI) {
+  CI = aCI;
+  Function *Callee = aCI->getCalledFunction();
+  FuncInfo FInfo;
+  // Bail out on indirect calls (null callee, as fold() does) and on calls
+  // that are prefixed, f64, lack a native variant, or were not requested.
+  if (!Callee || !parseFunctionName(Callee->getName(), &FInfo) ||
+      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
+      getArgType(FInfo) == AMDGPULibFunc::F64 ||
+      !HasNative(FInfo.getId()) || !useNativeFunc(FInfo.getName()))
+    return false;
+
+  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
+    return sincosUseNative(aCI, FInfo);
+
+  FInfo.setPrefix(AMDGPULibFunc::NATIVE);
+  Constant *F = getFunction(aCI->getModule(), FInfo);
+  if (!F)
+    return false;
+
+  aCI->setCalledFunction(F);
+  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
+                                      << " with native version");
+  return true;
+}
+
+// Try to simplify library call CI. Returns true if changed, false otherwise.
+bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
+  this->CI = CI;
+  Function *Callee = CI->getCalledFunction();
+
+  // Ignore indirect calls.
+  if (Callee == 0) return false;
+
+  FuncInfo FInfo;
+  if (!parseFunctionName(Callee->getName(), &FInfo))
+    return false;
+
+  // Further check the number of arguments to see if they match.
+  if (CI->getNumArgOperands() != FInfo.getNumArgs())
+    return false;
+
+  BasicBlock *BB = CI->getParent();
+  LLVMContext &Context = CI->getParent()->getContext();
+  IRBuilder<> B(Context);
+
+  // Insert new instructions immediately before the call.
+  B.SetInsertPoint(BB, CI->getIterator());
+
+  // Copy fast flags from the original call.
+  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
+    B.setFastMathFlags(FPOp->getFastMathFlags());
+
+  if (TDOFold(CI, FInfo))
+    return true;
+
+  // Under unsafe-math, evaluate calls if possible.
+  // According to Brian Sumner, we can do this for all f32 function calls
+  // using host's double function calls.
+  if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))
+    return true;
+
+  // Specialized optimizations for each function call
+  switch (FInfo.getId()) {
+  case AMDGPULibFunc::EI_RECIP:
+    // skip vector function
+    assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
+             FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
+            "recip must be an either native or half function");
+    return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo);
+
+  case AMDGPULibFunc::EI_DIVIDE:
+    // skip vector function
+    assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
+             FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
+            "divide must be an either native or half function");
+    return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo);
+
+  case AMDGPULibFunc::EI_POW:
+  case AMDGPULibFunc::EI_POWR:
+  case AMDGPULibFunc::EI_POWN:
+    return fold_pow(CI, B, FInfo);
+
+  case AMDGPULibFunc::EI_ROOTN:
+    // skip vector function
+    return (getVecSize(FInfo) != 1) ? false : fold_rootn(CI, B, FInfo);
+
+  case AMDGPULibFunc::EI_FMA:
+  case AMDGPULibFunc::EI_MAD:
+  case AMDGPULibFunc::EI_NFMA:
+    // skip vector function
+    return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
+
+  case AMDGPULibFunc::EI_SQRT:
+    return isUnsafeMath(CI) && fold_sqrt(CI, B, FInfo);
+  case AMDGPULibFunc::EI_COS:
+  case AMDGPULibFunc::EI_SIN:
+    if ((getArgType(FInfo) == AMDGPULibFunc::F32 ||
+         getArgType(FInfo) == AMDGPULibFunc::F64)
+        && (FInfo.getPrefix() == AMDGPULibFunc::NOPFX))
+      return fold_sincos(CI, B, AA);
+
+    break;
+
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
+  // Table-Driven optimization
+  const TableRef tr = getOptTable(FInfo.getId());
+  if (tr.size==0)
+    return false;
+
+  int const sz = (int)tr.size;
+  const TableEntry * const ftbl = tr.table;
+  Value *opr0 = CI->getArgOperand(0);
+
+  if (getVecSize(FInfo) > 1) {
+    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
+      SmallVector<double, 0> DVal;
+      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
+        ConstantFP *eltval = dyn_cast<ConstantFP>(
+                               CV->getElementAsConstant((unsigned)eltNo));
+        assert(eltval && "Non-FP arguments in math function!");
+        bool found = false;
+        for (int i=0; i < sz; ++i) {
+          if (eltval->isExactlyValue(ftbl[i].input)) {
+            DVal.push_back(ftbl[i].result);
+            found = true;
+            break;
+          }
+        }
+        if (!found) {
+          // This vector constants not handled yet.
+          return false;
+        }
+      }
+      LLVMContext &context = CI->getParent()->getParent()->getContext();
+      Constant *nval;
+      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+        SmallVector<float, 0> FVal;
+        for (unsigned i = 0; i < DVal.size(); ++i) {
+          FVal.push_back((float)DVal[i]);
+        }
+        ArrayRef<float> tmp(FVal);
+        nval = ConstantDataVector::get(context, tmp);
+      } else { // F64
+        ArrayRef<double> tmp(DVal);
+        nval = ConstantDataVector::get(context, tmp);
+      }
+      DEBUG(errs() << "AMDIC: " << *CI
+                   << " ---> " << *nval << "\n");
+      replaceCall(nval);
+      return true;
+    }
+  } else {
+    // Scalar version
+    if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
+      for (int i = 0; i < sz; ++i) {
+        if (CF->isExactlyValue(ftbl[i].input)) {
+          Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
+          DEBUG(errs() << "AMDIC: " << *CI
+                       << " ---> " << *nval << "\n");
+          replaceCall(nval);
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
+  Module *M = CI->getModule();
+  if (getArgType(FInfo) != AMDGPULibFunc::F32 ||
+      FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
+      !HasNative(FInfo.getId()))
+    return false;
+
+  AMDGPULibFunc nf = FInfo;
+  nf.setPrefix(AMDGPULibFunc::NATIVE);
+  if (Constant *FPExpr = getFunction(M, nf)) {
+    DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
+
+    CI->setCalledFunction(FPExpr);
+
+    DEBUG(dbgs() << *CI << '\n');
+
+    return true;
+  }
+  return false;
+}
+
+//  [native_]half_recip(c) ==> 1.0/c
+bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
+                                const FuncInfo &FInfo) {
+  Value *opr0 = CI->getArgOperand(0);
+  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
+    // Just create a normal div. Later, InstCombine will be able
+    // to compute the divide into a constant (avoid check float infinity
+    // or subnormal at this point).
+    Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
+                               opr0,
+                               "recip2div");
+    DEBUG(errs() << "AMDIC: " << *CI
+                 << " ---> " << *nval << "\n");
+    replaceCall(nval);
+    return true;
+  }
+  return false;
+}
+
+//  native/half divide(x, c) ==> x * (1.0 / c)
+bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
+                                 const FuncInfo &FInfo) {
+  Value *opr0 = CI->getArgOperand(0);
+  Value *opr1 = CI->getArgOperand(1);
+  ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
+  ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
+
+  if ((CF0 && CF1) ||  // both are constants
+      (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32)))
+      // CF1 is constant && f32 divide
+  {
+    Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
+                                opr1, "__div2recip");
+    Value *nval  = B.CreateFMul(opr0, nval1, "__div2mul");
+    replaceCall(nval);
+    return true;
+  }
+  return false;
+}
+
+namespace llvm {
+static double log2(double V) {
+#if _XOPEN_SOURCE >= 600 || _ISOC99_SOURCE || _POSIX_C_SOURCE >= 200112L
+  return ::log2(V);
+#else
+  return log(V) / 0.693147180559945309417;
+#endif
+}
+}
+
+bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
+                              const FuncInfo &FInfo) {
+  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
+          FInfo.getId() == AMDGPULibFunc::EI_POWR ||
+          FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
+         "fold_pow: encounter a wrong function call");
+
+  Value *opr0, *opr1;
+  ConstantFP *CF;
+  ConstantInt *CINT;
+  ConstantAggregateZero *CZero;
+  Type *eltType;
+
+  opr0 = CI->getArgOperand(0);
+  opr1 = CI->getArgOperand(1);
+  CZero = dyn_cast<ConstantAggregateZero>(opr1);
+  if (getVecSize(FInfo) == 1) {
+    eltType = opr0->getType();
+    CF = dyn_cast<ConstantFP>(opr1);
+    CINT = dyn_cast<ConstantInt>(opr1);
+  } else {
+    VectorType *VTy = dyn_cast<VectorType>(opr0->getType());
+    assert(VTy && "Oprand of vector function should be of vectortype");
+    eltType = VTy->getElementType();
+    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1);
+
+    // Now, only Handle vector const whose elements have the same value.
+    CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr;
+    CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr;
+  }
+
+  // No unsafe math , no constant argument, do nothing
+  if (!isUnsafeMath(CI) && !CF && !CINT && !CZero)
+    return false;
+
+  // 0x1111111 means that we don't do anything for this call.
+  int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
+
+  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
+    //  pow/powr/pown(x, 0) == 1
+    DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
+    Constant *cnval = ConstantFP::get(eltType, 1.0);
+    if (getVecSize(FInfo) > 1) {
+      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+    }
+    replaceCall(cnval);
+    return true;
+  }
+  if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
+    // pow/powr/pown(x, 1.0) = x
+    DEBUG(errs() << "AMDIC: " << *CI
+                 << " ---> " << *opr0 << "\n");
+    replaceCall(opr0);
+    return true;
+  }
+  if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
+    // pow/powr/pown(x, 2.0) = x*x
+    DEBUG(errs() << "AMDIC: " << *CI
+                 << " ---> " << *opr0 << " * " << *opr0 << "\n");
+    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
+    replaceCall(nval);
+    return true;
+  }
+  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
+    // pow/powr/pown(x, -1.0) = 1.0/x
+    DEBUG(errs() << "AMDIC: " << *CI
+                 << " ---> 1 / " << *opr0 << "\n");
+    Constant *cnval = ConstantFP::get(eltType, 1.0);
+    if (getVecSize(FInfo) > 1) {
+      cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+    }
+    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
+    replaceCall(nval);
+    return true;
+  }
+
+  Module *M = CI->getModule();
+  if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
+    // pow[r](x, [-]0.5) = sqrt(x)
+    bool issqrt = CF->isExactlyValue(0.5);
+    if (Constant *FPExpr = getFunction(M,
+        AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
+                             : AMDGPULibFunc::EI_RSQRT, FInfo))) {
+      DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+                   << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
+      Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
+                                                        : "__pow2rsqrt");
+      replaceCall(nval);
+      return true;
+    }
+  }
+
+  if (!isUnsafeMath(CI))
+    return false;
+
+  // Unsafe Math optimization
+
+  // Remember that ci_opr1 is set if opr1 is integral
+  if (CF) {
+    double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
+                    ? (double)CF->getValueAPF().convertToFloat()
+                    : CF->getValueAPF().convertToDouble();
+    int ival = (int)dval;
+    if ((double)ival == dval) {
+      ci_opr1 = ival;
+    } else
+      ci_opr1 = 0x11111111;
+  }
+
+  // pow/powr/pown(x, c) = [1/](x*x*..x); where
+  //   trunc(c) == c && the number of x == c && |c| <= 12
+  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
+  if (abs_opr1 <= 12) {
+    Constant *cnval;
+    Value *nval;
+    if (abs_opr1 == 0) {
+      cnval = ConstantFP::get(eltType, 1.0);
+      if (getVecSize(FInfo) > 1) {
+        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+      }
+      nval = cnval;
+    } else {
+      Value *valx2 = nullptr;
+      nval = nullptr;
+      while (abs_opr1 > 0) {
+        valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
+        if (abs_opr1 & 1) {
+          nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
+        }
+        abs_opr1 >>= 1;
+      }
+    }
+
+    if (ci_opr1 < 0) {
+      cnval = ConstantFP::get(eltType, 1.0);
+      if (getVecSize(FInfo) > 1) {
+        cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
+      }
+      nval = B.CreateFDiv(cnval, nval, "__1powprod");
+    }
+    DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+                 <<  ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0 << ")\n");
+    replaceCall(nval);
+    return true;
+  }
+
+  // powr ---> exp2(y * log2(x))
+  // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
+  Constant *ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2,
+                                                   FInfo));
+  if (!ExpExpr)
+    return false;
+
+  bool needlog = false;
+  bool needabs = false;
+  bool needcopysign = false;
+  Constant *cnval = nullptr;
+  if (getVecSize(FInfo) == 1) {
+    CF = dyn_cast<ConstantFP>(opr0);
+
+    if (CF) {
+      double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
+                   ? (double)CF->getValueAPF().convertToFloat()
+                   : CF->getValueAPF().convertToDouble();
+
+      V = log2(std::abs(V));
+      cnval = ConstantFP::get(eltType, V);
+      needcopysign = (FInfo.getId() != AMDGPULibFunc::EI_POWR) &&
+                     CF->isNegative();
+    } else {
+      needlog = true;
+      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
+                               (!CF || CF->isNegative());
+    }
+  } else {
+    ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
+
+    if (!CDV) {
+      needlog = true;
+      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
+    } else {
+      assert ((int)CDV->getNumElements() == getVecSize(FInfo) &&
+              "Wrong vector size detected");
+
+      SmallVector<double, 0> DVal;
+      for (int i=0; i < getVecSize(FInfo); ++i) {
+        double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
+                     ? (double)CDV->getElementAsFloat(i)
+                     : CDV->getElementAsDouble(i);
+        if (V < 0.0) needcopysign = true;
+        V = log2(std::abs(V));
+        DVal.push_back(V);
+      }
+      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+        SmallVector<float, 0> FVal;
+        for (unsigned i=0; i < DVal.size(); ++i) {
+          FVal.push_back((float)DVal[i]);
+        }
+        ArrayRef<float> tmp(FVal);
+        cnval = ConstantDataVector::get(M->getContext(), tmp);
+      } else {
+        ArrayRef<double> tmp(DVal);
+        cnval = ConstantDataVector::get(M->getContext(), tmp);
+      }
+    }
+  }
+
+  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
+    // We cannot handle corner cases for a general pow() function, give up
+    // unless y is a constant integral value. Then proceed as if it were pown.
+    if (getVecSize(FInfo) == 1) {
+      if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) {
+        double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
+                   ? (double)CF->getValueAPF().convertToFloat()
+                   : CF->getValueAPF().convertToDouble();
+        if (y != (double)(int64_t)y)
+          return false;
+      } else
+        return false;
+    } else {
+      if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1)) {
+        for (int i=0; i < getVecSize(FInfo); ++i) {
+          double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
+                     ? (double)CDV->getElementAsFloat(i)
+                     : CDV->getElementAsDouble(i);
+          if (y != (double)(int64_t)y)
+            return false;
+        }
+      } else
+        return false;
+    }
+  }
+
+  Value *nval;
+  if (needabs) {
+    Constant *AbsExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS,
+                                                     FInfo));
+    if (!AbsExpr)
+      return false;
+    nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
+  } else {
+    nval = cnval ? cnval : opr0;
+  }
+  if (needlog) {
+    Constant *LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2,
+                                                     FInfo));
+    if (!LogExpr)
+      return false;
+    nval = CreateCallEx(B,LogExpr, nval, "__log2");
+  }
+
+  if (FInfo.getId() == AMDGPULibFunc::EI_POWN) {
+    // convert int(32) to fp(f32 or f64)
+    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
+  }
+  nval = B.CreateFMul(opr1, nval, "__ylogx");
+  nval = CreateCallEx(B,ExpExpr, nval, "__exp2");
+
+  if (needcopysign) {
+    Value *opr_n;
+    Type* rTy = opr0->getType();
+    Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty();
+    Type *nTy = nTyS;
+    if (const VectorType *vTy = dyn_cast<VectorType>(rTy))
+      nTy = VectorType::get(nTyS, vTy->getNumElements());
+    unsigned size = nTy->getScalarSizeInBits();
+    opr_n = CI->getArgOperand(1);
+    if (opr_n->getType()->isIntegerTy())
+      opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou");
+    else
+      opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
+
+    Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
+    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
+    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
+    nval = B.CreateBitCast(nval, opr0->getType());
+  }
+
+  DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+               << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
+  replaceCall(nval);
+
+  return true;
+}
+
+// Simplify rootn(x, n) when n is an integer constant:
+//   n ==  1  ->  x
+//   n ==  2  ->  sqrt(x)
+//   n ==  3  ->  cbrt(x)
+//   n == -1  ->  1.0 / x
+//   n == -2  ->  rsqrt(x)
+// Returns true if the call was replaced. Returns false if n is not a
+// ConstantInt, has any other value, or getFunction() cannot provide the
+// library function needed for the replacement.
+bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
+                                const FuncInfo &FInfo) {
+  Value *opr0 = CI->getArgOperand(0);
+  Value *opr1 = CI->getArgOperand(1);
+
+  ConstantInt *CINT = dyn_cast<ConstantInt>(opr1);
+  if (!CINT) {
+    return false;
+  }
+  int ci_opr1 = (int)CINT->getSExtValue();
+  if (ci_opr1 == 1) {  // rootn(x, 1) = x
+    DEBUG(errs() << "AMDIC: " << *CI
+                 << " ---> " << *opr0 << "\n");
+    replaceCall(opr0);
+    return true;
+  }
+
+  // The remaining library-call replacements all look the callee up in the
+  // module that contains the call.
+  Module *M = CI->getModule();
+  if (ci_opr1 == 2) {  // rootn(x, 2) = sqrt(x)
+    if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT,
+                                                        FInfo))) {
+      DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
+      Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
+      replaceCall(nval);
+      return true;
+    }
+  } else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
+    if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT,
+                                                        FInfo))) {
+      DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
+      Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
+      replaceCall(nval);
+      return true;
+    }
+  } else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
+    DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
+    Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
+                               opr0,
+                               "__rootn2div");
+    replaceCall(nval);
+    return true;
+  } else if (ci_opr1 == -2) {  // rootn(x, -2) = rsqrt(x)
+    if (Constant *FPExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT,
+                                                        FInfo))) {
+      DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0 << ")\n");
+      Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
+      replaceCall(nval);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Simplify fma/mad(a, b, c) when one of the operands is a scalar FP constant:
+//   a == 0 or b == 0  ->  c
+//   a == 1            ->  b + c
+//   b == 1            ->  a + c
+//   c == 0            ->  a * b
+// The rules are tried in that order. Returns true if the call was replaced.
+bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
+                                  const FuncInfo &FInfo) {
+  Value *MulLHS = CI->getArgOperand(0);
+  Value *MulRHS = CI->getArgOperand(1);
+  Value *Addend = CI->getArgOperand(2);
+
+  ConstantFP *LHSConst = dyn_cast<ConstantFP>(MulLHS);
+  ConstantFP *RHSConst = dyn_cast<ConstantFP>(MulRHS);
+
+  // fma/mad(a, b, c) = c if a=0 || b=0
+  const bool LHSIsZero = LHSConst && LHSConst->isZero();
+  const bool RHSIsZero = RHSConst && RHSConst->isZero();
+  if (LHSIsZero || RHSIsZero) {
+    DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *Addend << "\n");
+    replaceCall(Addend);
+    return true;
+  }
+
+  // fma/mad(a, b, c) = b+c if a=1, otherwise a+c if b=1. A multiplier of
+  // exactly one leaves only the other multiplicand to be added to c.
+  Value *Survivor = nullptr;
+  if (LHSConst && LHSConst->isExactlyValue(1.0f))
+    Survivor = MulRHS;
+  else if (RHSConst && RHSConst->isExactlyValue(1.0f))
+    Survivor = MulLHS;
+  if (Survivor) {
+    DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+                 << *Survivor << " + " << *Addend << "\n");
+    replaceCall(B.CreateFAdd(Survivor, Addend, "fmaadd"));
+    return true;
+  }
+
+  // fma/mad(a, b, c) = a*b if c=0
+  ConstantFP *AddendConst = dyn_cast<ConstantFP>(Addend);
+  if (!AddendConst || !AddendConst->isZero())
+    return false;
+
+  DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+               << *MulLHS << " * " << *MulRHS << "\n");
+  replaceCall(B.CreateFMul(MulLHS, MulRHS, "fmamul"));
+  return true;
+}
+
+// Get a scalar native builtin single argument FP function: look up, through
+// getFunction(), a copy of FInfo whose prefix has been set to NATIVE.
+Constant* AMDGPULibCalls::getNativeFunction(Module* M, const FuncInfo& FInfo) {
+  FuncInfo NativeInfo = FInfo;
+  NativeInfo.setPrefix(AMDGPULibFunc::NATIVE);
+  return getFunction(M, NativeInfo);
+}
+
+// fold sqrt -> native_sqrt (x)
+//
+// Applies only to scalar f32/f64 calls that are not already NATIVE-prefixed.
+// Returns true if the call was replaced, false if the call does not qualify
+// or getNativeFunction() yields no callee.
+bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
+                               const FuncInfo &FInfo) {
+  const bool IsFloatOrDouble = getArgType(FInfo) == AMDGPULibFunc::F32 ||
+                               getArgType(FInfo) == AMDGPULibFunc::F64;
+  if (!IsFloatOrDouble || getVecSize(FInfo) != 1 ||
+      FInfo.getPrefix() == AMDGPULibFunc::NATIVE)
+    return false;
+
+  Constant *NativeSqrt = getNativeFunction(
+      CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo));
+  if (!NativeSqrt)
+    return false;
+
+  Value *Arg = CI->getArgOperand(0);
+  DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+               << "sqrt(" << *Arg << ")\n");
+  replaceCall(CreateCallEx(B,NativeSqrt, Arg, "__sqrt"));
+  return true;
+}
+
+// fold sin, cos -> sincos.
+//
+// CI is a call to sin or cos (asserted below). The function looks for a call
+// to the complementary function (cos for sin, sin for cos) that uses the same
+// argument value, sits in the same basic block, and is found within MaxScan
+// instructions before CI. If such a call exists and a sincos function can be
+// obtained, both calls are erased: the sincos return value replaces the sin
+// call, and a load from a new entry-block alloca (whose address is passed as
+// the second sincos argument) replaces the cos call.
+//
+// Returns true if the pair was merged, false otherwise.
+// NOTE(review): the load-forwarding step below may already have modified the
+// IR even when this function goes on to return false.
+bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
+                                 AliasAnalysis *AA) {
+  AMDGPULibFunc fInfo;
+  if (!AMDGPULibFunc::parse(CI->getCalledFunction()->getName(), fInfo))
+    return false;
+
+  assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
+         fInfo.getId() == AMDGPULibFunc::EI_COS);
+  bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
+
+  Value *CArgVal = CI->getArgOperand(0);
+  BasicBlock * const CBB = CI->getParent();
+
+  // Upper bound on the number of instructions examined, both by the load
+  // forwarding below and by the backwards search for the paired call.
+  int const MaxScan = 30;
+
+  { // fold in load value.
+    // If the argument is a load in CI's block and an equivalent value is
+    // already available, use that value instead so that both calls of a pair
+    // end up referring to the same Value.
+    LoadInst *LI = dyn_cast<LoadInst>(CArgVal);
+    if (LI && LI->getParent() == CBB) {
+      BasicBlock::iterator BBI = LI->getIterator();
+      Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA);
+      if (AvailableVal) {
+        CArgVal->replaceAllUsesWith(AvailableVal);
+        if (CArgVal->getNumUses() == 0)
+          LI->eraseFromParent();
+        // Re-read the operand: it now refers to the forwarded value.
+        CArgVal = CI->getArgOperand(0);
+      }
+    }
+  }
+
+  Module *M = CI->getModule();
+  // Switch fInfo to the complementary function and mangle it to get the name
+  // of the callee we are searching for.
+  fInfo.setId(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN);
+  std::string const PairName = fInfo.mangle();
+
+  // Among the users of the argument, find a call to PairName in the same
+  // block that precedes CI by at most MaxScan instructions.
+  CallInst *UI = nullptr;
+  for (User* U : CArgVal->users()) {
+    CallInst *XI = dyn_cast_or_null<CallInst>(U);
+    if (!XI || XI == CI || XI->getParent() != CBB)
+      continue;
+
+    Function *UCallee = XI->getCalledFunction();
+    if (!UCallee || !UCallee->getName().equals(PairName))
+      continue;
+
+    BasicBlock::iterator BBI = CI->getIterator();
+    // CI is the first instruction of its block: nothing can precede it.
+    if (BBI == CI->getParent()->begin())
+      break;
+    --BBI;
+    // Walk backwards from the instruction just before CI. The loop condition
+    // stops the walk once BBI reaches the block's first instruction, so that
+    // first instruction itself is never compared against XI.
+    for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
+      if (cast<Instruction>(BBI) == XI) {
+        UI = XI;
+        break;
+      }
+    }
+    if (UI) break;
+  }
+
+  if (!UI) return false;
+
+  // Merge the sin and cos.
+
+  // For OpenCL 2.0 we have only a generic implementation of the sincos
+  // function.
+  AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
+  nf.Leads[0].PtrKind = AMDGPULibFunc::GENERIC;
+  Function *Fsincos = dyn_cast_or_null<Function>(getFunction(M, nf));
+  if (!Fsincos) return false;
+
+  // insertAlloca() moves the builder to the entry block, so remember the
+  // current insert point first and then continue emitting code before UI.
+  BasicBlock::iterator ItOld = B.GetInsertPoint();
+  AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
+  B.SetInsertPoint(UI);
+
+  Value *P = Alloc;
+  Type *PTy = Fsincos->getFunctionType()->getParamType(1);
+  // The AllocaInst allocates the memory in the private address space. It needs
+  // to be cast to the address space of the sincos pointer parameter.
+  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
+  const AMDGPUAS AS = AMDGPU::getAMDGPUAS(*M);
+  if (PTy->getPointerAddressSpace() != AS.PRIVATE_ADDRESS)
+    P = B.CreateAddrSpaceCast(Alloc, PTy);
+  CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
+
+  DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI
+               << ") with " << *Call << "\n");
+
+  if (!isSin) { // CI->cos, UI->sin
+    // The sin result is the sincos return value; the cos result is reloaded
+    // from the alloca at the builder's original insert point.
+    B.SetInsertPoint(&*ItOld);
+    UI->replaceAllUsesWith(&*Call);
+    Instruction *Reload = B.CreateLoad(Alloc);
+    CI->replaceAllUsesWith(Reload);
+    UI->eraseFromParent();
+    CI->eraseFromParent();
+  } else { // CI->sin, UI->cos
+    // The builder is still positioned before UI, so the reload of the cos
+    // result is emitted there, after the sincos call created above.
+    Instruction *Reload = B.CreateLoad(Alloc);
+    UI->replaceAllUsesWith(Reload);
+    CI->replaceAllUsesWith(Call);
+    UI->eraseFromParent();
+    CI->eraseFromParent();
+  }
+  return true;
+}
+
+// Get insertion point at entry: an iterator to the first instruction of the
+// entry block of the function that contains UI.
+BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
+  BasicBlock *EntryBB = &UI->getParent()->getParent()->getEntryBlock();
+  assert(EntryBB && "Entry block not found!");
+  BasicBlock::iterator FirstInst = EntryBB->begin();
+  assert(&*FirstInst && "Entry instruction not found!");
+  return FirstInst;
+}
+
+// Insert an AllocaInst at the beginning of the function entry block.
+//
+// The allocated type is the return type of UI's callee, the name is prefix
+// followed by UI's name, and the alignment is set to the type's alloc size
+// according to the module's DataLayout.
+//
+// Side effect: the insert point of B is left at the entry block's first
+// instruction; callers that need to keep emitting code elsewhere must reset
+// it themselves.
+AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
+                                         const char *prefix) {
+  BasicBlock::iterator ItNew = getEntryIns(UI);
+  Function *UCallee = UI->getCalledFunction();
+  Type *RetType = UCallee->getReturnType();
+  B.SetInsertPoint(&*ItNew);
+  // No array size: a single element of RetType is allocated.
+  AllocaInst *Alloc = B.CreateAlloca(RetType, nullptr,
+    std::string(prefix) + UI->getName());
+  Alloc->setAlignment(UCallee->getParent()->getDataLayout()
+                       .getTypeAllocSize(RetType));
+  return Alloc;
+}
+
+// Evaluate one scalar library math function on constant operands using the
+// host's double-precision math routines.
+//
+//   FInfo          - describes the function (id and argument type).
+//   Res0           - receives the result.
+//   Res1           - receives the second result; written only for EI_SINCOS
+//                    (the cosine).
+//   copr0..copr2   - the operands; any of them may be null. Operands that are
+//                    not ConstantFP are read as 0.0 unless the specific case
+//                    below handles them itself (the integer operand of
+//                    pown/rootn).
+//
+// Returns true if the function id is supported and a result was produced.
+bool AMDGPULibCalls::evaluateScalarMathFunc(FuncInfo &FInfo,
+                                            double& Res0, double& Res1,
+                                            Constant *copr0, Constant *copr1,
+                                            Constant *copr2) {
+  // By default, opr0/opr1/opr2 hold values of float/double type.
+  // If an operand is not float/double, the function that uses it has to
+  // handle that operand separately.
+  double opr0=0.0, opr1=0.0, opr2=0.0;
+  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
+  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
+  ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2);
+  if (fpopr0) {
+    opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
+             ? fpopr0->getValueAPF().convertToDouble()
+             : (double)fpopr0->getValueAPF().convertToFloat();
+  }
+
+  if (fpopr1) {
+    opr1 = (getArgType(FInfo) == AMDGPULibFunc::F64)
+             ? fpopr1->getValueAPF().convertToDouble()
+             : (double)fpopr1->getValueAPF().convertToFloat();
+  }
+
+  if (fpopr2) {
+    opr2 = (getArgType(FInfo) == AMDGPULibFunc::F64)
+             ? fpopr2->getValueAPF().convertToDouble()
+             : (double)fpopr2->getValueAPF().convertToFloat();
+  }
+
+  switch (FInfo.getId()) {
+  default : return false;
+
+  case AMDGPULibFunc::EI_ACOS:
+    Res0 = acos(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_ACOSH:
+    // acosh(x) == log(x + sqrt(x*x - 1))
+    Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
+    return true;
+
+  case AMDGPULibFunc::EI_ACOSPI:
+    Res0 = acos(opr0) / MATH_PI;
+    return true;
+
+  case AMDGPULibFunc::EI_ASIN:
+    Res0 = asin(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_ASINH:
+    // asinh(x) == log(x + sqrt(x*x + 1))
+    Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
+    return true;
+
+  case AMDGPULibFunc::EI_ASINPI:
+    Res0 = asin(opr0) / MATH_PI;
+    return true;
+
+  case AMDGPULibFunc::EI_ATAN:
+    Res0 = atan(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_ATANH:
+    // atanh(x) == (log(1+x) - log(1-x))/2, defined for |x| < 1.
+    // Note the second term is log(1-x), not log(x-1): the latter is NaN over
+    // the whole domain of atanh.
+    Res0 = (log(1.0 + opr0) - log(1.0 - opr0))/2.0;
+    return true;
+
+  case AMDGPULibFunc::EI_ATANPI:
+    Res0 = atan(opr0) / MATH_PI;
+    return true;
+
+  case AMDGPULibFunc::EI_CBRT:
+    // pow() cannot take a fractional power of a negative base, so compute
+    // the root of |x| and restore the sign.
+    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
+    return true;
+
+  case AMDGPULibFunc::EI_COS:
+    Res0 = cos(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_COSH:
+    Res0 = cosh(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_COSPI:
+    Res0 = cos(MATH_PI * opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_EXP:
+    Res0 = exp(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_EXP2:
+    Res0 = pow(2.0, opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_EXP10:
+    Res0 = pow(10.0, opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_EXPM1:
+    Res0 = exp(opr0) - 1.0;
+    return true;
+
+  case AMDGPULibFunc::EI_LOG:
+    Res0 = log(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_LOG2:
+    Res0 = log(opr0) / log(2.0);
+    return true;
+
+  case AMDGPULibFunc::EI_LOG10:
+    Res0 = log(opr0) / log(10.0);
+    return true;
+
+  case AMDGPULibFunc::EI_RSQRT:
+    Res0 = 1.0 / sqrt(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_SIN:
+    Res0 = sin(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_SINH:
+    Res0 = sinh(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_SINPI:
+    Res0 = sin(MATH_PI * opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_SQRT:
+    Res0 = sqrt(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_TAN:
+    Res0 = tan(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_TANH:
+    Res0 = tanh(opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_TANPI:
+    Res0 = tan(MATH_PI * opr0);
+    return true;
+
+  case AMDGPULibFunc::EI_RECIP:
+    Res0 = 1.0 / opr0;
+    return true;
+
+  // two-arg functions
+  case AMDGPULibFunc::EI_DIVIDE:
+    Res0 = opr0 / opr1;
+    return true;
+
+  case AMDGPULibFunc::EI_POW:
+  case AMDGPULibFunc::EI_POWR:
+    Res0 = pow(opr0, opr1);
+    return true;
+
+  case AMDGPULibFunc::EI_POWN: {
+    // The exponent is an integer operand, so it is not covered by opr1.
+    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+      double val = (double)iopr1->getSExtValue();
+      Res0 = pow(opr0, val);
+      return true;
+    }
+    return false;
+  }
+
+  case AMDGPULibFunc::EI_ROOTN: {
+    // The root index is an integer operand, so it is not covered by opr1.
+    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+      int64_t n = iopr1->getSExtValue();
+      double val = (double)n;
+      // An odd root of a negative number is real (rootn(-8, 3) == -2), but
+      // pow() returns NaN for a negative base with a fractional exponent.
+      // Take the root of |x| and restore the sign, as EI_CBRT does above.
+      if (opr0 < 0.0 && (n % 2) != 0)
+        Res0 = -pow(-opr0, 1.0 / val);
+      else
+        Res0 = pow(opr0, 1.0 / val);
+      return true;
+    }
+    return false;
+  }
+
+  // with ptr arg
+  case AMDGPULibFunc::EI_SINCOS:
+    Res0 = sin(opr0);
+    Res1 = cos(opr0);
+    return true;
+
+  // three-arg functions
+  case AMDGPULibFunc::EI_FMA:
+  case AMDGPULibFunc::EI_MAD:
+    Res0 = opr0 * opr1 + opr2;
+    return true;
+  }
+
+  return false;
+}
+
+// Constant-fold a library call whose operands are constants.
+//
+// Each lane is evaluated with evaluateScalarMathFunc(); the result is turned
+// back into a scalar ConstantFP or a ConstantDataVector and handed to
+// replaceCall(). For sincos the second result is additionally stored through
+// the call's pointer operand.
+//
+// Returns false, leaving the call untouched, if there are more than three
+// operands, a required operand is not a Constant, or the function is not
+// supported by evaluateScalarMathFunc().
+//
+// NOTE(review): the result type and LLVMContext are taken from the member CI
+// rather than from the aCI parameter, as in the original code; this assumes
+// both refer to the same call - confirm against the caller.
+bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
+  int numArgs = (int)aCI->getNumArgOperands();
+  if (numArgs > 3)
+    return false;
+
+  Constant *copr0 = nullptr;
+  Constant *copr1 = nullptr;
+  Constant *copr2 = nullptr;
+  if (numArgs > 0) {
+    if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
+      return false;
+  }
+
+  if (numArgs > 1) {
+    // The second operand of sincos is the output pointer, which does not
+    // have to be a constant.
+    if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {
+      if (FInfo.getId() != AMDGPULibFunc::EI_SINCOS)
+        return false;
+    }
+  }
+
+  if (numArgs > 2) {
+    if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr)
+      return false;
+  }
+
+  // At this point, all arguments to aCI are constants, except possibly the
+  // pointer operand of sincos.
+
+  // max vector size is 16, and sincos will generate two results.
+  double DVal0[16], DVal1[16];
+  bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
+  if (getVecSize(FInfo) == 1) {
+    if (!evaluateScalarMathFunc(FInfo, DVal0[0],
+                                DVal1[0], copr0, copr1, copr2)) {
+      return false;
+    }
+  } else {
+    ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
+    ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
+    ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2);
+    for (int i=0; i < getVecSize(FInfo); ++i) {
+      Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
+      Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
+      Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr;
+      if (!evaluateScalarMathFunc(FInfo, DVal0[i],
+                                  DVal1[i], celt0, celt1, celt2)) {
+        return false;
+      }
+    }
+  }
+
+  LLVMContext &context = CI->getParent()->getParent()->getContext();
+  Constant *nval0 = nullptr, *nval1 = nullptr;
+  if (getVecSize(FInfo) == 1) {
+    nval0 = ConstantFP::get(CI->getType(), DVal0[0]);
+    if (hasTwoResults)
+      nval1 = ConstantFP::get(CI->getType(), DVal1[0]);
+  } else {
+    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
+      SmallVector <float, 0> FVal0, FVal1;
+      for (int i=0; i < getVecSize(FInfo); ++i)
+        FVal0.push_back((float)DVal0[i]);
+      ArrayRef<float> tmp0(FVal0);
+      nval0 = ConstantDataVector::get(context, tmp0);
+      if (hasTwoResults) {
+        for (int i=0; i < getVecSize(FInfo); ++i)
+          FVal1.push_back((float)DVal1[i]);
+        ArrayRef<float> tmp1(FVal1);
+        nval1 = ConstantDataVector::get(context, tmp1);
+      }
+    } else {
+      // Only the first getVecSize(FInfo) entries of the 16-element scratch
+      // arrays were computed. Binding an ArrayRef to the whole array would
+      // produce a 16-element constant (with uninitialized tail lanes)
+      // instead of one matching the call's vector width.
+      ArrayRef<double> tmp0(DVal0, getVecSize(FInfo));
+      nval0 = ConstantDataVector::get(context, tmp0);
+      if (hasTwoResults) {
+        ArrayRef<double> tmp1(DVal1, getVecSize(FInfo));
+        nval1 = ConstantDataVector::get(context, tmp1);
+      }
+    }
+  }
+
+  if (hasTwoResults) {
+    // sincos
+    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
+           "math function with ptr arg not supported yet");
+    // Store the second result through the pointer operand, before the call.
+    new StoreInst(nval1, aCI->getArgOperand(1), aCI);
+  }
+
+  replaceCall(nval0);
+  return true;
+}
+
+// Public interface to the Simplify LibCalls pass: returns a newly allocated
+// AMDGPUSimplifyLibCalls instance.
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass() {
+  return new AMDGPUSimplifyLibCalls();
+}
+
+// Public interface to the Use Native Calls pass: returns a newly allocated
+// AMDGPUUseNativeCalls instance.
+FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
+  return new AMDGPUUseNativeCalls();
+}
+
+// Offer every direct call in F to Simplifier.fold(). Returns true if at
+// least one call was folded.
+bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  DEBUG(dbgs() << "AMDIC: process function ";
+        F.printAsOperand(dbgs(), false, F.getParent());
+        dbgs() << '\n';);
+
+  bool Changed = false;
+  for (auto &BB : F) {
+    BasicBlock::iterator Next = BB.begin();
+    BasicBlock::iterator End = BB.end();
+    while (Next != End) {
+      // Advance past the instruction before it is examined, so the iterator
+      // stays usable whatever fold() does with the call itself.
+      CallInst *CI = dyn_cast<CallInst>(Next);
+      ++Next;
+
+      // Ignore non-calls and indirect calls.
+      if (!CI || !CI->getCalledFunction())
+        continue;
+
+      DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
+            dbgs().flush());
+      Changed |= Simplifier.fold(CI, AA);
+    }
+  }
+  return Changed;
+}
+
+// Offer every direct call in F to Simplifier.useNative(). Does nothing when
+// the function is skipped or UseNative is empty. Returns true if at least
+// one call was changed.
+bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
+  if (skipFunction(F) || UseNative.empty())
+    return false;
+
+  bool Changed = false;
+  for (auto &BB : F) {
+    BasicBlock::iterator Next = BB.begin();
+    BasicBlock::iterator End = BB.end();
+    while (Next != End) {
+      // Advance past the instruction before it is examined, so the iterator
+      // stays usable whatever useNative() does with the call itself.
+      CallInst *CI = dyn_cast<CallInst>(Next);
+      ++Next;
+
+      // Ignore non-calls and indirect calls.
+      if (!CI || !CI->getCalledFunction())
+        continue;
+
+      Changed |= Simplifier.useNative(CI);
+    }
+  }
+  return Changed;
+}

Added: llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.cpp?rev=310731&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.cpp Fri Aug 11 09:42:09 2017
@@ -0,0 +1,928 @@
+//===-- AMDGPULibFunc.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file contains utility functions to work with Itanium mangled names
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULibFunc.h"
+#include <llvm/ADT/SmallString.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringSwitch.h>
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include <llvm/Support/raw_ostream.h>
+#include <climits>
+#include <string>
+
+using namespace llvm;
+
+namespace {
+
+// Rules stored in ManglingRule::Param. Each one tells
+// ParamIterator::getNextParam() how to produce one formal argument type.
+// EX_ rules name a fixed type; the remaining E_ rules start from a 'lead'
+// argument type and modify it.
+enum EManglingParam {
+    E_NONE,             // zero value: end of the parameter list
+    EX_EVENT,           // ArgType EVENT
+    EX_FLOAT4,          // F32, vector size 4
+    EX_INTV4,           // I32, vector size 4
+    EX_RESERVEDID,      // not implemented yet (TBD in getNextParam)
+    EX_SAMPLER,         // ArgType SAMPLER
+    EX_SIZET,           // U64
+    EX_UINT,            // U32
+    EX_UINTV4,          // U32, vector size 4
+    E_ANY,              // lead type, unchanged
+    E_CONSTPTR_ANY,     // lead type with CONST added to the pointer kind
+    E_CONSTPTR_SWAPGL,  // lead with GLOBAL and LOCAL exchanged, plus CONST
+    E_COPY,             // lead type, unchanged (handled like E_ANY)
+    E_IMAGECOORDS,      // by-value I32 vector sized from the lead image type
+    E_POINTEE,          // lead type passed by value
+    E_SETBASE_I32,      // lead with ArgType replaced by I32
+    E_SETBASE_U32,      // lead with ArgType replaced by U32
+    E_MAKEBASE_UNS,     // lead with the base type bits replaced by UINT
+    E_V16_OF_POINTEE,   // lead by value, vector size 16
+    E_V2_OF_POINTEE,    // lead by value, vector size 2
+    E_V3_OF_POINTEE,    // lead by value, vector size 3
+    E_V4_OF_POINTEE,    // lead by value, vector size 4
+    E_V8_OF_POINTEE,    // lead by value, vector size 8
+    E_VLTLPTR_ANY,      // lead type with VOLATILE added to the pointer kind
+};
+
+// One row of the mangling table: the unmangled function name, the one-based
+// positions of its 'lead' arguments and the rule for each formal argument.
+// Kept as a plain aggregate so the table can be brace-initialized.
+struct ManglingRule {
+   StringRef const Name;
+   unsigned char Lead[2];   // one-based argument positions, 0 means unused
+   unsigned char Param[5];  // EManglingParam values, zero terminated if short
+
+   // Largest lead position, i.e. how many arguments must be parsed to see
+   // every lead.
+   int maxLeadIndex() const { return Lead[0] < Lead[1] ? Lead[1] : Lead[0]; }
+
+   // Number of lead slots that are in use.
+   int getNumLeads() const { return int(Lead[0] != 0) + int(Lead[1] != 0); }
+
+   unsigned getNumArgs() const;
+};
+
+// Count the parameter rules that precede the first zero entry; the count is
+// capped by the size of the Param array.
+unsigned ManglingRule::getNumArgs() const {
+   const unsigned Capacity = sizeof Param/sizeof Param[0];
+   unsigned Count = 0;
+   for (; Count < Capacity; ++Count) {
+     if (!Param[Count])
+       break;
+   }
+   return Count;
+}
+
+// This table describes function formal argument type rules. The order of rules
+// corresponds to the EFuncId enum at AMDGPULibFunc.h
+//
+// "<func name>", { <leads> }, { <param rules> }
+// where:
+//  <leads> - list of integers that are one-based indexes of formal argument
+//    used to mangle a function name. Other argument types are derived from types
+//    of these 'leads'. The order of integers in this list correspond to the
+//    order in which these arguments are mangled in the EDG mangling scheme. The
+//    same order should be preserved for arguments in the AMDGPULibFunc structure
+//    when it is used for mangling. For example:
+//    { "vstorea_half", {3,1}, {E_ANY,EX_SIZET,E_ANY}},
+//    will be mangled in EDG scheme as  vstorea_half_<3rdparam>_<1stparam>
+//    When mangling from code use:
+//    AMDGPULibFunc insc;
+//    insc.param[0] = ... // describe 3rd parameter
+//    insc.param[1] = ... // describe 1st parameter
+//
+// <param rules> - list of rules used to derive all of the function formal
+//    argument types. EX_ prefixed rules are simple types; the others are
+//    derived from the latest 'lead' argument type in the order of encoding,
+//    from first to last.
+//    E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of
+//    prev lead type, etc. See ParamIterator::getNextParam() for details.
+
+// One rule per known library function, indexed by function id. Row 0 is a
+// placeholder with an empty name and no leads or parameters; parseName()
+// treats a lookup result equal to EI_NONE as "not found" (presumably EI_NONE
+// is 0 - confirm against AMDGPULibFunc.h).
+static const ManglingRule manglingRules[] = {
+{ StringRef(), {0}, {0} },
+{ "abs"                             , {1},   {E_ANY}},
+{ "abs_diff"                        , {1},   {E_ANY,E_COPY}},
+{ "acos"                            , {1},   {E_ANY}},
+{ "acosh"                           , {1},   {E_ANY}},
+{ "acospi"                          , {1},   {E_ANY}},
+{ "add_sat"                         , {1},   {E_ANY,E_COPY}},
+{ "all"                             , {1},   {E_ANY}},
+{ "any"                             , {1},   {E_ANY}},
+{ "asin"                            , {1},   {E_ANY}},
+{ "asinh"                           , {1},   {E_ANY}},
+{ "asinpi"                          , {1},   {E_ANY}},
+{ "async_work_group_copy"           , {1},   {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_EVENT}},
+{ "async_work_group_strided_copy"   , {1},   {E_ANY,E_CONSTPTR_SWAPGL,EX_SIZET,EX_SIZET,EX_EVENT}},
+{ "atan"                            , {1},   {E_ANY}},
+{ "atan2"                           , {1},   {E_ANY,E_COPY}},
+{ "atan2pi"                         , {1},   {E_ANY,E_COPY}},
+{ "atanh"                           , {1},   {E_ANY}},
+{ "atanpi"                          , {1},   {E_ANY}},
+{ "atomic_add"                      , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_and"                      , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_cmpxchg"                  , {1},   {E_VLTLPTR_ANY,E_POINTEE,E_POINTEE}},
+{ "atomic_dec"                      , {1},   {E_VLTLPTR_ANY}},
+{ "atomic_inc"                      , {1},   {E_VLTLPTR_ANY}},
+{ "atomic_max"                      , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_min"                      , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_or"                       , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_sub"                      , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_xchg"                     , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "atomic_xor"                      , {1},   {E_VLTLPTR_ANY,E_POINTEE}},
+{ "bitselect"                       , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "cbrt"                            , {1},   {E_ANY}},
+{ "ceil"                            , {1},   {E_ANY}},
+{ "clamp"                           , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "clz"                             , {1},   {E_ANY}},
+{ "commit_read_pipe"                , {1},   {E_ANY,EX_RESERVEDID}},
+{ "commit_write_pipe"               , {1},   {E_ANY,EX_RESERVEDID}},
+{ "copysign"                        , {1},   {E_ANY,E_COPY}},
+{ "cos"                             , {1},   {E_ANY}},
+{ "cosh"                            , {1},   {E_ANY}},
+{ "cospi"                           , {1},   {E_ANY}},
+{ "cross"                           , {1},   {E_ANY,E_COPY}},
+{ "ctz"                             , {1},   {E_ANY}},
+{ "degrees"                         , {1},   {E_ANY}},
+{ "distance"                        , {1},   {E_ANY,E_COPY}},
+{ "divide"                          , {1},   {E_ANY,E_COPY}},
+{ "dot"                             , {1},   {E_ANY,E_COPY}},
+{ "erf"                             , {1},   {E_ANY}},
+{ "erfc"                            , {1},   {E_ANY}},
+{ "exp"                             , {1},   {E_ANY}},
+{ "exp10"                           , {1},   {E_ANY}},
+{ "exp2"                            , {1},   {E_ANY}},
+{ "expm1"                           , {1},   {E_ANY}},
+{ "fabs"                            , {1},   {E_ANY}},
+{ "fast_distance"                   , {1},   {E_ANY,E_COPY}},
+{ "fast_length"                     , {1},   {E_ANY}},
+{ "fast_normalize"                  , {1},   {E_ANY}},
+{ "fdim"                            , {1},   {E_ANY,E_COPY}},
+{ "floor"                           , {1},   {E_ANY}},
+{ "fma"                             , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "fmax"                            , {1},   {E_ANY,E_COPY}},
+{ "fmin"                            , {1},   {E_ANY,E_COPY}},
+{ "fmod"                            , {1},   {E_ANY,E_COPY}},
+{ "fract"                           , {2},   {E_POINTEE,E_ANY}},
+{ "frexp"                           , {1,2}, {E_ANY,E_ANY}},
+{ "get_image_array_size"            , {1},   {E_ANY}},
+{ "get_image_channel_data_type"     , {1},   {E_ANY}},
+{ "get_image_channel_order"         , {1},   {E_ANY}},
+{ "get_image_dim"                   , {1},   {E_ANY}},
+{ "get_image_height"                , {1},   {E_ANY}},
+{ "get_image_width"                 , {1},   {E_ANY}},
+{ "get_pipe_max_packets"            , {1},   {E_ANY}},
+{ "get_pipe_num_packets"            , {1},   {E_ANY}},
+{ "hadd"                            , {1},   {E_ANY,E_COPY}},
+{ "hypot"                           , {1},   {E_ANY,E_COPY}},
+{ "ilogb"                           , {1},   {E_ANY}},
+{ "isequal"                         , {1},   {E_ANY,E_COPY}},
+{ "isfinite"                        , {1},   {E_ANY}},
+{ "isgreater"                       , {1},   {E_ANY,E_COPY}},
+{ "isgreaterequal"                  , {1},   {E_ANY,E_COPY}},
+{ "isinf"                           , {1},   {E_ANY}},
+{ "isless"                          , {1},   {E_ANY,E_COPY}},
+{ "islessequal"                     , {1},   {E_ANY,E_COPY}},
+{ "islessgreater"                   , {1},   {E_ANY,E_COPY}},
+{ "isnan"                           , {1},   {E_ANY}},
+{ "isnormal"                        , {1},   {E_ANY}},
+{ "isnotequal"                      , {1},   {E_ANY,E_COPY}},
+{ "isordered"                       , {1},   {E_ANY,E_COPY}},
+{ "isunordered"                     , {1},   {E_ANY,E_COPY}},
+{ "ldexp"                           , {1},   {E_ANY,E_SETBASE_I32}},
+{ "length"                          , {1},   {E_ANY}},
+{ "lgamma"                          , {1},   {E_ANY}},
+{ "lgamma_r"                        , {1,2}, {E_ANY,E_ANY}},
+{ "log"                             , {1},   {E_ANY}},
+{ "log10"                           , {1},   {E_ANY}},
+{ "log1p"                           , {1},   {E_ANY}},
+{ "log2"                            , {1},   {E_ANY}},
+{ "logb"                            , {1},   {E_ANY}},
+{ "mad"                             , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "mad24"                           , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "mad_hi"                          , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "mad_sat"                         , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "max"                             , {1},   {E_ANY,E_COPY}},
+{ "maxmag"                          , {1},   {E_ANY,E_COPY}},
+{ "min"                             , {1},   {E_ANY,E_COPY}},
+{ "minmag"                          , {1},   {E_ANY,E_COPY}},
+{ "mix"                             , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "modf"                            , {2},   {E_POINTEE,E_ANY}},
+{ "mul24"                           , {1},   {E_ANY,E_COPY}},
+{ "mul_hi"                          , {1},   {E_ANY,E_COPY}},
+{ "nan"                             , {1},   {E_ANY}},
+{ "nextafter"                       , {1},   {E_ANY,E_COPY}},
+{ "normalize"                       , {1},   {E_ANY}},
+{ "popcount"                        , {1},   {E_ANY}},
+{ "pow"                             , {1},   {E_ANY,E_COPY}},
+{ "pown"                            , {1},   {E_ANY,E_SETBASE_I32}},
+{ "powr"                            , {1},   {E_ANY,E_COPY}},
+{ "prefetch"                        , {1},   {E_CONSTPTR_ANY,EX_SIZET}},
+{ "radians"                         , {1},   {E_ANY}},
+{ "read_pipe"                       , {4},   {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
+{ "recip"                           , {1},   {E_ANY}},
+{ "remainder"                       , {1},   {E_ANY,E_COPY}},
+{ "remquo"                          , {1,3}, {E_ANY,E_COPY,E_ANY}},
+{ "reserve_read_pipe"               , {1},   {E_ANY,EX_UINT}},
+{ "reserve_write_pipe"              , {1},   {E_ANY,EX_UINT}},
+{ "rhadd"                           , {1},   {E_ANY,E_COPY}},
+{ "rint"                            , {1},   {E_ANY}},
+{ "rootn"                           , {1},   {E_ANY,E_SETBASE_I32}},
+{ "rotate"                          , {1},   {E_ANY,E_COPY}},
+{ "round"                           , {1},   {E_ANY}},
+{ "rsqrt"                           , {1},   {E_ANY}},
+{ "select"                          , {1,3}, {E_ANY,E_COPY,E_ANY}},
+{ "shuffle"                         , {1,2}, {E_ANY,E_ANY}},
+{ "shuffle2"                        , {1,3}, {E_ANY,E_COPY,E_ANY}},
+{ "sign"                            , {1},   {E_ANY}},
+{ "signbit"                         , {1},   {E_ANY}},
+{ "sin"                             , {1},   {E_ANY}},
+{ "sincos"                          , {2},   {E_POINTEE,E_ANY}},
+{ "sinh"                            , {1},   {E_ANY}},
+{ "sinpi"                           , {1},   {E_ANY}},
+{ "smoothstep"                      , {1},   {E_ANY,E_COPY,E_COPY}},
+{ "sqrt"                            , {1},   {E_ANY}},
+{ "step"                            , {1},   {E_ANY,E_COPY}},
+{ "sub_group_broadcast"             , {1},   {E_ANY,EX_UINT}},
+{ "sub_group_commit_read_pipe"      , {1},   {E_ANY,EX_RESERVEDID}},
+{ "sub_group_commit_write_pipe"     , {1},   {E_ANY,EX_RESERVEDID}},
+{ "sub_group_reduce_add"            , {1},   {E_ANY}},
+{ "sub_group_reduce_max"            , {1},   {E_ANY}},
+{ "sub_group_reduce_min"            , {1},   {E_ANY}},
+{ "sub_group_reserve_read_pipe"     , {1},   {E_ANY,EX_UINT}},
+{ "sub_group_reserve_write_pipe"    , {1},   {E_ANY,EX_UINT}},
+{ "sub_group_scan_exclusive_add"    , {1},   {E_ANY}},
+{ "sub_group_scan_exclusive_max"    , {1},   {E_ANY}},
+{ "sub_group_scan_exclusive_min"    , {1},   {E_ANY}},
+{ "sub_group_scan_inclusive_add"    , {1},   {E_ANY}},
+{ "sub_group_scan_inclusive_max"    , {1},   {E_ANY}},
+{ "sub_group_scan_inclusive_min"    , {1},   {E_ANY}},
+{ "sub_sat"                         , {1},   {E_ANY,E_COPY}},
+{ "tan"                             , {1},   {E_ANY}},
+{ "tanh"                            , {1},   {E_ANY}},
+{ "tanpi"                           , {1},   {E_ANY}},
+{ "tgamma"                          , {1},   {E_ANY}},
+{ "trunc"                           , {1},   {E_ANY}},
+{ "upsample"                        , {1},   {E_ANY,E_MAKEBASE_UNS}},
+{ "vec_step"                        , {1},   {E_ANY}},
+{ "vstore"                          , {3},   {E_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore16"                        , {3},   {E_V16_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore2"                         , {3},   {E_V2_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore3"                         , {3},   {E_V3_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore4"                         , {3},   {E_V4_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "vstore8"                         , {3},   {E_V8_OF_POINTEE,EX_SIZET,E_ANY}},
+{ "work_group_commit_read_pipe"     , {1},   {E_ANY,EX_RESERVEDID}},
+{ "work_group_commit_write_pipe"    , {1},   {E_ANY,EX_RESERVEDID}},
+{ "work_group_reduce_add"           , {1},   {E_ANY}},
+{ "work_group_reduce_max"           , {1},   {E_ANY}},
+{ "work_group_reduce_min"           , {1},   {E_ANY}},
+{ "work_group_reserve_read_pipe"    , {1},   {E_ANY,EX_UINT}},
+{ "work_group_reserve_write_pipe"   , {1},   {E_ANY,EX_UINT}},
+{ "work_group_scan_exclusive_add"   , {1},   {E_ANY}},
+{ "work_group_scan_exclusive_max"   , {1},   {E_ANY}},
+{ "work_group_scan_exclusive_min"   , {1},   {E_ANY}},
+{ "work_group_scan_inclusive_add"   , {1},   {E_ANY}},
+{ "work_group_scan_inclusive_max"   , {1},   {E_ANY}},
+{ "work_group_scan_inclusive_min"   , {1},   {E_ANY}},
+{ "write_imagef"                    , {1},   {E_ANY,E_IMAGECOORDS,EX_FLOAT4}},
+{ "write_imagei"                    , {1},   {E_ANY,E_IMAGECOORDS,EX_INTV4}},
+{ "write_imageui"                   , {1},   {E_ANY,E_IMAGECOORDS,EX_UINTV4}},
+{ "write_pipe"                      , {4},   {E_COPY,EX_RESERVEDID,EX_UINT,E_ANY}},
+{ "ncos"                            , {1},   {E_ANY} },
+{ "nexp2"                           , {1},   {E_ANY} },
+{ "nfma"                            , {1},   {E_ANY, E_COPY, E_COPY} },
+{ "nlog2"                           , {1},   {E_ANY} },
+{ "nrcp"                            , {1},   {E_ANY} },
+{ "nrsqrt"                          , {1},   {E_ANY} },
+{ "nsin"                            , {1},   {E_ANY} },
+{ "nsqrt"                           , {1},   {E_ANY} },
+{ "ft­z"                             , {1},   {E_ANY} },
+{ "fldexp"                          , {1},   {E_ANY, EX_UINT} },
+{ "class"                           , {1},   {E_ANY, EX_UINT} },
+{ "rcbrt"                           , {1},   {E_ANY} },
+};
+
+// Lookup table from unmangled function name to its row in manglingRules.
+// It is filled once, when this static object is constructed.
+static const struct ManglingRulesMap : public StringMap<int> {
+  ManglingRulesMap()
+    : StringMap<int>(sizeof(manglingRules)/sizeof(manglingRules[0])) {
+    const int NumRules = sizeof(manglingRules)/sizeof(manglingRules[0]);
+    for (int Id = 0; Id != NumRules; ++Id)
+      insert({ manglingRules[Id].Name, Id });
+  }
+} manglingRulesMap;
+
+// Derive the return type descriptor of function \p id from its leads. The
+// result is the first lead; for sincos it is additionally forced to be
+// passed by value.
+static AMDGPULibFunc::Param getRetType(AMDGPULibFunc::EFuncId id,
+                                       const AMDGPULibFunc::Param (&Leads)[2]) {
+  AMDGPULibFunc::Param Ret = Leads[0];
+  // TBD - This special casing may need to be extended for other intrinsics
+  if (id == AMDGPULibFunc::EI_SINCOS)
+    Ret.PtrKind = AMDGPULibFunc::BYVALUE;
+  return Ret;
+}
+
+// Walks the parameter rules of one ManglingRule and produces the formal
+// argument type descriptor for each of them, derived from the two leads.
+class ParamIterator {
+  const AMDGPULibFunc::Param (&Leads)[2];
+  const ManglingRule& Rule;
+  int Index; // position of the next rule in Rule.Param
+public:
+  ParamIterator(const AMDGPULibFunc::Param (&leads)[2],
+                const ManglingRule& rule)
+    : Leads(leads), Rule(rule), Index(0) {}
+
+  AMDGPULibFunc::Param getNextParam();
+};
+
+// Return the descriptor for the next formal argument and advance.
+//
+// Past the end of the Param array a default-constructed Param is returned
+// and the position is not advanced. For E_NONE and EX_RESERVEDID the
+// default-constructed Param is returned as well. The callers in this file
+// stop iterating once the returned ArgType is 0 (presumably the value a
+// default-constructed Param has - confirm against AMDGPULibFunc.h).
+AMDGPULibFunc::Param ParamIterator::getNextParam() {
+  AMDGPULibFunc::Param P;
+  if (Index >= int(sizeof Rule.Param/sizeof Rule.Param[0])) return P;
+
+  const char R = Rule.Param[Index];
+  switch (R) {
+  case E_NONE:     break;
+  // EX_ rules: fixed types that do not depend on the leads.
+  case EX_UINT:
+    P.ArgType = AMDGPULibFunc::U32; break;
+  case EX_INTV4:
+    P.ArgType = AMDGPULibFunc::I32; P.VectorSize = 4; break;
+  case EX_UINTV4:
+    P.ArgType = AMDGPULibFunc::U32; P.VectorSize = 4; break;
+  case EX_FLOAT4:
+    P.ArgType = AMDGPULibFunc::F32; P.VectorSize = 4; break;
+  case EX_SIZET:
+    P.ArgType = AMDGPULibFunc::U64; break;
+  case EX_EVENT:
+    P.ArgType = AMDGPULibFunc::EVENT;   break;
+  case EX_SAMPLER:
+    P.ArgType = AMDGPULibFunc::SAMPLER; break;
+  case EX_RESERVEDID: break; // TBD
+  default:
+    // Derived rules start from a lead: the second lead when this argument
+    // is at the second lead's (one-based) position, the first lead otherwise.
+    if (Index == (Rule.Lead[1] - 1)) P = Leads[1];
+    else P = Leads[0];
+
+    switch (R) {
+    case E_ANY:
+    case E_COPY: break;
+
+    case E_POINTEE:
+      P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+    case E_V2_OF_POINTEE:
+      P.VectorSize = 2; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+    case E_V3_OF_POINTEE:
+      P.VectorSize = 3; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+    case E_V4_OF_POINTEE:
+      P.VectorSize = 4; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+    case E_V8_OF_POINTEE:
+      P.VectorSize = 8; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+    case E_V16_OF_POINTEE:
+      P.VectorSize = 16; P.PtrKind = AMDGPULibFunc::BYVALUE; break;
+    case E_CONSTPTR_ANY:
+      P.PtrKind |= AMDGPULibFunc::CONST; break;
+    case E_VLTLPTR_ANY:
+      P.PtrKind |= AMDGPULibFunc::VOLATILE; break;
+    case E_SETBASE_I32:
+      P.ArgType = AMDGPULibFunc::I32; break;
+    case E_SETBASE_U32:
+      P.ArgType = AMDGPULibFunc::U32; break;
+
+    case E_MAKEBASE_UNS:
+      // Replace only the base type bits, keeping the rest of ArgType.
+      P.ArgType &= ~AMDGPULibFunc::BASE_TYPE_MASK;
+      P.ArgType |= AMDGPULibFunc::UINT;
+      break;
+
+    case E_IMAGECOORDS:
+      // The coordinate vector size is selected by the image type of the
+      // lead; other lead types leave the vector size as it was.
+      switch (P.ArgType) {
+      case AMDGPULibFunc::IMG1DA: P.VectorSize = 2; break;
+      case AMDGPULibFunc::IMG1DB: P.VectorSize = 1; break;
+      case AMDGPULibFunc::IMG2DA: P.VectorSize = 4; break;
+      case AMDGPULibFunc::IMG1D:  P.VectorSize = 1; break;
+      case AMDGPULibFunc::IMG2D:  P.VectorSize = 2; break;
+      case AMDGPULibFunc::IMG3D:  P.VectorSize = 4; break;
+      }
+      P.PtrKind = AMDGPULibFunc::BYVALUE;
+      P.ArgType = AMDGPULibFunc::I32;
+      break;
+
+    case E_CONSTPTR_SWAPGL:
+      // Exchange GLOBAL and LOCAL. Note that in these two cases the whole
+      // PtrKind is overwritten before CONST is added.
+      switch (P.PtrKind & AMDGPULibFunc::ADDR_SPACE) {
+      case AMDGPULibFunc::GLOBAL: P.PtrKind = AMDGPULibFunc::LOCAL; break;
+      case AMDGPULibFunc::LOCAL:  P.PtrKind = AMDGPULibFunc::GLOBAL; break;
+      }
+      P.PtrKind |= AMDGPULibFunc::CONST;
+      break;
+
+    default: llvm_unreachable("Unhandeled param rule");
+    }
+  }
+  ++Index;
+  return P;
+}
+
+// Remove the first \p n characters of \p str in place.
+inline static void drop_front(StringRef& str, size_t n = 1) {
+  StringRef const Rest = str.drop_front(n);
+  str = Rest;
+}
+
+// Consume the single character \p c from the front of \p mangledName.
+//
+// Returns true and advances past the character when it is present. Returns
+// false and leaves the string untouched otherwise, including when the string
+// is empty: StringRef::front() must not be called on an empty string, and a
+// truncated mangled name can run out of characters in the middle of a
+// parameter.
+static bool eatTerm(StringRef& mangledName, const char c) {
+  if (mangledName.empty() || mangledName.front() != c)
+    return false;
+  drop_front(mangledName);
+  return true;
+}
+
+// Consume the string literal \p str (without its terminating NUL) from the
+// front of \p mangledName. Returns false and leaves the string untouched
+// when it does not start with the literal.
+template <size_t N>
+static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
+  StringRef const Prefix(str, N-1);
+  if (!mangledName.startswith(Prefix))
+    return false;
+  drop_front(mangledName, N-1);
+  return true;
+}
+
+// True for the ASCII decimal digits '0'..'9'.
+static inline bool isDigit(char c) { return '0' <= c && c <= '9'; }
+
+// Consume a run of decimal digits from the front of \p s.
+//
+// Returns the value of the digits, or -1 when \p s does not start with a
+// digit (in which case \p s is left untouched). The names parsed here come
+// from arbitrary symbol names, so an over-long digit run must not overflow
+// the signed accumulator: the value saturates at INT_MAX instead, while all
+// of the digits are still consumed.
+static int eatNumber(StringRef& s) {
+  size_t const savedSize = s.size();
+  int n = 0;
+  while (!s.empty() && isDigit(s.front())) {
+    int const digit = s.front() - '0';
+    if (n > (INT_MAX - digit) / 10)
+      n = INT_MAX; // n*10 + digit would not fit
+    else
+      n = n*10 + digit;
+    drop_front(s);
+  }
+  return s.size() < savedSize ? n : -1;
+}
+
+// Consume a "<length><name>" sequence from the front of \p mangledName and
+// return the name. An empty StringRef is returned when there is no positive
+// length or when fewer than <length> characters follow it; the digits that
+// were read stay consumed in that case.
+static StringRef eatLengthPrefixedName(StringRef& mangledName) {
+  int const Len = eatNumber(mangledName);
+  bool const Valid =
+    Len > 0 && static_cast<size_t>(Len) <= mangledName.size();
+  if (!Valid)
+    return StringRef();
+  StringRef const Ident = mangledName.slice(0, Len);
+  mangledName = mangledName.drop_front(Len);
+  return Ident;
+}
+
+} // end anonymous namespace
+
+/// Create a descriptor in the state established by reset().
+AMDGPULibFunc::AMDGPULibFunc() { reset(); }
+
+/// Create a descriptor for function \p id that takes its name prefix kind
+/// and both lead argument types from \p copyFrom.
+AMDGPULibFunc::AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom)
+  : FuncId(id) {
+  FKind = copyFrom.FKind;
+  for (unsigned I = 0; I != 2; ++I)
+    Leads[I] = copyFrom.Leads[I];
+}
+
+/// Return the descriptor to its empty state: no function id, no name prefix,
+/// both leads reset and no explicit name.
+void AMDGPULibFunc::reset() {
+  FuncId = EI_NONE;
+  FKind = NOPFX;
+  for (Param &Lead : Leads)
+    Lead.reset();
+  Name.clear();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Demangling
+
+// Consume a vector length from the front of \p mangledName. Returns the
+// length when it is one of 2, 3, 4, 8 or 16; returns 1 for a missing number
+// or any other value.
+static int parseVecSize(StringRef& mangledName) {
+  int const Len = eatNumber(mangledName);
+  bool const Supported =
+    Len == 2 || Len == 3 || Len == 4 || Len == 8 || Len == 16;
+  return Supported ? Len : 1;
+}
+
+// Recognize a leading "native_" or "half_" in \p mangledName. When one is
+// found it is removed from the name and the matching prefix kind is
+// returned; otherwise the name is left as is and NOPFX is returned.
+static AMDGPULibFunc::ENamePrefix parseNamePrefix(StringRef& mangledName) {
+  std::pair<StringRef, StringRef> const Parts = mangledName.split('_');
+
+  AMDGPULibFunc::ENamePrefix Pfx = AMDGPULibFunc::NOPFX;
+  if (Parts.first == "native")
+    Pfx = AMDGPULibFunc::NATIVE;
+  else if (Parts.first == "half")
+    Pfx = AMDGPULibFunc::HALF;
+
+  if (Pfx == AMDGPULibFunc::NOPFX)
+    return Pfx;
+
+  mangledName = Parts.second;
+  return Pfx;
+}
+
+/// Look \p fullName up in the table of known functions and store the result
+/// in FuncId. \returns true when the resulting id is not EI_NONE.
+bool AMDGPULibFunc::parseName(const StringRef& fullName) {
+  int const Id = manglingRulesMap.lookup(fullName);
+  FuncId = static_cast<EFuncId>(Id);
+  return !(FuncId == EI_NONE);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Itanium Demangling
+
+// Parses the parameter types of an Itanium mangled name one at a time.
+struct ItaniumParamParser {
+  // Vector size and type of the most recently parsed parameter; reused when
+  // a substitution ("S_" or "S<n>_") is encountered.
+  AMDGPULibFunc::Param Prev;
+  bool parseItaniumParam(StringRef& param, AMDGPULibFunc::Param &res);
+};
+
+// Parse one parameter from the front of \p param into \p res.
+//
+// A parameter is an optional pointer prefix ("P", then optional "K" and "V"
+// qualifiers and an optional "U3AS<n>" address space), an optional vector
+// prefix ("Dv<n>_") and a type. Returns false when the input is empty,
+// truncated or not understood. Every read of the next character is guarded
+// against running out of input, because StringRef::front() must not be
+// called on an empty string and the name being parsed may be cut short at
+// any point.
+bool ItaniumParamParser::parseItaniumParam(StringRef& param,
+                                           AMDGPULibFunc::Param &res) {
+  // Consume character C only when there is input left.
+  auto eatChar = [&param](char C) {
+    return !param.empty() && eatTerm(param, C);
+  };
+
+  res.reset();
+  if (param.empty()) return false;
+
+  // parse pointer prefix
+  if (eatChar('P')) {
+    if (eatChar('K')) res.PtrKind |= AMDGPULibFunc::CONST;
+    if (eatChar('V')) res.PtrKind |= AMDGPULibFunc::VOLATILE;
+    if (!eatTerm(param, "U3AS")) {
+      res.PtrKind |= AMDGPULibFunc::PRIVATE;
+    } else {
+      if (param.empty()) return false; // address space digit is missing
+      switch(param.front()) {
+      case '1': res.PtrKind |= AMDGPULibFunc::GLOBAL;  break;
+      case '2': res.PtrKind |= AMDGPULibFunc::READONLY;break;
+      case '3': res.PtrKind |= AMDGPULibFunc::LOCAL;   break;
+      case '4': res.PtrKind |= AMDGPULibFunc::GENERIC; break;
+      case '5': res.PtrKind |= AMDGPULibFunc::OTHER;   break;
+      default: return false;
+      }
+      drop_front(param, 1);
+    }
+  } else {
+    res.PtrKind = AMDGPULibFunc::BYVALUE;
+  }
+
+  // parse vector size
+  if (eatTerm(param,"Dv")) {
+    res.VectorSize = parseVecSize(param);
+    if (res.VectorSize==1 || !eatChar('_')) return false;
+  }
+
+  // parse type
+  if (param.empty()) return false; // prefixes without a type
+  char const TC = param.front();
+  if (::isDigit(TC)) {
+    res.ArgType = StringSwitch<AMDGPULibFunc::EType>
+      (eatLengthPrefixedName(param))
+      .Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
+      .Case("ocl_image1dbuffer", AMDGPULibFunc::IMG1DB)
+      .Case("ocl_image2darray" , AMDGPULibFunc::IMG2DA)
+      .Case("ocl_image1d"      , AMDGPULibFunc::IMG1D)
+      .Case("ocl_image2d"      , AMDGPULibFunc::IMG2D)
+      .Case("ocl_image3d"      , AMDGPULibFunc::IMG3D)
+      .Case("ocl_event"        , AMDGPULibFunc::DUMMY)
+      .Case("ocl_sampler"      , AMDGPULibFunc::DUMMY)
+      .Default(AMDGPULibFunc::DUMMY);
+  } else {
+    drop_front(param);
+    switch (TC) {
+    case 'h': res.ArgType =  AMDGPULibFunc::U8; break;
+    case 't': res.ArgType = AMDGPULibFunc::U16; break;
+    case 'j': res.ArgType = AMDGPULibFunc::U32; break;
+    case 'm': res.ArgType = AMDGPULibFunc::U64; break;
+    case 'c': res.ArgType =  AMDGPULibFunc::I8; break;
+    case 's': res.ArgType = AMDGPULibFunc::I16; break;
+    case 'i': res.ArgType = AMDGPULibFunc::I32; break;
+    case 'l': res.ArgType = AMDGPULibFunc::I64; break;
+    case 'f': res.ArgType = AMDGPULibFunc::F32; break;
+    case 'd': res.ArgType = AMDGPULibFunc::F64; break;
+    case 'D': if (!eatChar('h')) return false;
+              res.ArgType = AMDGPULibFunc::F16; break;
+    case 'S':
+      // Substitution: the sequence number is skipped and the previously
+      // parsed vector size and type are reused.
+      if (!eatChar('_')) {
+        eatNumber(param);
+        if (!eatChar('_')) return false;
+      }
+      res.VectorSize = Prev.VectorSize;
+      res.ArgType    = Prev.ArgType;
+      break;
+    default:;
+    }
+  }
+  // An unrecognized type character leaves ArgType at the value reset() gave
+  // it and is rejected here (presumably that value is 0 - confirm against
+  // AMDGPULibFunc.h).
+  if (res.ArgType == 0) return false;
+  Prev.VectorSize = res.VectorSize;
+  Prev.ArgType    = res.ArgType;
+  return true;
+}
+
+/// Parse the part of an Itanium mangled name that follows "_Z": the length
+/// prefixed function name and as many parameters as are needed to see every
+/// lead argument of the matching rule.
+///
+/// \returns false when the function is unknown or a parameter cannot be
+/// parsed.
+bool AMDGPULibFunc::parseItanuimName(StringRef& mangledName) {
+  StringRef BaseName = eatLengthPrefixedName(mangledName);
+  FKind = parseNamePrefix(BaseName);
+  if (!parseName(BaseName))
+    return false;
+
+  const ManglingRule& Rule = manglingRules[FuncId];
+  int const LastLead = Rule.maxLeadIndex();
+  ItaniumParamParser Parser;
+  // Lead positions are one-based, so the arguments are counted from 1.
+  for (int ArgNo = 1; ArgNo <= LastLead; ++ArgNo) {
+    Param P;
+    if (!Parser.parseItaniumParam(mangledName, P))
+      return false;
+
+    if (ArgNo == Rule.Lead[0]) Leads[0] = P;
+    if (ArgNo == Rule.Lead[1]) Leads[1] = P;
+  }
+  return true;
+}
+
+/// Parse \p mangledName into \p iInfo. \p iInfo is reset first. Only names
+/// that start with "_Z" are accepted.
+///
+/// \returns true when the name was recognized.
+bool AMDGPULibFunc::parse(StringRef mangledName, AMDGPULibFunc& iInfo) {
+  iInfo.reset();
+  bool const IsItanium =
+    !mangledName.empty() && eatTerm(mangledName, "_Z");
+  return IsItanium && iInfo.parseItanuimName(mangledName);
+}
+
+/// Return the length prefixed name that follows "_Z" in \p mangledName, or
+/// an empty StringRef when the name does not start with "_Z".
+StringRef AMDGPULibFunc::getUnmangledName(const StringRef& mangledName) {
+  StringRef Rest = mangledName;
+  if (!eatTerm(Rest, "_Z"))
+    return StringRef();
+  return eatLengthPrefixedName(Rest);
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Mangling
+
+/// Write the unmangled function name, including its "native_" or "half_"
+/// prefix, to \p OS. An explicitly set Name takes precedence over the table
+/// name of FuncId; nothing is written when neither is available.
+template <typename Stream>
+void AMDGPULibFunc::writeName(Stream& OS) const {
+  const char *Pfx = "";
+  if (FKind == NATIVE)
+    Pfx = "native_";
+  else if (FKind == HALF)
+    Pfx = "half_";
+
+  if (!Name.empty()) {
+    OS << Pfx << Name;
+    return;
+  }
+  if (FuncId == EI_NONE)
+    return;
+
+  const StringRef& S = manglingRules[FuncId].Name;
+  OS << Pfx;
+  OS.write(S.data(), S.size());
+}
+
+/// Return the mangled name of this function; the Itanium scheme is used.
+std::string AMDGPULibFunc::mangle() const {
+  std::string Mangled = mangleNameItanium();
+  return Mangled;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Itanium Mangling
+
+// Return the Itanium mangling of the scalar type \p T. Integer and floating
+// point types map to their builtin type codes; image, sampler and event
+// types are written as length prefixed names. Any other value is treated as
+// unreachable.
+static const char *getItaniumTypeName(AMDGPULibFunc::EType T) {
+  switch (T) {
+  case AMDGPULibFunc::U8:      return "h";
+  case AMDGPULibFunc::U16:     return "t";
+  case AMDGPULibFunc::U32:     return "j";
+  case AMDGPULibFunc::U64:     return "m";
+  case AMDGPULibFunc::I8:      return "c";
+  case AMDGPULibFunc::I16:     return "s";
+  case AMDGPULibFunc::I32:     return "i";
+  case AMDGPULibFunc::I64:     return "l";
+  case AMDGPULibFunc::F16:     return "Dh";
+  case AMDGPULibFunc::F32:     return "f";
+  case AMDGPULibFunc::F64:     return "d";
+  case AMDGPULibFunc::IMG1DA:  return "16ocl_image1darray";
+  case AMDGPULibFunc::IMG1DB:  return "17ocl_image1dbuffer";
+  case AMDGPULibFunc::IMG2DA:  return "16ocl_image2darray";
+  case AMDGPULibFunc::IMG1D:   return "11ocl_image1d";
+  case AMDGPULibFunc::IMG2D:   return "11ocl_image2d";
+  case AMDGPULibFunc::IMG3D:   return "11ocl_image3d";
+  case AMDGPULibFunc::SAMPLER: return "11ocl_sampler";
+  case AMDGPULibFunc::EVENT:   return "9ocl_event";
+  default: llvm_unreachable("Unhandeled param type");
+  }
+  return nullptr;
+}
+
+
+// Itanium mangling ABI says:
+// "5.1.8. Compression
+// ... Each non-terminal in the grammar for which <substitution> appears on the
+// right-hand side is both a source of future substitutions and a candidate
+// for being substituted. There are two exceptions that appear to be
+// substitution candidates from the grammar, but are explicitly excluded:
+// 1. <builtin-type> other than vendor extended types ..."
+
+// For the purpose of functions the following productions make sense for the
+// substitution:
+//  <type> ::= <builtin-type>
+//    ::= <class-enum-type>
+//    ::= <array-type>
+//    ::=<CV-qualifiers> <type>
+//    ::= P <type>                # pointer-to
+//    ::= <substitution>
+//
+// Note that while types like images, samplers and events are encoded by the
+// ABI using the <class-enum-type> production rule, they are not used for
+// substitution because clang considers them builtin types.
+//
+// DvNN_ type is GCC extension for vectors and is a subject for the substitution.
+
+
+// Writes the Itanium mangling of parameter types to a stream, one parameter
+// per call, while keeping the dictionary of substitution candidates that
+// the compression scheme requires.
+class ItaniumMangler {
+  SmallVector<AMDGPULibFunc::Param, 10> Str; // list of accumulated substitutions
+  bool  UseAddrSpace;
+
+  // Return the position of \p P in the substitution list, or -1 when it has
+  // not been seen yet.
+  int findSubst(const AMDGPULibFunc::Param& P) const {
+    for(unsigned I = 0; I < Str.size(); ++I) {
+      const AMDGPULibFunc::Param& T = Str[I];
+      if (P.PtrKind    == T.PtrKind &&
+          P.VectorSize == T.VectorSize &&
+          P.ArgType    == T.ArgType) {
+        return I;
+      }
+    }
+    return -1;
+  }
+
+  // When \p p has been seen before, write a reference to it and return
+  // true; otherwise write nothing and return false.
+  template <typename Stream>
+  bool trySubst(Stream& os, const AMDGPULibFunc::Param& p) {
+    int const subst = findSubst(p);
+    if (subst < 0) return false;
+    // Substitutions are mangled as S<seq-id>_ where the optional <seq-id>
+    // is a base 36 number written with digits and upper case letters:
+    // 0   1    2         10   11
+    // S_  S0_  S1_  ...  S9_  SA_
+    os << 'S';
+    if (subst > 0) {
+      static const char Digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+      char Buf[16]; // ample for any unsigned value in base 36
+      unsigned Len = 0;
+      unsigned Val = subst - 1;
+      do {
+        Buf[Len++] = Digits[Val % 36];
+        Val /= 36;
+      } while (Val != 0);
+      // The digits were produced least significant first.
+      while (Len != 0)
+        os << Buf[--Len];
+    }
+    os << '_';
+    return true;
+  }
+
+public:
+  ItaniumMangler(bool useAddrSpace)
+    : UseAddrSpace(useAddrSpace) {}
+
+  // Write the mangling of \p p to \p os and record new substitution
+  // candidates.
+  template <typename Stream>
+  void operator()(Stream& os, AMDGPULibFunc::Param p) {
+
+    // Itanium mangling ABI 5.1.8. Compression:
+    // Logically, the substitutable components of a mangled name are considered
+    // left-to-right, components before the composite structure of which they
+    // are a part. If a component has been encountered before, it is substituted
+    // as described below. This decision is independent of whether its components
+    // have been substituted, so an implementation may optimize by considering
+    // large structures for substitution before their components. If a component
+    // has not been encountered before, its mangling is identified, and it is
+    // added to a dictionary of substitution candidates. No entity is added to
+    // the dictionary twice.
+    AMDGPULibFunc::Param Ptr;
+
+    if (p.PtrKind) {
+      if (trySubst(os, p)) return;
+      os << 'P';
+      if (p.PtrKind & AMDGPULibFunc::CONST) os << 'K';
+      if (p.PtrKind & AMDGPULibFunc::VOLATILE) os << 'V';
+      int AS = UseAddrSpace ? (p.PtrKind & AMDGPULibFunc::ADDR_SPACE)-1 : 0;
+      if (AS != 0) os << "U3AS" << AS;
+      // Remember the whole pointer type; it becomes a candidate only after
+      // its pointee has been handled below.
+      Ptr = p;
+      p.PtrKind = 0;
+    }
+
+    // A vector type that was seen before is written as a substitution and
+    // needs no element type after it.
+    bool Substituted = false;
+    if (p.VectorSize > 1) {
+      if (trySubst(os, p)) {
+        Substituted = true;
+      } else {
+        Str.push_back(p);
+        os << "Dv" << static_cast<unsigned>(p.VectorSize) << '_';
+      }
+    }
+
+    if (!Substituted)
+      os << getItaniumTypeName((AMDGPULibFunc::EType)p.ArgType);
+
+    if (Ptr.ArgType) Str.push_back(Ptr);
+  }
+};
+
+/// Build the Itanium mangled name: "_Z", the length prefixed function name
+/// and then the mangling of every formal argument produced by the rule of
+/// FuncId.
+std::string AMDGPULibFunc::mangleNameItanium() const {
+  // Render the name first because its length has to be written before it.
+  SmallString<128> NameBuf;
+  raw_svector_ostream NameOS(NameBuf);
+  writeName(NameOS);
+  const StringRef& NameStr = NameOS.str();
+
+  SmallString<128> Buf;
+  raw_svector_ostream S(Buf);
+  S << "_Z" << static_cast<int>(NameStr.size()) << NameStr;
+
+  ItaniumMangler Mangler(true);
+  ParamIterator I(Leads, manglingRules[FuncId]);
+  // The iterator signals the end of the list with an ArgType of 0.
+  for (Param P = I.getNextParam(); P.ArgType != 0; P = I.getNextParam())
+    Mangler(S, P);
+  return S.str();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Misc
+
+// Build the LLVM type described by \p P in context \p C.
+//
+// The scalar type is selected by P.ArgType (signed and unsigned integers of
+// one width share an LLVM type), wrapped into a vector when P.VectorSize is
+// greater than 1, and finally turned into a pointer unless P is passed by
+// value. With \p useAddrSpace the pointer address space is the ADDR_SPACE
+// bits of P.PtrKind minus one; otherwise the default address space is used.
+static Type* getIntrinsicParamType(
+  LLVMContext& C,
+  const AMDGPULibFunc::Param& P,
+  bool useAddrSpace) {
+  Type* T = nullptr;
+  switch (P.ArgType) {
+  case AMDGPULibFunc::U8:
+  case AMDGPULibFunc::I8:   T = Type::getInt8Ty(C);   break;
+  case AMDGPULibFunc::U16:
+  case AMDGPULibFunc::I16:  T = Type::getInt16Ty(C);  break;
+  case AMDGPULibFunc::U32:
+  case AMDGPULibFunc::I32:  T = Type::getInt32Ty(C);  break;
+  case AMDGPULibFunc::U64:
+  case AMDGPULibFunc::I64:  T = Type::getInt64Ty(C);  break;
+  case AMDGPULibFunc::F16:  T = Type::getHalfTy(C);   break;
+  case AMDGPULibFunc::F32:  T = Type::getFloatTy(C);  break;
+  case AMDGPULibFunc::F64:  T = Type::getDoubleTy(C); break;
+
+  // Images, samplers and events are pointers to named structs.
+  // NOTE(review): StructType::create is called on every invocation, which
+  // looks like it makes a new struct type each time instead of reusing an
+  // existing one - confirm that this is intended.
+  case AMDGPULibFunc::IMG1DA:
+  case AMDGPULibFunc::IMG1DB:
+  case AMDGPULibFunc::IMG2DA:
+  case AMDGPULibFunc::IMG1D:
+  case AMDGPULibFunc::IMG2D:
+  case AMDGPULibFunc::IMG3D:
+    T = StructType::create(C,"ocl_image")->getPointerTo(); break;
+  case AMDGPULibFunc::SAMPLER:
+    T = StructType::create(C,"ocl_sampler")->getPointerTo(); break;
+  case AMDGPULibFunc::EVENT:
+    T = StructType::create(C,"ocl_event")->getPointerTo(); break;
+  default:
+    llvm_unreachable("Unhandeled param type");
+    return nullptr;
+  }
+  if (P.VectorSize > 1)
+    T = VectorType::get(T, P.VectorSize);
+  if (P.PtrKind != AMDGPULibFunc::BYVALUE)
+    T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
+                                       - 1)
+                     : T->getPointerTo();
+  return T;
+}
+
+FunctionType* AMDGPULibFunc::getFunctionType(Module& M) const { // Builds the IR function type for this library function in M's context.
+  LLVMContext& C = M.getContext();
+  std::vector<Type*> Args;
+  ParamIterator I(Leads, manglingRules[FuncId]);
+  Param P;
+  while ((P=I.getNextParam()).ArgType != 0) // A Param whose ArgType is 0 ends the list.
+    Args.push_back(getIntrinsicParamType(C, P, true)); // 'true': pointer types carry their address space.
+
+  return FunctionType::get(
+    getIntrinsicParamType(C, getRetType(FuncId, Leads), true), // Return type is derived by getRetType().
+    Args, false); // 'false': the function type is not variadic.
+}
+
+unsigned AMDGPULibFunc::getNumArgs() const { // Argument count as reported by this function's mangling-rule entry.
+  return manglingRules[FuncId].getNumArgs();
+}
+
+std::string AMDGPULibFunc::getName() const { // Returns the text produced by writeName() as a std::string.
+  SmallString<128> Buf;
+  raw_svector_ostream OS(Buf);
+  writeName(OS);
+  return OS.str();
+}
+
+Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc& fInfo) { // Looks up fInfo's mangled name in M; returns nullptr when no usable definition exists.
+  std::string FuncName = fInfo.mangle();
+  Function *F = dyn_cast_or_null<Function>(
+    M->getValueSymbolTable().lookup(FuncName)); // Null if the name is absent or is not a Function.
+
+  // Accept only a definition that is non-variadic with a matching arg count.
+  if (F && !F->isDeclaration()
+        && !F->isVarArg()
+        && F->arg_size() == fInfo.getNumArgs()) { // Only the count is compared, not the argument types.
+    return F;
+  }
+  return nullptr;
+}
+
+Function *AMDGPULibFunc::getOrInsertFunction(Module *M,
+                                             const AMDGPULibFunc& fInfo) {
+  // Returns a usable existing definition of fInfo's mangled name, or else the
+  // result of Module::getOrInsertFunction for that name and fInfo's type.
+  std::string const FuncName = fInfo.mangle();
+  Function *F = dyn_cast_or_null<Function>(
+    M->getValueSymbolTable().lookup(FuncName));
+
+  // Reuse only a defined, non-variadic function with the expected arg count.
+  if (F && !F->isDeclaration() && !F->isVarArg() &&
+      F->arg_size() == fInfo.getNumArgs())
+    return F;
+
+  FunctionType *FuncTy = fInfo.getFunctionType(*M);
+
+  bool hasPtr = false;
+  for (Type *ArgTy : FuncTy->params()) {
+    if (ArgTy->isPointerTy()) {
+      hasPtr = true;
+      break;
+    }
+  }
+
+  Constant *C = nullptr;
+  if (hasPtr) {
+    // Do not set extra attributes for functions with pointer arguments.
+    C = M->getOrInsertFunction(FuncName, FuncTy);
+  } else {
+    // AttributeList is immutable: addAttribute() returns the updated list, so
+    // its result must be assigned back or the attribute is silently dropped.
+    AttributeList Attr;
+    LLVMContext &Ctx = M->getContext();
+    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+                             Attribute::ReadOnly);
+    Attr = Attr.addAttribute(Ctx, AttributeList::FunctionIndex,
+                             Attribute::NoUnwind);
+    C = M->getOrInsertFunction(FuncName, FuncTy, Attr);
+  }
+
+  return cast<Function>(C); // cast<> asserts if C is not a Function.
+}

Added: llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.h?rev=310731&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.h (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPULibFunc.h Fri Aug 11 09:42:09 2017
@@ -0,0 +1,348 @@
+//===-- AMDGPULibFunc.h ---------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _AMDGPU_LIBFUNC_H_
+#define _AMDGPU_LIBFUNC_H_
+
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+
+class FunctionType;
+class Function;
+class Module;
+
+class AMDGPULibFunc {
+public:
+  enum EFuncId {
+    EI_NONE,
+
+    // IMPORTANT: the enumerators below must have consecutive values, each one
+    // greater than the previous, because they are used as indexes into the
+    // mangling rules table. Do not assign explicit values.
+    EI_ABS,
+    EI_ABS_DIFF,
+    EI_ACOS,
+    EI_ACOSH,
+    EI_ACOSPI,
+    EI_ADD_SAT,
+    EI_ALL,
+    EI_ANY,
+    EI_ASIN,
+    EI_ASINH,
+    EI_ASINPI,
+    EI_ASYNC_WORK_GROUP_COPY,
+    EI_ASYNC_WORK_GROUP_STRIDED_COPY,
+    EI_ATAN,
+    EI_ATAN2,
+    EI_ATAN2PI,
+    EI_ATANH,
+    EI_ATANPI,
+    EI_ATOMIC_ADD,
+    EI_ATOMIC_AND,
+    EI_ATOMIC_CMPXCHG,
+    EI_ATOMIC_DEC,
+    EI_ATOMIC_INC,
+    EI_ATOMIC_MAX,
+    EI_ATOMIC_MIN,
+    EI_ATOMIC_OR,
+    EI_ATOMIC_SUB,
+    EI_ATOMIC_XCHG,
+    EI_ATOMIC_XOR,
+    EI_BITSELECT,
+    EI_CBRT,
+    EI_CEIL,
+    EI_CLAMP,
+    EI_CLZ,
+    EI_COMMIT_READ_PIPE,
+    EI_COMMIT_WRITE_PIPE,
+    EI_COPYSIGN,
+    EI_COS,
+    EI_COSH,
+    EI_COSPI,
+    EI_CROSS,
+    EI_CTZ,
+    EI_DEGREES,
+    EI_DISTANCE,
+    EI_DIVIDE,
+    EI_DOT,
+    EI_ERF,
+    EI_ERFC,
+    EI_EXP,
+    EI_EXP10,
+    EI_EXP2,
+    EI_EXPM1,
+    EI_FABS,
+    EI_FAST_DISTANCE,
+    EI_FAST_LENGTH,
+    EI_FAST_NORMALIZE,
+    EI_FDIM,
+    EI_FLOOR,
+    EI_FMA,
+    EI_FMAX,
+    EI_FMIN,
+    EI_FMOD,
+    EI_FRACT,
+    EI_FREXP,
+    EI_GET_IMAGE_ARRAY_SIZE,
+    EI_GET_IMAGE_CHANNEL_DATA_TYPE,
+    EI_GET_IMAGE_CHANNEL_ORDER,
+    EI_GET_IMAGE_DIM,
+    EI_GET_IMAGE_HEIGHT,
+    EI_GET_IMAGE_WIDTH,
+    EI_GET_PIPE_MAX_PACKETS,
+    EI_GET_PIPE_NUM_PACKETS,
+    EI_HADD,
+    EI_HYPOT,
+    EI_ILOGB,
+    EI_ISEQUAL,
+    EI_ISFINITE,
+    EI_ISGREATER,
+    EI_ISGREATEREQUAL,
+    EI_ISINF,
+    EI_ISLESS,
+    EI_ISLESSEQUAL,
+    EI_ISLESSGREATER,
+    EI_ISNAN,
+    EI_ISNORMAL,
+    EI_ISNOTEQUAL,
+    EI_ISORDERED,
+    EI_ISUNORDERED,
+    EI_LDEXP,
+    EI_LENGTH,
+    EI_LGAMMA,
+    EI_LGAMMA_R,
+    EI_LOG,
+    EI_LOG10,
+    EI_LOG1P,
+    EI_LOG2,
+    EI_LOGB,
+    EI_MAD,
+    EI_MAD24,
+    EI_MAD_HI,
+    EI_MAD_SAT,
+    EI_MAX,
+    EI_MAXMAG,
+    EI_MIN,
+    EI_MINMAG,
+    EI_MIX,
+    EI_MODF,
+    EI_MUL24,
+    EI_MUL_HI,
+    EI_NAN,
+    EI_NEXTAFTER,
+    EI_NORMALIZE,
+    EI_POPCOUNT,
+    EI_POW,
+    EI_POWN,
+    EI_POWR,
+    EI_PREFETCH,
+    EI_RADIANS,
+    EI_READ_PIPE,
+    EI_RECIP,
+    EI_REMAINDER,
+    EI_REMQUO,
+    EI_RESERVE_READ_PIPE,
+    EI_RESERVE_WRITE_PIPE,
+    EI_RHADD,
+    EI_RINT,
+    EI_ROOTN,
+    EI_ROTATE,
+    EI_ROUND,
+    EI_RSQRT,
+    EI_SELECT,
+    EI_SHUFFLE,
+    EI_SHUFFLE2,
+    EI_SIGN,
+    EI_SIGNBIT,
+    EI_SIN,
+    EI_SINCOS,
+    EI_SINH,
+    EI_SINPI,
+    EI_SMOOTHSTEP,
+    EI_SQRT,
+    EI_STEP,
+    EI_SUB_GROUP_BROADCAST,
+    EI_SUB_GROUP_COMMIT_READ_PIPE,
+    EI_SUB_GROUP_COMMIT_WRITE_PIPE,
+    EI_SUB_GROUP_REDUCE_ADD,
+    EI_SUB_GROUP_REDUCE_MAX,
+    EI_SUB_GROUP_REDUCE_MIN,
+    EI_SUB_GROUP_RESERVE_READ_PIPE,
+    EI_SUB_GROUP_RESERVE_WRITE_PIPE,
+    EI_SUB_GROUP_SCAN_EXCLUSIVE_ADD,
+    EI_SUB_GROUP_SCAN_EXCLUSIVE_MAX,
+    EI_SUB_GROUP_SCAN_EXCLUSIVE_MIN,
+    EI_SUB_GROUP_SCAN_INCLUSIVE_ADD,
+    EI_SUB_GROUP_SCAN_INCLUSIVE_MAX,
+    EI_SUB_GROUP_SCAN_INCLUSIVE_MIN,
+    EI_SUB_SAT,
+    EI_TAN,
+    EI_TANH,
+    EI_TANPI,
+    EI_TGAMMA,
+    EI_TRUNC,
+    EI_UPSAMPLE,
+    EI_VEC_STEP,
+    EI_VSTORE,
+    EI_VSTORE16,
+    EI_VSTORE2,
+    EI_VSTORE3,
+    EI_VSTORE4,
+    EI_VSTORE8,
+    EI_WORK_GROUP_COMMIT_READ_PIPE,
+    EI_WORK_GROUP_COMMIT_WRITE_PIPE,
+    EI_WORK_GROUP_REDUCE_ADD,
+    EI_WORK_GROUP_REDUCE_MAX,
+    EI_WORK_GROUP_REDUCE_MIN,
+    EI_WORK_GROUP_RESERVE_READ_PIPE,
+    EI_WORK_GROUP_RESERVE_WRITE_PIPE,
+    EI_WORK_GROUP_SCAN_EXCLUSIVE_ADD,
+    EI_WORK_GROUP_SCAN_EXCLUSIVE_MAX,
+    EI_WORK_GROUP_SCAN_EXCLUSIVE_MIN,
+    EI_WORK_GROUP_SCAN_INCLUSIVE_ADD,
+    EI_WORK_GROUP_SCAN_INCLUSIVE_MAX,
+    EI_WORK_GROUP_SCAN_INCLUSIVE_MIN,
+    EI_WRITE_IMAGEF,
+    EI_WRITE_IMAGEI,
+    EI_WRITE_IMAGEUI,
+    EI_WRITE_PIPE,
+    EI_NCOS,
+    EI_NEXP2,
+    EI_NFMA,
+    EI_NLOG2,
+    EI_NRCP,
+    EI_NRSQRT,
+    EI_NSIN,
+    EI_NSQRT,
+    EI_FTZ,
+    EI_FLDEXP,
+    EI_CLASS,
+    EI_RCBRT,
+
+    EX_INTRINSICS_COUNT
+  };
+
+  enum ENamePrefix { // Name-prefix variant of a library function; accessed through getPrefix()/setPrefix().
+    NOPFX,
+    NATIVE, // NOTE(review): presumably the "native_" spelling seen in mangled names; confirm in writeName().
+    HALF // NOTE(review): presumably the "half_" spelling; confirm in writeName().
+  };
+
+  enum EType { // Bit-encoded argument type: size code in the low bits, base kind in bits 0x30.
+    B8  = 1, // Size codes; combined with a base kind below (e.g. U8 = UINT | B8).
+    B16 = 2,
+    B32 = 3,
+    B64 = 4,
+    SIZE_MASK = 7, // Mask selecting the size code.
+    FLOAT = 0x10, // Base kinds.
+    INT   = 0x20,
+    UINT  = 0x30,
+    BASE_TYPE_MASK = 0x30, // Mask selecting the base kind.
+    U8  =  UINT | B8,
+    U16 =  UINT | B16,
+    U32 =  UINT | B32,
+    U64 =  UINT | B64,
+    I8  =   INT | B8,
+    I16 =   INT | B16,
+    I32 =   INT | B32,
+    I64 =   INT | B64,
+    F16 = FLOAT | B16,
+    F32 = FLOAT | B32,
+    F64 = FLOAT | B64,
+    IMG1DA = 0x80, // Non-numeric kinds start at 0x80 and take consecutive implicit values.
+    IMG1DB,
+    IMG2DA,
+    IMG1D,
+    IMG2D,
+    IMG3D,
+    SAMPLER,
+    EVENT,
+    DUMMY
+  };
+
+  enum EPtrKind { // Pointer descriptor: address-space code in the low 4 bits plus qualifier flag bits.
+    BYVALUE = 0, // Not a pointer.
+    PRIVATE,
+    GLOBAL,
+    READONLY,
+    LOCAL,
+    GENERIC,
+    OTHER,
+
+    ADDR_SPACE = 0xF, // Mask for the address-space code; users in AMDGPULibFunc.cpp subtract 1 to get the IR address space.
+    CONST      = 0x10, // Qualifier flags OR-ed onto the address-space code.
+    VOLATILE   = 0x20
+  };
+
+  struct Param { // Describes one argument: element type, vector width and pointer kind.
+    unsigned char ArgType; // An EType value; 0 marks "no parameter" for the iteration loops in AMDGPULibFunc.cpp.
+    unsigned char VectorSize; // Lane count; 1 for scalars.
+    unsigned char PtrKind; // EPtrKind bits; 0 (BYVALUE) for non-pointers.
+
+    unsigned char Reserved; // NOTE(review): not initialized by reset() or by the constructor.
+
+    void reset() { // Restores the default state: no type, scalar, by value.
+      ArgType = 0;
+      VectorSize = 1;
+      PtrKind = 0;
+    }
+    Param() { reset(); }
+
+    template <typename Stream>
+    void mangleItanium(Stream& os); // Declared only; no definition is visible in this header.
+  };
+
+public:
+  static bool      parse(StringRef mangledName, AMDGPULibFunc &iInfo);
+
+  AMDGPULibFunc();
+  AMDGPULibFunc(EFuncId id, const AMDGPULibFunc& copyFrom);
+
+  ENamePrefix   getPrefix() const { return FKind; }
+  EFuncId  getId() const { return FuncId; }
+
+  std::string   getName() const;
+  unsigned      getNumArgs() const;
+
+  FunctionType* getFunctionType(Module& M) const;
+
+  std::string   mangle() const;
+
+  void setPrefix(ENamePrefix pfx) { FKind = pfx; }
+  void setId(EFuncId id) { FuncId = id; }
+
+  static Function* getFunction(llvm::Module *M, const AMDGPULibFunc& fInfo);
+
+  static Function* getOrInsertFunction(llvm::Module *M,
+                                       const AMDGPULibFunc& fInfo);
+
+  static StringRef getUnmangledName(const StringRef& mangledName);
+
+  Param         Leads[2];
+
+private:
+  EFuncId       FuncId;
+  ENamePrefix   FKind;
+  std::string   Name;
+
+  void          reset();
+
+  std::string   mangleNameItanium() const;
+  bool          parseItanuimName(StringRef& mangledName);
+
+  std::string   mangleName(const StringRef& name) const;
+  bool          parseName(const StringRef& mangledName);
+
+  template <typename Stream>
+  void          writeName(Stream& OS) const;
+};
+
+}
+#endif // _AMDGPU_LIBFUNC_H_

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=310731&r1=310730&r2=310731&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Fri Aug 11 09:42:09 2017
@@ -129,6 +129,13 @@ static cl::opt<bool> EnableAMDGPUFunctio
   cl::desc("Enable AMDGPU function call support"),
   cl::init(false));
 
+// Controls whether the AMDGPU library call simplification pass is scheduled.
+static cl::opt<bool> EnableLibCallSimplify(
+  "amdgpu-simplify-libcall",
+  cl::desc("Enable amdgpu library simplifications"),
+  cl::init(true), // On by default.
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -170,6 +177,8 @@ extern "C" void LLVMInitializeAMDGPUTarg
   initializeSIFixWWMLivenessPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
+  initializeAMDGPUUseNativeCallsPass(*PR);
+  initializeAMDGPUSimplifyLibCallsPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -313,12 +322,12 @@ static ImmutablePass *createAMDGPUExtern
 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
   Builder.DivergentTarget = true;
 
-  bool Internalize = InternalizeSymbols &&
-                     (getOptLevel() > CodeGenOpt::None) &&
+  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
+  bool Internalize = InternalizeSymbols && EnableOpt &&
                      (getTargetTriple().getArch() == Triple::amdgcn);
-  bool EarlyInline = EarlyInlineAll &&
-                     (getOptLevel() > CodeGenOpt::None);
-  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+  bool EarlyInline = EarlyInlineAll && EnableOpt;
+  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
+  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
 
   Builder.addExtension(
     PassManagerBuilder::EP_ModuleOptimizerEarly,
@@ -357,11 +366,15 @@ void AMDGPUTargetMachine::adjustPassMana
 
   Builder.addExtension(
     PassManagerBuilder::EP_EarlyAsPossible,
-    [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+    [AMDGPUAA, LibCallSimplify](const PassManagerBuilder &,
+                                legacy::PassManagerBase &PM) {
       if (AMDGPUAA) {
         PM.add(createAMDGPUAAWrapperPass());
         PM.add(createAMDGPUExternalAAWrapperPass());
       }
+      PM.add(llvm::createAMDGPUUseNativeCallsPass());
+      if (LibCallSimplify)
+        PM.add(llvm::createAMDGPUSimplifyLibCallsPass());
   });
 
   Builder.addExtension(

Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=310731&r1=310730&r2=310731&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Fri Aug 11 09:42:09 2017
@@ -50,6 +50,8 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPURegisterInfo.cpp
   AMDGPURewriteOutArguments.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
+  AMDGPULibFunc.cpp
+  AMDGPULibCalls.cpp
   GCNHazardRecognizer.cpp
   GCNSchedStrategy.cpp
   R600ClauseMergePass.cpp

Added: llvm/trunk/test/CodeGen/AMDGPU/simplify-libcalls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/simplify-libcalls.ll?rev=310731&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/simplify-libcalls.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/simplify-libcalls.ll Fri Aug 11 09:42:09 2017
@@ -0,0 +1,683 @@
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-POSTLINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-PRELINK %s
+; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink <%s | FileCheck -check-prefix=GCN -check-prefix=GCN-NATIVE %s
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
+; GCN-POSTLINK: tail call fast float @_Z3sinf(
+; GCN-POSTLINK: tail call fast float @_Z3cosf(
+; GCN-PRELINK: call fast float @_Z6sincosfPU3AS4f(
+; GCN-NATIVE: tail call fast float @_Z10native_sinf(
+; GCN-NATIVE: tail call fast float @_Z10native_cosf(
+define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3sinf(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  %call2 = tail call fast float @_Z3cosf(float %tmp)
+  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  store float %call2, float addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+declare float @_Z3sinf(float)
+
+declare float @_Z3cosf(float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
+; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
+; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
+; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPU3AS4S_(
+; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
+; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
+define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
+  %call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
+  store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
+  %call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
+  %arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
+  store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
+  ret void
+}
+
+declare <2 x float> @_Z3sinDv2_f(<2 x float>)
+
+declare <2 x float> @_Z3cosDv2_f(<2 x float>)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
+; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
+; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
+; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPU3AS4S_(
+; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
+; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
+define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
+entry:
+  %castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
+  %loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
+  %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
+  %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
+  %call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
+  %arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
+  %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
+  store <4 x float> %extractVec13, <4 x float> addrspace(1)* %storetmp14, align 16
+  ret void
+}
+
+declare <3 x float> @_Z3sinDv3_f(<3 x float>)
+
+declare <3 x float> @_Z3cosDv3_f(<3 x float>)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
+; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
+; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
+; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPU3AS4S_(
+; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
+; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
+define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
+  %call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
+  store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
+  %call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
+  store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+declare <4 x float> @_Z3sinDv4_f(<4 x float>)
+
+declare <4 x float> @_Z3cosDv4_f(<4 x float>)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
+; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
+; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
+; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPU3AS4S_(
+; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
+; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
+define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
+  %call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
+  store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
+  %call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
+  %arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
+  store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
+  ret void
+}
+
+declare <8 x float> @_Z3sinDv8_f(<8 x float>)
+
+declare <8 x float> @_Z3cosDv8_f(<8 x float>)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
+; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
+; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
+; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPU3AS4S_(
+; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
+; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
+define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
+  %call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
+  store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
+  %call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
+  %arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
+  store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
+  ret void
+}
+
+declare <16 x float> @_Z3sinDv16_f(<16 x float>)
+
+declare <16 x float> @_Z3cosDv16_f(<16 x float>)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip
+; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
+define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
+entry:
+  %call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z12native_recipf(float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip
+; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
+define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
+entry:
+  %call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z10half_recipf(float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide
+; GCN: fmul fast float %tmp, 0x3FD5555560000000
+define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z13native_divideff(float, float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide
+; GCN: fmul fast float %tmp, 0x3FD5555560000000
+define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z11half_divideff(float, float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f
+; GCN: store float 1.000000e+00, float addrspace(1)* %a
+define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z3powff(float, float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i
+; GCN: store float 1.000000e+00, float addrspace(1)* %a
+define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f
+; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: store float %tmp, float addrspace(1)* %a, align 4
+define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i
+; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: store float %tmp, float addrspace(1)* %a, align 4
+define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f
+; GCN: %tmp = load float, float addrspace(1)* %a, align 4
+; GCN: %__pow2 = fmul fast float %tmp, %tmp
+define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i
+; GCN: %tmp = load float, float addrspace(1)* %a, align 4
+; GCN: %__pow2 = fmul fast float %tmp, %tmp
+define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f
+; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
+define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i
+; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp
+define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
+; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
+; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
+define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
+; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
+; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c
+; GCN: %__powx2 = fmul fast float %tmp, %tmp
+; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
+; GCN: %__powx22 = fmul fast float %__powx2, %tmp
+; GCN: %0 = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %0, %__powx22
+define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c
+; GCN: %__powx2 = fmul fast float %tmp, %tmp
+; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
+; GCN: %__powx22 = fmul fast float %__powx2, %tmp
+; GCN: %0 = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %0, %__powx22
+define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z4powrff(float, float)
+
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c
+; GCN: %__powx2 = fmul fast float %tmp, %tmp
+; GCN: %__powx21 = fmul fast float %__powx2, %__powx2
+; GCN: %__powx22 = fmul fast float %__powx2, %tmp
+; GCN: %0 = fmul fast float %__powx21, %__powx21
+; GCN: %__powprod3 = fmul fast float %0, %__powx22
+define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z4pownfi(float, i32)
+
+; pow(float, float) (mangled _Z3powff) with the constant exponent 1013.0.
+; The postlink-prefixed check expects the call to be left unchanged. The
+; prelink-prefixed checks expect exp2(y * log2(fabs(x))) with the sign bit of x
+; (mask -2147483648) OR-ed into the integer bit pattern of the result.
+; The label ends in '(' so it matches only this function and not the other
+; kernels whose names start with "test_pow".
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow(
+; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
+; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
+; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
+; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %0 = bitcast float %tmp to i32
+; GCN-PRELINK: %__pow_sign = and i32 %0, -2147483648
+; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
+; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
+; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
+; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
+define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; powr(float, float) (mangled _Z4powrff) with a variable exponent loaded from
+; a[1]. The postlink-prefixed check expects the call to be left unchanged. The
+; prelink-prefixed checks expect exp2(y * log2(x)). The native-prefixed checks
+; expect the same expansion built from native_log2 and native_exp2.
+; The label ends in '(' so it matches only this function, not @test_powr_c.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr(
+; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
+; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
+; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
+; GCN-NATIVE:  %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE:  %__ylogx = fmul fast float %__log2, %tmp1
+; GCN-NATIVE:  %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE:  store float %__exp2, float addrspace(1)* %a, align 4
+define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; pown(float, int) (mangled _Z4pownfi) with a variable exponent obtained by
+; fptosi of a[1]. The postlink-prefixed check expects the call to be left
+; unchanged. The prelink-prefixed checks expect exp2(float(y) * log2(fabs(x))),
+; with the sign bit of x OR-ed back into the result only when the low bit of y
+; is set: "shl i32 %conv, 31" moves that bit into the sign position before it
+; is AND-ed with the bit pattern of x.
+; The label ends in '(' so it matches only this function, not @test_pown_c.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown(
+; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
+; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
+; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
+; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
+; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
+; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__yeven = shl i32 %conv, 31
+; GCN-PRELINK: %0 = bitcast float %tmp to i32
+; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %0
+; GCN-PRELINK: %1 = bitcast float %__exp2 to i32
+; GCN-PRELINK: %2 = or i32 %__pow_sign, %1
+; GCN-PRELINK: %3 = bitcast float addrspace(1)* %a to i32 addrspace(1)*
+; GCN-PRELINK: store i32 %2, i32 addrspace(1)* %3, align 4
+define amdgpu_kernel void @test_pown(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+  %conv = fptosi float %tmp1 to i32
+  %call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; rootn(float, int) (mangled _Z5rootnfi) with n = 1. The checks expect the
+; value loaded from a[1] to be stored to a[0] directly, with no call left.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1
+; GCN: %tmp = load float, float addrspace(1)* %arrayidx, align 4
+; GCN: store float %tmp, float addrspace(1)* %a, align 4
+define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z5rootnfi(float, i32)
+
+; rootn(x, 2). The postlink-prefixed check expects the call to be left
+; unchanged; the prelink-prefixed check expects a call to sqrt(float)
+; (mangled _Z4sqrtf) instead.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
+; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
+; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
+define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; rootn(x, 3). The postlink-prefixed check expects the call to be left
+; unchanged; the prelink-prefixed check expects a call to cbrt(float)
+; (mangled _Z4cbrtf) instead.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
+; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
+; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
+define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; rootn(x, -1). The check expects a fast fdiv of 1.0 by the loaded value.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1
+; GCN: fdiv fast float 1.000000e+00, %tmp
+define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; rootn(x, -2). The postlink-prefixed check expects the call to be left
+; unchanged; the prelink-prefixed check expects a call to rsqrt(float)
+; (mangled _Z5rsqrtf) instead.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
+; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
+; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; fma(0.0, x, y) (mangled _Z3fmafff). The check expects the addend %y to be
+; stored directly.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x
+; GCN: store float %y, float addrspace(1)* %a
+define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z3fmafff(float, float, float)
+
+; fma(x, 0.0, y): same as test_fma_0x but with the zero in the second operand.
+; The check expects the addend %y to be stored directly.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0
+; GCN: store float %y, float addrspace(1)* %a
+define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; mad(0.0, x, y) (mangled _Z3madfff). The check expects the addend %y to be
+; stored directly.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x
+; GCN: store float %y, float addrspace(1)* %a
+define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z3madfff(float, float, float)
+
+; mad(x, 0.0, y): same as test_mad_0x but with the zero in the second operand.
+; The check expects the addend %y to be stored directly.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0
+; GCN: store float %y, float addrspace(1)* %a
+define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; fma(x, 1.0, y). The check expects a single fast fadd of x and y.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y
+; GCN: %fmaadd = fadd fast float %tmp, %y
+define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; fma(1.0, x, y): same as test_fma_x1y but with the one in the first operand.
+; The check expects a single fast fadd of x and y.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy
+; GCN: %fmaadd = fadd fast float %tmp, %y
+define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; fma(a[1], a[0], 0.0). The check expects a single fast fmul of the two loaded
+; values (with the operands in the order %tmp1, %tmp).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0
+; GCN: %fmamul = fmul fast float %tmp1, %tmp
+define amdgpu_kernel void @test_fma_xy0(float addrspace(1)* nocapture %a) {
+entry:
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp = load float, float addrspace(1)* %arrayidx, align 4
+  %tmp1 = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; exp(float) (mangled _Z3expf). The native-prefixed check expects the call to
+; be replaced by native_exp(float) (mangled _Z10native_expf).
+; The label ends in '(' so it matches only this function and not
+; @test_use_native_exp2 or @test_use_native_exp10.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp(
+; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
+define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3expf(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z3expf(float)
+
+; exp2(float) (mangled _Z4exp2f). The native-prefixed check expects the call
+; to be replaced by native_exp2(float) (mangled _Z11native_exp2f).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
+; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
+define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z4exp2f(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z4exp2f(float)
+
+; exp10(float) (mangled _Z5exp10f). The native-prefixed check expects the call
+; to be replaced by native_exp10(float) (mangled _Z12native_exp10f).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
+; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
+define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5exp10f(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z5exp10f(float)
+
+; log(float) (mangled _Z3logf). The native-prefixed check expects the call to
+; be replaced by native_log(float) (mangled _Z10native_logf).
+; The label ends in '(' so it matches only this function and not
+; @test_use_native_log2 or @test_use_native_log10.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log(
+; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
+define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3logf(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z3logf(float)
+
+; log2(float) (mangled _Z4log2f). The native-prefixed check expects the call
+; to be replaced by native_log2(float) (mangled _Z11native_log2f).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
+; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
+define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z4log2f(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z4log2f(float)
+
+; log10(float) (mangled _Z5log10f). The native-prefixed check expects the call
+; to be replaced by native_log10(float) (mangled _Z12native_log10f).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
+; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
+define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5log10f(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z5log10f(float)
+
+; powr(x, y) with x = a[0] and y = a[1]. The native-prefixed checks expect the
+; call to be expanded into exp2(y * log2(x)) built from native_log2 and
+; native_exp2, with the result stored to a[0].
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
+; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
+; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
+define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
+  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+; sqrt(float) (mangled _Z4sqrtf). The native-prefixed check expects the call
+; to be replaced by native_sqrt(float) (mangled _Z11native_sqrtf).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
+; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
+define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z4sqrtf(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z4sqrtf(float)
+
+; rsqrt(float) (mangled _Z5rsqrtf). The native-prefixed check expects the call
+; to be replaced by native_rsqrt(float) (mangled _Z12native_rsqrtf).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
+; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
+define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z5rsqrtf(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z5rsqrtf(float)
+
+; tan(float) (mangled _Z3tanf). The native-prefixed check expects the call to
+; be replaced by native_tan(float) (mangled _Z10native_tanf).
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
+; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
+define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %call = tail call fast float @_Z3tanf(float %tmp)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z3tanf(float)
+
+; sincos(float, float addrspace(4)*) (mangled _Z6sincosfPU3AS4f); the pointer
+; argument is &a[1] cast from addrspace(1) to addrspace(4). The native-prefixed
+; checks expect separate calls to native_sin and native_cos. Note that the
+; expected calls carry no "fast" flag, unlike the input call.
+; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
+; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
+; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
+define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
+entry:
+  %tmp = load float, float addrspace(1)* %a, align 4
+  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
+  %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float addrspace(4)*
+  %call = tail call fast float @_Z6sincosfPU3AS4f(float %tmp, float addrspace(4)* %tmp1)
+  store float %call, float addrspace(1)* %a, align 4
+  ret void
+}
+
+declare float @_Z6sincosfPU3AS4f(float, float addrspace(4)*)