[llvm] [AMDGPU] Add half vector support for table-driven libcall optimization (PR #178638)

Thu Jan 29 04:21:40 PST 2026

https://github.com/steffenlarsen updated https://github.com/llvm/llvm-project/pull/178638

>From e9fc863388d6fe4bcec65a4bc7e72a22e810f8ab Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Wed, 28 Jan 2026 07:58:03 -0600
Subject: [PATCH] [AMDGPU] Add half vector support for table-driven libcall
 optimization

When replacing certain AMDGPU library calls with constant data vectors,
the existing implementation only handled single- and double-precision
floats. This change extends the functionality to also support
half-precision floats.

Additionally, it refactors the function responsible for generating
constant float data vectors to improve readability and reduce code
duplication. In tandem with this refactoring, the patch relaxes the
check for constant data vectors to include any constant of vector type.
This allows other constant vectors to be processed, such as those
created from constant aggregate zeros (e.g.
`<2 x float> zeroinitializer`).

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp     | 102 +++++++++---------
 .../amdgpu-simplify-libcall-tdo-acos.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-acosh.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-acospi.ll     |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-asin.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-asinh.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-asinpi.ll     |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-atan.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-atanh.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-atanpi.ll     |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-cbrt.ll       |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-cosh.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-cospi.ll      |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-erfc.ll       |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-exp10.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-exp2.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-expm1.ll      |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-log.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-log10.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-log2.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-rsqrt.ll      |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-sinh.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-sinpi.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-sqrt.ll       |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-tanh.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-tanpi.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-tgamma.ll     |  87 +++++++++++++++
 32 files changed, 2750 insertions(+), 49 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 4a553beb63bb1..84b8ec22ff86a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -743,6 +743,38 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   return false;
 }
 
+static Constant * // Builds a constant FP vector from Values for ArgType.
+_Z4coshdgetConstantFloatVectorForArgType(LLVMContext &Ctx,
+                                         AMDGPULibFunc::EType ArgType,
+                                         ArrayRef<double> Values, Type *Ty) {
+  switch (ArgType) { // NOTE(review): Ctx is never read in this body.
+  case AMDGPULibFunc::F16: {
+    SmallVector<uint16_t, 0> HalfIntValues; // raw 16-bit half encodings
+    for (double D : Values) {
+      APFloat APF16 = APFloat(D);
+      [[maybe_unused]] bool Unused; // out-flag required by convert(); ignored
+      APF16.convert(llvm::APFloat::IEEEhalf(),
+                    llvm::RoundingMode::NearestTiesToEven, &Unused);
+      uint16_t APF16Int = APF16.bitcastToAPInt().getZExtValue();
+      HalfIntValues.push_back(APF16Int);
+    }
+    ArrayRef<uint16_t> Tmp(HalfIntValues);
+    return ConstantDataVector::getFP(Ty->getScalarType(), Tmp);
+  }
+  case AMDGPULibFunc::F32: {
+    SmallVector<float, 0> FValues;
+    for (double D : Values)
+      FValues.push_back((float)D); // narrow each double to float
+    ArrayRef<float> Tmp(FValues);
+    return ConstantDataVector::get(Ty->getContext(), Tmp);
+  }
+  case AMDGPULibFunc::F64:
+    return ConstantDataVector::get(Ty->getContext(), Values); // used as-is
+  default:
+    llvm_unreachable("Unsupported argument type");
+  }
+}
+
 bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
   // Table-Driven optimization
   const TableRef tr = getOptTable(FInfo.getId());
@@ -753,39 +785,26 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
   Value *opr0 = CI->getArgOperand(0);
 
   if (getVecSize(FInfo) > 1) {
-    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
-      SmallVector<double, 0> DVal;
+    // Vector version
+    Constant *CV = dyn_cast<Constant>(opr0);
+    if (CV && CV->getType()->isVectorTy()) {
+      SmallVector<double, 0> DValues;
       for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
-        ConstantFP *eltval = dyn_cast<ConstantFP>(
-                               CV->getElementAsConstant((unsigned)eltNo));
+        ConstantFP *eltval =
+            dyn_cast<ConstantFP>(CV->getAggregateElement((unsigned)eltNo));
         assert(eltval && "Non-FP arguments in math function!");
-        bool found = false;
-        for (int i=0; i < sz; ++i) {
-          if (eltval->isExactlyValue(tr[i].input)) {
-            DVal.push_back(tr[i].result);
-            found = true;
-            break;
-          }
-        }
-        if (!found) {
-          // This vector constants not handled yet.
+        auto MatchingRow = std::find_if(
+            tr.begin(), tr.end(), [eltval](const TableEntry &entry) {
+              return eltval->isExactlyValue(entry.input);
+            });
+        if (MatchingRow == tr.end())
           return false;
-        }
+        DValues.push_back(MatchingRow->result);
       }
-      LLVMContext &context = CI->getContext();
-      Constant *nval;
-      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
-        SmallVector<float, 0> FVal;
-        for (double D : DVal)
-          FVal.push_back((float)D);
-        ArrayRef<float> tmp(FVal);
-        nval = ConstantDataVector::get(context, tmp);
-      } else { // F64
-        ArrayRef<double> tmp(DVal);
-        nval = ConstantDataVector::get(context, tmp);
-      }
-      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
-      replaceCall(CI, nval);
+      Constant *NewValues = _Z4coshdgetConstantFloatVectorForArgType(
+          CI->getContext(), getArgType(FInfo), DValues, CI->getType());
+      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
+      replaceCall(CI, NewValues);
       return true;
     }
   } else {
@@ -1592,26 +1611,11 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     if (hasTwoResults)
       nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
   } else {
-    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
-      SmallVector <float, 0> FVal0, FVal1;
-      for (int i = 0; i < FuncVecSize; ++i)
-        FVal0.push_back((float)DVal0[i]);
-      ArrayRef<float> tmp0(FVal0);
-      nval0 = ConstantDataVector::get(context, tmp0);
-      if (hasTwoResults) {
-        for (int i = 0; i < FuncVecSize; ++i)
-          FVal1.push_back((float)DVal1[i]);
-        ArrayRef<float> tmp1(FVal1);
-        nval1 = ConstantDataVector::get(context, tmp1);
-      }
-    } else {
-      ArrayRef<double> tmp0(DVal0);
-      nval0 = ConstantDataVector::get(context, tmp0);
-      if (hasTwoResults) {
-        ArrayRef<double> tmp1(DVal1);
-        nval1 = ConstantDataVector::get(context, tmp1);
-      }
-    }
+    nval0 = _Z4coshdgetConstantFloatVectorForArgType(context, getArgType(FInfo),
+                                                     DVal0, aCI->getType());
+    if (hasTwoResults)
+      nval1 = _Z4coshdgetConstantFloatVectorForArgType(
+          context, getArgType(FInfo), DVal1, aCI->getType());
   }
 
   if (hasTwoResults) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
new file mode 100644
index 0000000000000..64c8c8186b7ce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0x3FF921FB60000000, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4acosf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0x3FF921FB60000000, float 0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4acosDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3E48, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4acosDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3E48, half 0xH0000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4acosDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0x3FF921FB54442D18, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4acosd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0x3FF921FB54442D18, double 0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4acosDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4acosf(float)
+declare <2 x float>  @_Z4acosDv2_f(<2 x float>)
+declare half         @_Z4acosDh(half)
+declare <2 x half>   @_Z4acosDv2_Dh(<2 x half>)
+declare double       @_Z4acosd(double)
+declare <2 x double> @_Z4acosDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
new file mode 100644
index 0000000000000..7c13788f5ee60
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5acoshf(float 1.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5acoshDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5acoshDh(half 1.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5acoshDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5acoshd(double 1.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5acoshDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5acoshf(float)
+declare <2 x float>  @_Z5acoshDv2_f(<2 x float>)
+declare half         @_Z5acoshDh(half)
+declare <2 x half>   @_Z5acoshDv2_Dh(<2 x half>)
+declare double       @_Z5acoshd(double)
+declare <2 x double> @_Z5acoshDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
new file mode 100644
index 0000000000000..b350fe5957ac7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 5.000000e-01, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z6acospif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 5.000000e-01, float 0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z6acospiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3800, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z6acospiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3800, half 0xH0000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z6acospiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 5.000000e-01, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z6acospid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 5.000000e-01, double 0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z6acospiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6acospif(float)
+declare <2 x float>  @_Z6acospiDv2_f(<2 x float>)
+declare half         @_Z6acospiDh(half)
+declare <2 x half>   @_Z6acospiDv2_Dh(<2 x half>)
+declare double       @_Z6acospid(double)
+declare <2 x double> @_Z6acospiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
new file mode 100644
index 0000000000000..d94827e037dd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4asinf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 0x3FF921FB60000000>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4asinDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4asinDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3E48>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4asinDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4asind(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 0x3FF921FB54442D18>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4asinDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4asinf(float)
+declare <2 x float>  @_Z4asinDv2_f(<2 x float>)
+declare half         @_Z4asinDh(half)
+declare <2 x half>   @_Z4asinDv2_Dh(<2 x half>)
+declare double       @_Z4asind(double)
+declare <2 x double> @_Z4asinDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
new file mode 100644
index 0000000000000..a8b72c758927e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5asinhf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5asinhDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5asinhDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5asinhDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5asinhd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5asinhDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5asinhf(float)
+declare <2 x float>  @_Z5asinhDv2_f(<2 x float>)
+declare half         @_Z5asinhDh(half)
+declare <2 x half>   @_Z5asinhDv2_Dh(<2 x half>)
+declare double       @_Z5asinhd(double)
+declare <2 x double> @_Z5asinhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
new file mode 100644
index 0000000000000..ed2d4bdb8875c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z6asinpif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 5.000000e-01>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z6asinpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z6asinpiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3800>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z6asinpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z6asinpid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 5.000000e-01>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z6asinpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6asinpif(float)
+declare <2 x float>  @_Z6asinpiDv2_f(<2 x float>)
+declare half         @_Z6asinpiDh(half)
+declare <2 x half>   @_Z6asinpiDv2_Dh(<2 x half>)
+declare double       @_Z6asinpid(double)
+declare <2 x double> @_Z6asinpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
new file mode 100644
index 0000000000000..aa74d2228fd1d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call atan(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call float @_Z4atanf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 0x3FE921FB60000000>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: atan(<0.0, 1.0>) must be folded to <0.0, pi/4>; 0x3FE921FB60000000 is pi/4 rounded to float.
+entry:
+  %c = call <2 x float> @_Z4atanDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call atan(0.0) must be folded away, leaving a store of half 0.0 (0xH0000).
+entry:
+  %c = call half @_Z4atanDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3A48>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: atan(<0.0, 1.0>) must be folded to <0.0, pi/4>; 0xH3A48 (0.78515625) is pi/4 rounded to half.
+entry:
+  %c = call <2 x half> @_Z4atanDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call atan(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call double @_Z4atand(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 0x3FE921FB54442D18>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: atan(<0.0, 1.0>) must be folded to <0.0, pi/4>; 0x3FE921FB54442D18 is pi/4 as a double.
+entry:
+  %c = call <2 x double> @_Z4atanDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4atanf(float)
+declare <2 x float>  @_Z4atanDv2_f(<2 x float>)
+declare half         @_Z4atanDh(half)
+declare <2 x half>   @_Z4atanDv2_Dh(<2 x half>)
+declare double       @_Z4atand(double)
+declare <2 x double> @_Z4atanDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
new file mode 100644
index 0000000000000..c1eac02bcb83d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call atanh(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call float @_Z5atanhf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: atanh(<0.0, -0.0>) must be folded to <0.0, -0.0>, i.e. the sign of each zero is preserved.
+entry:
+  %c = call <2 x float> @_Z5atanhDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call atanh(0.0) must be folded away, leaving a store of half 0.0 (0xH0000).
+entry:
+  %c = call half @_Z5atanhDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: atanh(<0.0, -0.0>) must be folded to <0.0, -0.0>; 0xH8000 is half negative zero.
+entry:
+  %c = call <2 x half> @_Z5atanhDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call atanh(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call double @_Z5atanhd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: atanh(<0.0, -0.0>) must be folded to <0.0, -0.0>, i.e. the sign of each zero is preserved.
+entry:
+  %c = call <2 x double> @_Z5atanhDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5atanhf(float)
+declare <2 x float>  @_Z5atanhDv2_f(<2 x float>)
+declare half         @_Z5atanhDh(half)
+declare <2 x half>   @_Z5atanhDv2_Dh(<2 x half>)
+declare double       @_Z5atanhd(double)
+declare <2 x double> @_Z5atanhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
new file mode 100644
index 0000000000000..0b985d07935c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call atanpi(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call float @_Z6atanpif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 2.500000e-01>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: atanpi(<0.0, 1.0>) must be folded element-wise to the constant vector <0.0, 0.25>.
+entry:
+  %c = call <2 x float> @_Z6atanpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call atanpi(0.0) must be folded away, leaving a store of half 0.0 (0xH0000).
+entry:
+  %c = call half @_Z6atanpiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3400>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: atanpi(<0.0, 1.0>) must be folded to <0.0, 0.25>; 0xH3400 is half 0.25.
+entry:
+  %c = call <2 x half> @_Z6atanpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call atanpi(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call double @_Z6atanpid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 2.500000e-01>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: atanpi(<0.0, 1.0>) must be folded element-wise to the constant vector <0.0, 0.25>.
+entry:
+  %c = call <2 x double> @_Z6atanpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6atanpif(float)
+declare <2 x float>  @_Z6atanpiDv2_f(<2 x float>)
+declare half         @_Z6atanpiDh(half)
+declare <2 x half>   @_Z6atanpiDv2_Dh(<2 x half>)
+declare double       @_Z6atanpid(double)
+declare <2 x double> @_Z6atanpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
new file mode 100644
index 0000000000000..e99d5fee0f988
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call cbrt(1.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call float @_Z4cbrtf(float 1.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float -1.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: cbrt(<1.0, -1.0>) must be folded element-wise to the constant vector <1.0, -1.0>.
+entry:
+  %c = call <2 x float> @_Z4cbrtDv2_f(<2 x float> <float 1.000000e+00, float -1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call cbrt(1.0) must be folded away, leaving a store of half 1.0 (0xH3C00).
+entry:
+  %c = call half @_Z4cbrtDh(half 1.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xHBC00>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: cbrt(<1.0, -1.0>) must be folded to <1.0, -1.0>; 0xH3C00 is half 1.0 and 0xHBC00 is half -1.0.
+entry:
+  %c = call <2 x half> @_Z4cbrtDv2_Dh(<2 x half> <half 1.000000e+00, half -1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call cbrt(1.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call double @_Z4cbrtd(double 1.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double -1.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: cbrt(<1.0, -1.0>) must be folded element-wise to the constant vector <1.0, -1.0>.
+entry:
+  %c = call <2 x double> @_Z4cbrtDv2_d(<2 x double> <double 1.000000e+00, double -1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4cbrtf(float)
+declare <2 x float>  @_Z4cbrtDv2_f(<2 x float>)
+declare half         @_Z4cbrtDh(half)
+declare <2 x half>   @_Z4cbrtDv2_Dh(<2 x half>)
+declare double       @_Z4cbrtd(double)
+declare <2 x double> @_Z4cbrtDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
new file mode 100644
index 0000000000000..0998ae8ab3b25
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call cos(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call float @_Z3cosf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: cos(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x float> @_Z3cosDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call cos(0.0) must be folded away, leaving a store of half 1.0 (0xH3C00).
+entry:
+  %c = call half @_Z3cosDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: cos(<0.0, -0.0>) must be folded to half 1.0 (0xH3C00) in both lanes, printed as a splat.
+entry:
+  %c = call <2 x half> @_Z3cosDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call cos(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call double @_Z3cosd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: cos(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x double> @_Z3cosDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3cosf(float)
+declare <2 x float>  @_Z3cosDv2_f(<2 x float>)
+declare half         @_Z3cosDh(half)
+declare <2 x half>   @_Z3cosDv2_Dh(<2 x half>)
+declare double       @_Z3cosd(double)
+declare <2 x double> @_Z3cosDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
new file mode 100644
index 0000000000000..c151ed27ca197
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call cosh(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call float @_Z4coshf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: cosh(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x float> @_Z4coshDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call cosh(0.0) must be folded away, leaving a store of half 1.0 (0xH3C00).
+entry:
+  %c = call half @_Z4coshDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: cosh(<0.0, -0.0>) must be folded to half 1.0 (0xH3C00) in both lanes, printed as a splat.
+entry:
+  %c = call <2 x half> @_Z4coshDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call cosh(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call double @_Z4coshd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: cosh(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x double> @_Z4coshDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4coshf(float)
+declare <2 x float>  @_Z4coshDv2_f(<2 x float>)
+declare half         @_Z4coshDh(half)
+declare <2 x half>   @_Z4coshDv2_Dh(<2 x half>)
+declare double       @_Z4coshd(double)
+declare <2 x double> @_Z4coshDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
new file mode 100644
index 0000000000000..4dce71bc5a5df
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call cospi(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call float @_Z5cospif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: cospi(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x float> @_Z5cospiDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call cospi(0.0) must be folded away, leaving a store of half 1.0 (0xH3C00).
+entry:
+  %c = call half @_Z5cospiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: cospi(<0.0, -0.0>) must be folded to half 1.0 (0xH3C00) in both lanes, printed as a splat.
+entry:
+  %c = call <2 x half> @_Z5cospiDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call cospi(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call double @_Z5cospid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: cospi(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x double> @_Z5cospiDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5cospif(float)
+declare <2 x float>  @_Z5cospiDv2_f(<2 x float>)
+declare half         @_Z5cospiDh(half)
+declare <2 x half>   @_Z5cospiDv2_Dh(<2 x half>)
+declare double       @_Z5cospid(double)
+declare <2 x double> @_Z5cospiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
new file mode 100644
index 0000000000000..ea54a59dcf907
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call erf(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call float @_Z3erff(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: erf(<0.0, -0.0>) must be folded to <0.0, -0.0>, i.e. the sign of each zero is preserved.
+entry:
+  %c = call <2 x float> @_Z3erfDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call erf(0.0) must be folded away, leaving a store of half 0.0 (0xH0000).
+entry:
+  %c = call half @_Z3erfDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: erf(<0.0, -0.0>) must be folded to <0.0, -0.0>; 0xH8000 is half negative zero.
+entry:
+  %c = call <2 x half> @_Z3erfDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call erf(0.0) must be folded away, leaving a store of the constant 0.0.
+entry:
+  %c = call double @_Z3erfd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: erf(<0.0, -0.0>) must be folded to <0.0, -0.0>, i.e. the sign of each zero is preserved.
+entry:
+  %c = call <2 x double> @_Z3erfDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3erff(float)
+declare <2 x float>  @_Z3erfDv2_f(<2 x float>)
+declare half         @_Z3erfDh(half)
+declare <2 x half>   @_Z3erfDv2_Dh(<2 x half>)
+declare double       @_Z3erfd(double)
+declare <2 x double> @_Z3erfDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
new file mode 100644
index 0000000000000..1b1c18940987e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Scalar f32: the call erfc(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call float @_Z4erfcf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; <2 x float>: erfc(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x float> @_Z4erfcDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Scalar f16: the call erfc(0.0) must be folded away, leaving a store of half 1.0 (0xH3C00).
+entry:
+  %c = call half @_Z4erfcDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; <2 x half>: erfc(<0.0, -0.0>) must be folded to half 1.0 (0xH3C00) in both lanes, printed as a splat.
+entry:
+  %c = call <2 x half> @_Z4erfcDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Scalar f64: the call erfc(0.0) must be folded away, leaving a store of the constant 1.0.
+entry:
+  %c = call double @_Z4erfcd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; <2 x double>: erfc(<0.0, -0.0>) must be folded to 1.0 in both lanes, printed as a splat constant.
+entry:
+  %c = call <2 x double> @_Z4erfcDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4erfcf(float)
+declare <2 x float>  @_Z4erfcDv2_f(<2 x float>)
+declare half         @_Z4erfcDh(half)
+declare <2 x half>   @_Z4erfcDv2_Dh(<2 x half>)
+declare double       @_Z4erfcd(double)
+declare <2 x double> @_Z4erfcDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
new file mode 100644
index 0000000000000..de29931287665
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3expf(float 0.000000e+00) ; scalar float: exp(0.0) folds to the constant 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 0x4005BF0A80000000>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3expDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>) ; float vector: lanes fold independently to exp(0.0) = 1.0 and exp(1.0) = e
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3expDh(half 0.000000e+00) ; scalar half: exp(0.0) folds to 1.0 (0xH3C00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4170>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3expDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>) ; half vector: lanes fold to 1.0 (0xH3C00) and e rounded to half (0xH4170)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3expd(double 0.000000e+00) ; scalar double: exp(0.0) folds to the constant 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 0x4005BF0A8B145769>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3expDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>) ; double vector: lanes fold independently to exp(0.0) = 1.0 and exp(1.0) = e
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3expf(float)
+declare <2 x float>  @_Z3expDv2_f(<2 x float>)
+declare half         @_Z3expDh(half)
+declare <2 x half>   @_Z3expDv2_Dh(<2 x half>)
+declare double       @_Z3expd(double)
+declare <2 x double> @_Z3expDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
new file mode 100644
index 0000000000000..7aea2fd4763b0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5exp10f(float 0.000000e+00) ; scalar float: exp10(0.0) folds to the constant 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 1.000000e+01>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5exp10Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>) ; float vector: lanes fold independently to exp10(0.0) = 1.0 and exp10(1.0) = 10.0
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5exp10Dh(half 0.000000e+00) ; scalar half: exp10(0.0) folds to 1.0 (0xH3C00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4900>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5exp10Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>) ; half vector: lanes fold to 1.0 (0xH3C00) and 10.0 (0xH4900)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5exp10d(double 0.000000e+00) ; scalar double: exp10(0.0) folds to the constant 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 1.000000e+01>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5exp10Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>) ; double vector: lanes fold independently to exp10(0.0) = 1.0 and exp10(1.0) = 10.0
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5exp10f(float)
+declare <2 x float>  @_Z5exp10Dv2_f(<2 x float>)
+declare half         @_Z5exp10Dh(half)
+declare <2 x half>   @_Z5exp10Dv2_Dh(<2 x half>)
+declare double       @_Z5exp10d(double)
+declare <2 x double> @_Z5exp10Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
new file mode 100644
index 0000000000000..2021346d0cac1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4exp2f(float 0.000000e+00) ; scalar float: exp2(0.0) folds to the constant 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 2.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4exp2Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>) ; float vector: lanes fold independently to exp2(0.0) = 1.0 and exp2(1.0) = 2.0
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4exp2Dh(half 0.000000e+00) ; scalar half: exp2(0.0) folds to 1.0 (0xH3C00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4exp2Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>) ; half vector: lanes fold to 1.0 (0xH3C00) and 2.0 (0xH4000)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4exp2d(double 0.000000e+00) ; scalar double: exp2(0.0) folds to the constant 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 2.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4exp2Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>) ; double vector: lanes fold independently to exp2(0.0) = 1.0 and exp2(1.0) = 2.0
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4exp2f(float)
+declare <2 x float>  @_Z4exp2Dv2_f(<2 x float>)
+declare half         @_Z4exp2Dh(half)
+declare <2 x half>   @_Z4exp2Dv2_Dh(<2 x half>)
+declare double       @_Z4exp2d(double)
+declare <2 x double> @_Z4exp2Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
new file mode 100644
index 0000000000000..0c08241a9e575
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5expm1f(float 0.000000e+00) ; scalar float: expm1(0.0) folds to the constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5expm1Dv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>) ; float vector: the sign of zero is preserved per lane (+0.0 -> +0.0, -0.0 -> -0.0)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5expm1Dh(half 0.000000e+00) ; scalar half: expm1(0.0) folds to +0.0 (0xH0000)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5expm1Dv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>) ; half vector: the sign of zero is preserved per lane (0xH0000 is +0.0, 0xH8000 is -0.0)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5expm1d(double 0.000000e+00) ; scalar double: expm1(0.0) folds to the constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5expm1Dv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>) ; double vector: the sign of zero is preserved per lane (+0.0 -> +0.0, -0.0 -> -0.0)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5expm1f(float)
+declare <2 x float>  @_Z5expm1Dv2_f(<2 x float>)
+declare half         @_Z5expm1Dh(half)
+declare <2 x half>   @_Z5expm1Dv2_Dh(<2 x half>)
+declare double       @_Z5expm1d(double)
+declare <2 x double> @_Z5expm1Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
new file mode 100644
index 0000000000000..305da227ad78f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3logf(float 1.000000e+00) ; scalar float: log(1.0) folds to the constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3logDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; float vector: both lanes are log(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3logDh(half 1.000000e+00) ; scalar half: log(1.0) folds to +0.0 (0xH0000)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3logDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; half vector: both lanes are log(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; align 4 matches the other <2 x half> stores in this patch
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3logd(double 1.000000e+00) ; scalar double: log(1.0) folds to the constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3logDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; double vector: both lanes are log(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3logf(float)
+declare <2 x float>  @_Z3logDv2_f(<2 x float>)
+declare half         @_Z3logDh(half)
+declare <2 x half>   @_Z3logDv2_Dh(<2 x half>)
+declare double       @_Z3logd(double)
+declare <2 x double> @_Z3logDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
new file mode 100644
index 0000000000000..6fb830efb93c3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5log10f(float 1.000000e+00) ; scalar float: log10(1.0) folds to the constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5log10Dv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; float vector: both lanes are log10(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5log10Dh(half 1.000000e+00) ; scalar half: log10(1.0) folds to +0.0 (0xH0000)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5log10Dv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; half vector: both lanes are log10(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; align 4 matches the other <2 x half> stores in this patch
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5log10d(double 1.000000e+00) ; scalar double: log10(1.0) folds to the constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5log10Dv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; double vector: both lanes are log10(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5log10f(float)
+declare <2 x float>  @_Z5log10Dv2_f(<2 x float>)
+declare half         @_Z5log10Dh(half)
+declare <2 x half>   @_Z5log10Dv2_Dh(<2 x half>)
+declare double       @_Z5log10d(double)
+declare <2 x double> @_Z5log10Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
new file mode 100644
index 0000000000000..39aad939a395f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4log2f(float 1.000000e+00) ; scalar float: log2(1.0) folds to the constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4log2Dv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; float vector: both lanes are log2(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4log2Dh(half 1.000000e+00) ; scalar half: log2(1.0) folds to +0.0 (0xH0000)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4log2Dv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; half vector: both lanes are log2(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; align 4 matches the other <2 x half> stores in this patch
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4log2d(double 1.000000e+00) ; scalar double: log2(1.0) folds to the constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4log2Dv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; double vector: both lanes are log2(1.0) = 0.0, so the folded result is zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4log2f(float)
+declare <2 x float>  @_Z4log2Dv2_f(<2 x float>)
+declare half         @_Z4log2Dh(half)
+declare <2 x half>   @_Z4log2Dv2_Dh(<2 x half>)
+declare double       @_Z4log2d(double)
+declare <2 x double> @_Z4log2Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
new file mode 100644
index 0000000000000..87a2d67e48532
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5rsqrtf(float 1.000000e+00) ; scalar float: rsqrt(1.0) folds to the constant 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; float vector: both lanes are rsqrt(1.0) = 1.0, so the folded result is a splat
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5rsqrtDh(half 1.000000e+00) ; scalar half: rsqrt(1.0) folds to 1.0 (0xH3C00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5rsqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; half vector: both lanes are rsqrt(1.0) = 1.0 (0xH3C00), so the folded result is a splat
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; align 4 matches the other <2 x half> stores in this patch
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5rsqrtd(double 1.000000e+00) ; scalar double: rsqrt(1.0) folds to the constant 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5rsqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; double vector: both lanes are rsqrt(1.0) = 1.0, so the folded result is a splat
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5rsqrtf(float)
+declare <2 x float>  @_Z5rsqrtDv2_f(<2 x float>)
+declare half         @_Z5rsqrtDh(half)
+declare <2 x half>   @_Z5rsqrtDv2_Dh(<2 x half>)
+declare double       @_Z5rsqrtd(double)
+declare <2 x double> @_Z5rsqrtDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
new file mode 100644
index 0000000000000..8016e93aadca8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3sinf(float 0.000000e+00) ; scalar float: sin(0.0) folds to the constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3sinDv2_f(<2 x float> zeroinitializer) ; float vector: a zeroinitializer (constant aggregate zero) operand is folded too, sin(0.0) = 0.0 per lane
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3sinDh(half 0.000000e+00) ; scalar half: sin(0.0) folds to +0.0 (0xH0000)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3sinDv2_Dh(<2 x half> zeroinitializer) ; half vector: a zeroinitializer (constant aggregate zero) operand is folded too, sin(0.0) = 0.0 per lane
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; align 4 matches the other <2 x half> stores in this patch
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3sind(double 0.000000e+00) ; scalar double: sin(0.0) folds to the constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3sinDv2_d(<2 x double> zeroinitializer) ; double vector: a zeroinitializer (constant aggregate zero) operand is folded too, sin(0.0) = 0.0 per lane
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3sinf(float)
+declare <2 x float>  @_Z3sinDv2_f(<2 x float>)
+declare half         @_Z3sinDh(half)
+declare <2 x half>   @_Z3sinDv2_Dh(<2 x half>)
+declare double       @_Z3sind(double)
+declare <2 x double> @_Z3sinDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
new file mode 100644
index 0000000000000..30674c94d6d4e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4sinhf(float 0.000000e+00) ; sinh(0.0): call is expected to fold to constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4sinhDv2_f(<2 x float> zeroinitializer) ; sinh(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4sinhDh(half 0.000000e+00) ; sinh(0.0): call is expected to fold to half 0xH0000 (0.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4sinhDv2_Dh(<2 x half> zeroinitializer) ; sinh(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4sinhd(double 0.000000e+00) ; sinh(0.0): call is expected to fold to constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4sinhDv2_d(<2 x double> zeroinitializer) ; sinh(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4sinhf(float)
+declare <2 x float>  @_Z4sinhDv2_f(<2 x float>)
+declare half         @_Z4sinhDh(half)
+declare <2 x half>   @_Z4sinhDv2_Dh(<2 x half>)
+declare double       @_Z4sinhd(double)
+declare <2 x double> @_Z4sinhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
new file mode 100644
index 0000000000000..0695462e3ae20
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5sinpif(float 0.000000e+00) ; sinpi(0.0): call is expected to fold to constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5sinpiDv2_f(<2 x float> zeroinitializer) ; sinpi(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5sinpiDh(half 0.000000e+00) ; sinpi(0.0): call is expected to fold to half 0xH0000 (0.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5sinpiDv2_Dh(<2 x half> zeroinitializer) ; sinpi(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5sinpid(double 0.000000e+00) ; sinpi(0.0): call is expected to fold to constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5sinpiDv2_d(<2 x double> zeroinitializer) ; sinpi(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5sinpif(float)
+declare <2 x float>  @_Z5sinpiDv2_f(<2 x float>)
+declare half         @_Z5sinpiDh(half)
+declare <2 x half>   @_Z5sinpiDv2_Dh(<2 x half>)
+declare double       @_Z5sinpid(double)
+declare <2 x double> @_Z5sinpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
new file mode 100644
index 0000000000000..df302072f48cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4sqrtf(float 1.000000e+00) ; sqrt(1.0): call is expected to fold to constant 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4sqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; sqrt(<1.0, 1.0>): call is expected to fold to splat (1.0)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4sqrtDh(half 1.000000e+00) ; sqrt(1.0): call is expected to fold to half 0xH3C00 (1.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4sqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; sqrt(<1.0, 1.0>): call is expected to fold to splat (half 0xH3C00), i.e. 1.0
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4sqrtd(double 1.000000e+00) ; sqrt(1.0): call is expected to fold to constant 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4sqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; sqrt(<1.0, 1.0>): call is expected to fold to splat (1.0)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4sqrtf(float)
+declare <2 x float>  @_Z4sqrtDv2_f(<2 x float>)
+declare half         @_Z4sqrtDh(half)
+declare <2 x half>   @_Z4sqrtDv2_Dh(<2 x half>)
+declare double       @_Z4sqrtd(double)
+declare <2 x double> @_Z4sqrtDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
new file mode 100644
index 0000000000000..10d4c15f697cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3tanf(float 0.000000e+00) ; tan(0.0): call is expected to fold to constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3tanDv2_f(<2 x float> zeroinitializer) ; tan(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3tanDh(half 0.000000e+00) ; tan(0.0): call is expected to fold to half 0xH0000 (0.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3tanDv2_Dh(<2 x half> zeroinitializer) ; tan(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3tand(double 0.000000e+00) ; tan(0.0): call is expected to fold to constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3tanDv2_d(<2 x double> zeroinitializer) ; tan(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3tanf(float)
+declare <2 x float>  @_Z3tanDv2_f(<2 x float>)
+declare half         @_Z3tanDh(half)
+declare <2 x half>   @_Z3tanDv2_Dh(<2 x half>)
+declare double       @_Z3tand(double)
+declare <2 x double> @_Z3tanDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
new file mode 100644
index 0000000000000..dfbf6d77ba664
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4tanhf(float 0.000000e+00) ; tanh(0.0): call is expected to fold to constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4tanhDv2_f(<2 x float> zeroinitializer) ; tanh(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4tanhDh(half 0.000000e+00) ; tanh(0.0): call is expected to fold to half 0xH0000 (0.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4tanhDv2_Dh(<2 x half> zeroinitializer) ; tanh(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4tanhd(double 0.000000e+00) ; tanh(0.0): call is expected to fold to constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4tanhDv2_d(<2 x double> zeroinitializer) ; tanh(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4tanhf(float)
+declare <2 x float>  @_Z4tanhDv2_f(<2 x float>)
+declare half         @_Z4tanhDh(half)
+declare <2 x half>   @_Z4tanhDv2_Dh(<2 x half>)
+declare double       @_Z4tanhd(double)
+declare <2 x double> @_Z4tanhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
new file mode 100644
index 0000000000000..33d7e3199127d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5tanpif(float 0.000000e+00) ; tanpi(0.0): call is expected to fold to constant 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5tanpiDv2_f(<2 x float> zeroinitializer) ; tanpi(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5tanpiDh(half 0.000000e+00) ; tanpi(0.0): call is expected to fold to half 0xH0000 (0.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5tanpiDv2_Dh(<2 x half> zeroinitializer) ; tanpi(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5tanpid(double 0.000000e+00) ; tanpi(0.0): call is expected to fold to constant 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5tanpiDv2_d(<2 x double> zeroinitializer) ; tanpi(<0.0, 0.0>): call is expected to fold to zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5tanpif(float)
+declare <2 x float>  @_Z5tanpiDv2_f(<2 x float>)
+declare half         @_Z5tanpiDh(half)
+declare <2 x half>   @_Z5tanpiDv2_Dh(<2 x half>)
+declare double       @_Z5tanpid(double)
+declare <2 x double> @_Z5tanpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
new file mode 100644
index 0000000000000..8212c49738f9a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z6tgammaf(float 1.000000e+00) ; tgamma(1.0): call is expected to fold to constant 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z6tgammaDv2_f(<2 x float> <float 1.000000e+00, float 2.000000e+00>) ; tgamma(<1.0, 2.0>): both lanes are expected to fold to 1.0, i.e. splat (1.0)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z6tgammaDh(half 1.000000e+00) ; tgamma(1.0): call is expected to fold to half 0xH3C00 (1.0)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z6tgammaDv2_Dh(<2 x half> <half 1.000000e+00, half 2.000000e+00>) ; tgamma(<1.0, 2.0>): both lanes are expected to fold to 1.0, i.e. splat (half 0xH3C00)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z6tgammad(double 1.000000e+00) ; tgamma(1.0): call is expected to fold to constant 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z6tgammaDv2_d(<2 x double> <double 1.000000e+00, double 2.000000e+00>) ; tgamma(<1.0, 2.0>): both lanes are expected to fold to 1.0, i.e. splat (1.0)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6tgammaf(float)
+declare <2 x float>  @_Z6tgammaDv2_f(<2 x float>)
+declare half         @_Z6tgammaDh(half)
+declare <2 x half>   @_Z6tgammaDv2_Dh(<2 x half>)
+declare double       @_Z6tgammad(double)
+declare <2 x double> @_Z6tgammaDv2_d(<2 x double>)