[llvm] [AMDGPU] Add half vector support for table-driven libcall optimization (PR #178638)

Tue Feb 10 05:17:41 PST 2026

https://github.com/steffenlarsen updated https://github.com/llvm/llvm-project/pull/178638

>From e9fc863388d6fe4bcec65a4bc7e72a22e810f8ab Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Wed, 28 Jan 2026 07:58:03 -0600
Subject: [PATCH 1/8] [AMDGPU] Add half vector support for table-driven libcall
 optimization

When replacing certain AMDGPU library calls with constant data vectors,
the existing implementation only handled single- and double-precision
floats. This change extends the functionality to also support
half-precision floats.

Additionally, it refactors the function responsible for generating
constant float data vectors to improve readability and reduce code
duplication. In tandem with this refactoring, the patch relaxes the
check for constant data vectors to include any constant of vector type.
This allows other constant vectors to be processed, such as those
created from constant aggregate zeros (e.g.
`<2 x float> zeroinitializer`).

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp     | 102 +++++++++---------
 .../amdgpu-simplify-libcall-tdo-acos.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-acosh.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-acospi.ll     |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-asin.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-asinh.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-asinpi.ll     |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-atan.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-atanh.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-atanpi.ll     |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-cbrt.ll       |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-cosh.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-cospi.ll      |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-erfc.ll       |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-exp10.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-exp2.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-expm1.ll      |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-log.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-log10.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-log2.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-rsqrt.ll      |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-sinh.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-sinpi.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-sqrt.ll       |  87 +++++++++++++++
 .../AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-tanh.ll       |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-tanpi.ll      |  87 +++++++++++++++
 .../amdgpu-simplify-libcall-tdo-tgamma.ll     |  87 +++++++++++++++
 32 files changed, 2750 insertions(+), 49 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 4a553beb63bb1..84b8ec22ff86a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -743,6 +743,38 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   return false;
 }
 
+static Constant * // Builds a constant FP vector from Values, encoded per ArgType.
+_Z4coshdgetConstantFloatVectorForArgType(LLVMContext &Ctx, // Ctx is not read in the body; Ty->getContext() is used instead.
+                                         AMDGPULibFunc::EType ArgType, // Selects the element encoding (F16, F32 or F64).
+                                         ArrayRef<double> Values, Type *Ty) { // Values: one double per result lane; Ty supplies the scalar type and context.
+  switch (ArgType) {
+  case AMDGPULibFunc::F16: { // Half: convert each double and collect the raw 16-bit patterns.
+    SmallVector<uint16_t, 0> HalfIntValues;
+    for (double D : Values) {
+      APFloat APF16 = APFloat(D); // Starts out holding D as a double-precision APFloat.
+      [[maybe_unused]] bool Unused; // Out-parameter passed to convert(); its value is never inspected here.
+      APF16.convert(llvm::APFloat::IEEEhalf(), // Re-encode in place as IEEE half,
+                    llvm::RoundingMode::NearestTiesToEven, &Unused); // rounding to nearest, ties to even.
+      uint16_t APF16Int = APF16.bitcastToAPInt().getZExtValue(); // Bit pattern of the converted value, narrowed to 16 bits.
+      HalfIntValues.push_back(APF16Int);
+    }
+    ArrayRef<uint16_t> Tmp(HalfIntValues);
+    return ConstantDataVector::getFP(Ty->getScalarType(), Tmp); // Element type comes from Ty's scalar type; Tmp holds raw bits.
+  }
+  case AMDGPULibFunc::F32: { // Float: narrow each double with a plain cast.
+    SmallVector<float, 0> FValues;
+    for (double D : Values)
+      FValues.push_back((float)D);
+    ArrayRef<float> Tmp(FValues);
+    return ConstantDataVector::get(Ty->getContext(), Tmp);
+  }
+  case AMDGPULibFunc::F64: // Double: Values are passed through without conversion.
+    return ConstantDataVector::get(Ty->getContext(), Values);
+  default: // Any other ArgType is treated as unreachable.
+    llvm_unreachable("Unsupported argument type");
+  }
+}
+
 bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
   // Table-Driven optimization
   const TableRef tr = getOptTable(FInfo.getId());
@@ -753,39 +785,26 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
   Value *opr0 = CI->getArgOperand(0);
 
   if (getVecSize(FInfo) > 1) {
-    if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(opr0)) {
-      SmallVector<double, 0> DVal;
+    // Vector version
+    Constant *CV = dyn_cast<Constant>(opr0);
+    if (CV && CV->getType()->isVectorTy()) {
+      SmallVector<double, 0> DValues;
       for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
-        ConstantFP *eltval = dyn_cast<ConstantFP>(
-                               CV->getElementAsConstant((unsigned)eltNo));
+        ConstantFP *eltval =
+            dyn_cast<ConstantFP>(CV->getAggregateElement((unsigned)eltNo));
         assert(eltval && "Non-FP arguments in math function!");
-        bool found = false;
-        for (int i=0; i < sz; ++i) {
-          if (eltval->isExactlyValue(tr[i].input)) {
-            DVal.push_back(tr[i].result);
-            found = true;
-            break;
-          }
-        }
-        if (!found) {
-          // This vector constants not handled yet.
+        auto MatchingRow = std::find_if(
+            tr.begin(), tr.end(), [eltval](const TableEntry &entry) {
+              return eltval->isExactlyValue(entry.input);
+            });
+        if (MatchingRow == tr.end())
           return false;
-        }
+        DValues.push_back(MatchingRow->result);
       }
-      LLVMContext &context = CI->getContext();
-      Constant *nval;
-      if (getArgType(FInfo) == AMDGPULibFunc::F32) {
-        SmallVector<float, 0> FVal;
-        for (double D : DVal)
-          FVal.push_back((float)D);
-        ArrayRef<float> tmp(FVal);
-        nval = ConstantDataVector::get(context, tmp);
-      } else { // F64
-        ArrayRef<double> tmp(DVal);
-        nval = ConstantDataVector::get(context, tmp);
-      }
-      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
-      replaceCall(CI, nval);
+      Constant *NewValues = _Z4coshdgetConstantFloatVectorForArgType(
+          CI->getContext(), getArgType(FInfo), DValues, CI->getType());
+      LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
+      replaceCall(CI, NewValues);
       return true;
     }
   } else {
@@ -1592,26 +1611,11 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     if (hasTwoResults)
       nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
   } else {
-    if (getArgType(FInfo) == AMDGPULibFunc::F32) {
-      SmallVector <float, 0> FVal0, FVal1;
-      for (int i = 0; i < FuncVecSize; ++i)
-        FVal0.push_back((float)DVal0[i]);
-      ArrayRef<float> tmp0(FVal0);
-      nval0 = ConstantDataVector::get(context, tmp0);
-      if (hasTwoResults) {
-        for (int i = 0; i < FuncVecSize; ++i)
-          FVal1.push_back((float)DVal1[i]);
-        ArrayRef<float> tmp1(FVal1);
-        nval1 = ConstantDataVector::get(context, tmp1);
-      }
-    } else {
-      ArrayRef<double> tmp0(DVal0);
-      nval0 = ConstantDataVector::get(context, tmp0);
-      if (hasTwoResults) {
-        ArrayRef<double> tmp1(DVal1);
-        nval1 = ConstantDataVector::get(context, tmp1);
-      }
-    }
+    nval0 = _Z4coshdgetConstantFloatVectorForArgType(context, getArgType(FInfo),
+                                                     DVal0, aCI->getType());
+    if (hasTwoResults)
+      nval1 = _Z4coshdgetConstantFloatVectorForArgType(
+          context, getArgType(FInfo), DVal1, aCI->getType());
   }
 
   if (hasTwoResults) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
new file mode 100644
index 0000000000000..64c8c8186b7ce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0x3FF921FB60000000, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4acosf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0x3FF921FB60000000, float 0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4acosDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3E48, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4acosDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3E48, half 0xH0000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4acosDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0x3FF921FB54442D18, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4acosd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_acos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0x3FF921FB54442D18, double 0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4acosDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4acosf(float)
+declare <2 x float>  @_Z4acosDv2_f(<2 x float>)
+declare half         @_Z4acosDh(half)
+declare <2 x half>   @_Z4acosDv2_Dh(<2 x half>)
+declare double       @_Z4acosd(double)
+declare <2 x double> @_Z4acosDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
new file mode 100644
index 0000000000000..7c13788f5ee60
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5acoshf(float 1.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5acoshDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5acoshDh(half 1.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5acoshDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5acoshd(double 1.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_acosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5acoshDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5acoshf(float)
+declare <2 x float>  @_Z5acoshDv2_f(<2 x float>)
+declare half         @_Z5acoshDh(half)
+declare <2 x half>   @_Z5acoshDv2_Dh(<2 x half>)
+declare double       @_Z5acoshd(double)
+declare <2 x double> @_Z5acoshDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
new file mode 100644
index 0000000000000..b350fe5957ac7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 5.000000e-01, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z6acospif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 5.000000e-01, float 0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z6acospiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3800, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z6acospiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3800, half 0xH0000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z6acospiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 5.000000e-01, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z6acospid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_acospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 5.000000e-01, double 0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z6acospiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6acospif(float)
+declare <2 x float>  @_Z6acospiDv2_f(<2 x float>)
+declare half         @_Z6acospiDh(half)
+declare <2 x half>   @_Z6acospiDv2_Dh(<2 x half>)
+declare double       @_Z6acospid(double)
+declare <2 x double> @_Z6acospiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
new file mode 100644
index 0000000000000..d94827e037dd4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4asinf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 0x3FF921FB60000000>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4asinDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4asinDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3E48>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4asinDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4asind(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_asin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 0x3FF921FB54442D18>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4asinDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4asinf(float)
+declare <2 x float>  @_Z4asinDv2_f(<2 x float>)
+declare half         @_Z4asinDh(half)
+declare <2 x half>   @_Z4asinDv2_Dh(<2 x half>)
+declare double       @_Z4asind(double)
+declare <2 x double> @_Z4asinDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
new file mode 100644
index 0000000000000..a8b72c758927e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5asinhf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5asinhDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5asinhDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5asinhDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5asinhd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_asinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5asinhDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5asinhf(float)
+declare <2 x float>  @_Z5asinhDv2_f(<2 x float>)
+declare half         @_Z5asinhDh(half)
+declare <2 x half>   @_Z5asinhDv2_Dh(<2 x half>)
+declare double       @_Z5asinhd(double)
+declare <2 x double> @_Z5asinhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
new file mode 100644
index 0000000000000..ed2d4bdb8875c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z6asinpif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 5.000000e-01>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z6asinpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z6asinpiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3800>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z6asinpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z6asinpid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_asinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 5.000000e-01>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z6asinpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6asinpif(float)
+declare <2 x float>  @_Z6asinpiDv2_f(<2 x float>)
+declare half         @_Z6asinpiDh(half)
+declare <2 x half>   @_Z6asinpiDv2_Dh(<2 x half>)
+declare double       @_Z6asinpid(double)
+declare <2 x double> @_Z6asinpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
new file mode 100644
index 0000000000000..aa74d2228fd1d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; atan(0.0) == 0.0: the constant-argument call should fold to a direct store of 0.0.
+entry:
+  %c = call float @_Z4atanf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 0x3FE921FB60000000>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: atan(0.0) == 0.0 and atan(1.0) == pi/4 (0x3FE921FB60000000 is pi/4 rounded to float).
+entry:
+  %c = call <2 x float> @_Z4atanDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision atan(0.0) == 0.0; 0xH0000 is the IEEE half encoding of +0.0.
+entry:
+  %c = call half @_Z4atanDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3A48>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: 0xH0000 is +0.0 and 0xH3A48 is pi/4 rounded to half (0.78515625).
+entry:
+  %c = call <2 x half> @_Z4atanDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision atan(0.0) == 0.0: the call should fold to a direct store of 0.0.
+entry:
+  %c = call double @_Z4atand(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_atan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 0x3FE921FB54442D18>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: atan(0.0) == 0.0 and atan(1.0) == pi/4 (0x3FE921FB54442D18 is pi/4 as a double).
+entry:
+  %c = call <2 x double> @_Z4atanDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4atanf(float)
+declare <2 x float>  @_Z4atanDv2_f(<2 x float>)
+declare half         @_Z4atanDh(half)
+declare <2 x half>   @_Z4atanDv2_Dh(<2 x half>)
+declare double       @_Z4atand(double)
+declare <2 x double> @_Z4atanDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
new file mode 100644
index 0000000000000..c1eac02bcb83d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; atanh(0.0) == 0.0: the constant-argument call should fold to a direct store of 0.0.
+entry:
+  %c = call float @_Z5atanhf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: atanh(0.0) == 0.0 and atanh(-0.0) == -0.0, so the sign of zero is kept.
+entry:
+  %c = call <2 x float> @_Z5atanhDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision atanh(0.0) == 0.0; 0xH0000 is the IEEE half encoding of +0.0.
+entry:
+  %c = call half @_Z5atanhDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: atanh(0.0) == +0.0 (0xH0000) and atanh(-0.0) == -0.0 (0xH8000).
+entry:
+  %c = call <2 x half> @_Z5atanhDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision atanh(0.0) == 0.0: the call should fold to a direct store of 0.0.
+entry:
+  %c = call double @_Z5atanhd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_atanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: atanh(0.0) == 0.0 and atanh(-0.0) == -0.0, so the sign of zero is kept.
+entry:
+  %c = call <2 x double> @_Z5atanhDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5atanhf(float)
+declare <2 x float>  @_Z5atanhDv2_f(<2 x float>)
+declare half         @_Z5atanhDh(half)
+declare <2 x half>   @_Z5atanhDv2_Dh(<2 x half>)
+declare double       @_Z5atanhd(double)
+declare <2 x double> @_Z5atanhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
new file mode 100644
index 0000000000000..0b985d07935c4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; atanpi(0.0) == 0.0: the constant-argument call should fold to a direct store of 0.0.
+entry:
+  %c = call float @_Z6atanpif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 2.500000e-01>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: atanpi(0.0) == 0.0 and atanpi(1.0) == 0.25 (atan(1)/pi).
+entry:
+  %c = call <2 x float> @_Z6atanpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision atanpi(0.0) == 0.0; 0xH0000 is the IEEE half encoding of +0.0.
+entry:
+  %c = call half @_Z6atanpiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3400>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: atanpi(0.0) == +0.0 (0xH0000) and atanpi(1.0) == 0.25 (0xH3400).
+entry:
+  %c = call <2 x half> @_Z6atanpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision atanpi(0.0) == 0.0: the call should fold to a direct store of 0.0.
+entry:
+  %c = call double @_Z6atanpid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_atanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 2.500000e-01>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: atanpi(0.0) == 0.0 and atanpi(1.0) == 0.25 (atan(1)/pi).
+entry:
+  %c = call <2 x double> @_Z6atanpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6atanpif(float)
+declare <2 x float>  @_Z6atanpiDv2_f(<2 x float>)
+declare half         @_Z6atanpiDh(half)
+declare <2 x half>   @_Z6atanpiDv2_Dh(<2 x half>)
+declare double       @_Z6atanpid(double)
+declare <2 x double> @_Z6atanpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
new file mode 100644
index 0000000000000..e99d5fee0f988
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; cbrt(1.0) == 1.0: the constant-argument call should fold to a direct store of 1.0.
+entry:
+  %c = call float @_Z4cbrtf(float 1.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float -1.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: cbrt(1.0) == 1.0 and cbrt(-1.0) == -1.0 (negative inputs are valid for cbrt).
+entry:
+  %c = call <2 x float> @_Z4cbrtDv2_f(<2 x float> <float 1.000000e+00, float -1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision cbrt(1.0) == 1.0; 0xH3C00 is the IEEE half encoding of 1.0.
+entry:
+  %c = call half @_Z4cbrtDh(half 1.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xHBC00>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: cbrt(1.0) == 1.0 (0xH3C00) and cbrt(-1.0) == -1.0 (0xHBC00).
+entry:
+  %c = call <2 x half> @_Z4cbrtDv2_Dh(<2 x half> <half 1.000000e+00, half -1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision cbrt(1.0) == 1.0: the call should fold to a direct store of 1.0.
+entry:
+  %c = call double @_Z4cbrtd(double 1.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cbrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cbrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double -1.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: cbrt(1.0) == 1.0 and cbrt(-1.0) == -1.0 (negative inputs are valid for cbrt).
+entry:
+  %c = call <2 x double> @_Z4cbrtDv2_d(<2 x double> <double 1.000000e+00, double -1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4cbrtf(float)
+declare <2 x float>  @_Z4cbrtDv2_f(<2 x float>)
+declare half         @_Z4cbrtDh(half)
+declare <2 x half>   @_Z4cbrtDv2_Dh(<2 x half>)
+declare double       @_Z4cbrtd(double)
+declare <2 x double> @_Z4cbrtDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
new file mode 100644
index 0000000000000..0998ae8ab3b25
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; cos(0.0) == 1.0: the constant-argument call should fold to a direct store of 1.0.
+entry:
+  %c = call float @_Z3cosf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: cos(0.0) and cos(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x float> @_Z3cosDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision cos(0.0) == 1.0; 0xH3C00 is the IEEE half encoding of 1.0.
+entry:
+  %c = call half @_Z3cosDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: cos(0.0) and cos(-0.0) are both 1.0 (0xH3C00), hence the splat.
+entry:
+  %c = call <2 x half> @_Z3cosDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision cos(0.0) == 1.0: the call should fold to a direct store of 1.0.
+entry:
+  %c = call double @_Z3cosd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cos(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cos(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: cos(0.0) and cos(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x double> @_Z3cosDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3cosf(float)
+declare <2 x float>  @_Z3cosDv2_f(<2 x float>)
+declare half         @_Z3cosDh(half)
+declare <2 x half>   @_Z3cosDv2_Dh(<2 x half>)
+declare double       @_Z3cosd(double)
+declare <2 x double> @_Z3cosDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
new file mode 100644
index 0000000000000..c151ed27ca197
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; cosh(0.0) == 1.0: the constant-argument call should fold to a direct store of 1.0.
+entry:
+  %c = call float @_Z4coshf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: cosh(0.0) and cosh(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x float> @_Z4coshDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision cosh(0.0) == 1.0; 0xH3C00 is the IEEE half encoding of 1.0.
+entry:
+  %c = call half @_Z4coshDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: cosh(0.0) and cosh(-0.0) are both 1.0 (0xH3C00), hence the splat.
+entry:
+  %c = call <2 x half> @_Z4coshDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision cosh(0.0) == 1.0: the call should fold to a direct store of 1.0.
+entry:
+  %c = call double @_Z4coshd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cosh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cosh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: cosh(0.0) and cosh(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x double> @_Z4coshDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4coshf(float)
+declare <2 x float>  @_Z4coshDv2_f(<2 x float>)
+declare half         @_Z4coshDh(half)
+declare <2 x half>   @_Z4coshDv2_Dh(<2 x half>)
+declare double       @_Z4coshd(double)
+declare <2 x double> @_Z4coshDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
new file mode 100644
index 0000000000000..4dce71bc5a5df
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; cospi(0.0) == 1.0: the constant-argument call should fold to a direct store of 1.0.
+entry:
+  %c = call float @_Z5cospif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: cospi(0.0) and cospi(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x float> @_Z5cospiDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision cospi(0.0) == 1.0; 0xH3C00 is the IEEE half encoding of 1.0.
+entry:
+  %c = call half @_Z5cospiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: cospi(0.0) and cospi(-0.0) are both 1.0 (0xH3C00), hence the splat.
+entry:
+  %c = call <2 x half> @_Z5cospiDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision cospi(0.0) == 1.0: the call should fold to a direct store of 1.0.
+entry:
+  %c = call double @_Z5cospid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_cospi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cospi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: cospi(0.0) and cospi(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x double> @_Z5cospiDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5cospif(float)
+declare <2 x float>  @_Z5cospiDv2_f(<2 x float>)
+declare half         @_Z5cospiDh(half)
+declare <2 x half>   @_Z5cospiDv2_Dh(<2 x half>)
+declare double       @_Z5cospid(double)
+declare <2 x double> @_Z5cospiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
new file mode 100644
index 0000000000000..ea54a59dcf907
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; erf(0.0) == 0.0: the constant-argument call should fold to a direct store of 0.0.
+entry:
+  %c = call float @_Z3erff(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: erf(0.0) == 0.0 and erf(-0.0) == -0.0, so the sign of zero is kept.
+entry:
+  %c = call <2 x float> @_Z3erfDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision erf(0.0) == 0.0; 0xH0000 is the IEEE half encoding of +0.0.
+entry:
+  %c = call half @_Z3erfDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: erf(0.0) == +0.0 (0xH0000) and erf(-0.0) == -0.0 (0xH8000).
+entry:
+  %c = call <2 x half> @_Z3erfDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision erf(0.0) == 0.0: the call should fold to a direct store of 0.0.
+entry:
+  %c = call double @_Z3erfd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_erf(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_erf(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: erf(0.0) == 0.0 and erf(-0.0) == -0.0, so the sign of zero is kept.
+entry:
+  %c = call <2 x double> @_Z3erfDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3erff(float)
+declare <2 x float>  @_Z3erfDv2_f(<2 x float>)
+declare half         @_Z3erfDh(half)
+declare <2 x half>   @_Z3erfDv2_Dh(<2 x half>)
+declare double       @_Z3erfd(double)
+declare <2 x double> @_Z3erfDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
new file mode 100644
index 0000000000000..1b1c18940987e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; erfc(0.0) == 1.0: the constant-argument call should fold to a direct store of 1.0.
+entry:
+  %c = call float @_Z4erfcf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Per-lane fold: erfc(0.0) and erfc(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x float> @_Z4erfcDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+; Half-precision erfc(0.0) == 1.0; 0xH3C00 is the IEEE half encoding of 1.0.
+entry:
+  %c = call half @_Z4erfcDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+; Per-lane fold for half: erfc(0.0) and erfc(-0.0) are both 1.0 (0xH3C00), hence the splat.
+entry:
+  %c = call <2 x half> @_Z4erfcDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+; Double-precision erfc(0.0) == 1.0: the call should fold to a direct store of 1.0.
+entry:
+  %c = call double @_Z4erfcd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_erfc(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_erfc(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+; Per-lane fold: erfc(0.0) and erfc(-0.0) are both 1.0, so the result prints as a splat of 1.0.
+entry:
+  %c = call <2 x double> @_Z4erfcDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4erfcf(float)
+declare <2 x float>  @_Z4erfcDv2_f(<2 x float>)
+declare half         @_Z4erfcDh(half)
+declare <2 x half>   @_Z4erfcDv2_Dh(<2 x half>)
+declare double       @_Z4erfcd(double)
+declare <2 x double> @_Z4erfcDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
new file mode 100644
index 0000000000000..de29931287665
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3expf(float 0.000000e+00) ; float exp of constant 0.0; CHECK lines expect no call to remain, only a store of float 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 0x4005BF0A80000000>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3expDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>) ; <2 x float> exp of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of a per-lane constant vector
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3expDh(half 0.000000e+00) ; half exp of constant 0.0; CHECK lines expect no call to remain, only a store of half 0xH3C00
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4170>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3expDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>) ; <2 x half> exp of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <0xH3C00, 0xH4170>
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3expd(double 0.000000e+00) ; double exp of constant 0.0; CHECK lines expect no call to remain, only a store of double 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_exp(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 0x4005BF0A8B145769>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3expDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>) ; <2 x double> exp of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of a per-lane constant vector
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3expf(float)
+declare <2 x float>  @_Z3expDv2_f(<2 x float>)
+declare half         @_Z3expDh(half)
+declare <2 x half>   @_Z3expDv2_Dh(<2 x half>)
+declare double       @_Z3expd(double)
+declare <2 x double> @_Z3expDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
new file mode 100644
index 0000000000000..7aea2fd4763b0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5exp10f(float 0.000000e+00) ; float exp10 of constant 0.0; CHECK lines expect no call to remain, only a store of float 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 1.000000e+01>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5exp10Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>) ; <2 x float> exp10 of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <1.0, 10.0>
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5exp10Dh(half 0.000000e+00) ; half exp10 of constant 0.0; CHECK lines expect no call to remain, only a store of half 0xH3C00
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4900>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5exp10Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>) ; <2 x half> exp10 of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <0xH3C00, 0xH4900>
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5exp10d(double 0.000000e+00) ; double exp10 of constant 0.0; CHECK lines expect no call to remain, only a store of double 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_exp10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 1.000000e+01>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5exp10Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>) ; <2 x double> exp10 of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <1.0, 10.0>
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5exp10f(float)
+declare <2 x float>  @_Z5exp10Dv2_f(<2 x float>)
+declare half         @_Z5exp10Dh(half)
+declare <2 x half>   @_Z5exp10Dv2_Dh(<2 x half>)
+declare double       @_Z5exp10d(double)
+declare <2 x double> @_Z5exp10Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
new file mode 100644
index 0000000000000..2021346d0cac1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4exp2f(float 0.000000e+00) ; float exp2 of constant 0.0; CHECK lines expect no call to remain, only a store of float 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 2.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4exp2Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>) ; <2 x float> exp2 of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <1.0, 2.0>
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4exp2Dh(half 0.000000e+00) ; half exp2 of constant 0.0; CHECK lines expect no call to remain, only a store of half 0xH3C00
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4exp2Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>) ; <2 x half> exp2 of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <0xH3C00, 0xH4000>
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4exp2d(double 0.000000e+00) ; double exp2 of constant 0.0; CHECK lines expect no call to remain, only a store of double 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_exp2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 2.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4exp2Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>) ; <2 x double> exp2 of constant lanes <0.0, 1.0>; CHECK lines expect no call to remain, only a store of <1.0, 2.0>
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4exp2f(float)
+declare <2 x float>  @_Z4exp2Dv2_f(<2 x float>)
+declare half         @_Z4exp2Dh(half)
+declare <2 x half>   @_Z4exp2Dv2_Dh(<2 x half>)
+declare double       @_Z4exp2d(double)
+declare <2 x double> @_Z4exp2Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
new file mode 100644
index 0000000000000..0c08241a9e575
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5expm1f(float 0.000000e+00) ; float expm1 of constant 0.0; CHECK lines expect no call to remain, only a store of float 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5expm1Dv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>) ; <2 x float> expm1 of constant lanes <0.0, -0.0>; CHECK lines expect no call to remain and the stored constant to keep each lane's zero sign
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5expm1Dh(half 0.000000e+00) ; half expm1 of constant 0.0; CHECK lines expect no call to remain, only a store of half 0xH0000
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5expm1Dv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>) ; <2 x half> expm1 of constant lanes <0.0, -0.0>; CHECK lines expect no call to remain, only a store of <0xH0000, 0xH8000>
+  store <2 x half> %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5expm1d(double 0.000000e+00) ; double expm1 of constant 0.0; CHECK lines expect no call to remain, only a store of double 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_expm1(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_expm1(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5expm1Dv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>) ; <2 x double> expm1 of constant lanes <0.0, -0.0>; CHECK lines expect no call to remain and the stored constant to keep each lane's zero sign
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5expm1f(float)
+declare <2 x float>  @_Z5expm1Dv2_f(<2 x float>)
+declare half         @_Z5expm1Dh(half)
+declare <2 x half>   @_Z5expm1Dv2_Dh(<2 x half>)
+declare double       @_Z5expm1d(double)
+declare <2 x double> @_Z5expm1Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
new file mode 100644
index 0000000000000..305da227ad78f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3logf(float 1.000000e+00) ; float log of constant 1.0; CHECK lines expect no call to remain, only a store of float 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3logDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; <2 x float> log of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3logDh(half 1.000000e+00) ; half log of constant 1.0; CHECK lines expect no call to remain, only a store of half 0xH0000
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3logDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; <2 x half> log of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; natural 4-byte alignment of <2 x half>; %out carries no stronger alignment guarantee
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3logd(double 1.000000e+00) ; double log of constant 1.0; CHECK lines expect no call to remain, only a store of double 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_log(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3logDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; <2 x double> log of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3logf(float)
+declare <2 x float>  @_Z3logDv2_f(<2 x float>)
+declare half         @_Z3logDh(half)
+declare <2 x half>   @_Z3logDv2_Dh(<2 x half>)
+declare double       @_Z3logd(double)
+declare <2 x double> @_Z3logDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
new file mode 100644
index 0000000000000..6fb830efb93c3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5log10f(float 1.000000e+00) ; float log10 of constant 1.0; CHECK lines expect no call to remain, only a store of float 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5log10Dv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; <2 x float> log10 of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5log10Dh(half 1.000000e+00) ; half log10 of constant 1.0; CHECK lines expect no call to remain, only a store of half 0xH0000
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5log10Dv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; <2 x half> log10 of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; natural 4-byte alignment of <2 x half>; %out carries no stronger alignment guarantee
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5log10d(double 1.000000e+00) ; double log10 of constant 1.0; CHECK lines expect no call to remain, only a store of double 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_log10(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log10(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5log10Dv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; <2 x double> log10 of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5log10f(float)
+declare <2 x float>  @_Z5log10Dv2_f(<2 x float>)
+declare half         @_Z5log10Dh(half)
+declare <2 x half>   @_Z5log10Dv2_Dh(<2 x half>)
+declare double       @_Z5log10d(double)
+declare <2 x double> @_Z5log10Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
new file mode 100644
index 0000000000000..39aad939a395f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4log2f(float 1.000000e+00) ; float log2 of constant 1.0; CHECK lines expect no call to remain, only a store of float 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4log2Dv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; <2 x float> log2 of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4log2Dh(half 1.000000e+00) ; half log2 of constant 1.0; CHECK lines expect no call to remain, only a store of half 0xH0000
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4log2Dv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; <2 x half> log2 of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; natural 4-byte alignment of <2 x half>; %out carries no stronger alignment guarantee
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4log2d(double 1.000000e+00) ; double log2 of constant 1.0; CHECK lines expect no call to remain, only a store of double 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_log2(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log2(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4log2Dv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; <2 x double> log2 of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4log2f(float)
+declare <2 x float>  @_Z4log2Dv2_f(<2 x float>)
+declare half         @_Z4log2Dh(half)
+declare <2 x half>   @_Z4log2Dv2_Dh(<2 x half>)
+declare double       @_Z4log2d(double)
+declare <2 x double> @_Z4log2Dv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
new file mode 100644
index 0000000000000..87a2d67e48532
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5rsqrtf(float 1.000000e+00) ; float rsqrt of constant 1.0; CHECK lines expect no call to remain, only a store of float 1.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>) ; <2 x float> rsqrt of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of splat (float 1.0)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5rsqrtDh(half 1.000000e+00) ; half rsqrt of constant 1.0; CHECK lines expect no call to remain, only a store of half 0xH3C00
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5rsqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>) ; <2 x half> rsqrt of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of splat (half 0xH3C00)
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; natural 4-byte alignment of <2 x half>; %out carries no stronger alignment guarantee
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5rsqrtd(double 1.000000e+00) ; double rsqrt of constant 1.0; CHECK lines expect no call to remain, only a store of double 1.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_rsqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_rsqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5rsqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>) ; <2 x double> rsqrt of constant lanes <1.0, 1.0>; CHECK lines expect no call to remain, only a store of splat (double 1.0)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5rsqrtf(float)
+declare <2 x float>  @_Z5rsqrtDv2_f(<2 x float>)
+declare half         @_Z5rsqrtDh(half)
+declare <2 x half>   @_Z5rsqrtDv2_Dh(<2 x half>)
+declare double       @_Z5rsqrtd(double)
+declare <2 x double> @_Z5rsqrtDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
new file mode 100644
index 0000000000000..8016e93aadca8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3sinf(float 0.000000e+00) ; float sin of constant 0.0; CHECK lines expect no call to remain, only a store of float 0.0
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3sinDv2_f(<2 x float> zeroinitializer) ; <2 x float> sin of a zeroinitializer vector argument; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3sinDh(half 0.000000e+00) ; half sin of constant 0.0; CHECK lines expect no call to remain, only a store of half 0xH0000
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3sinDv2_Dh(<2 x half> zeroinitializer) ; <2 x half> sin of a zeroinitializer vector argument; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x half> %c, ptr addrspace(1) %out, align 4 ; natural 4-byte alignment of <2 x half>; %out carries no stronger alignment guarantee
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3sind(double 0.000000e+00) ; double sin of constant 0.0; CHECK lines expect no call to remain, only a store of double 0.0
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sin(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sin(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3sinDv2_d(<2 x double> zeroinitializer) ; <2 x double> sin of a zeroinitializer vector argument; CHECK lines expect no call to remain, only a store of zeroinitializer
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3sinf(float)
+declare <2 x float>  @_Z3sinDv2_f(<2 x float>)
+declare half         @_Z3sinDh(half)
+declare <2 x half>   @_Z3sinDv2_Dh(<2 x half>)
+declare double       @_Z3sind(double)
+declare <2 x double> @_Z3sinDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
new file mode 100644
index 0000000000000..30674c94d6d4e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4sinhf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4sinhDv2_f(<2 x float> zeroinitializer)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4sinhDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4sinhDv2_Dh(<2 x half> zeroinitializer)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4sinhd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sinh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sinh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4sinhDv2_d(<2 x double> zeroinitializer)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4sinhf(float)
+declare <2 x float>  @_Z4sinhDv2_f(<2 x float>)
+declare half         @_Z4sinhDh(half)
+declare <2 x half>   @_Z4sinhDv2_Dh(<2 x half>)
+declare double       @_Z4sinhd(double)
+declare <2 x double> @_Z4sinhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
new file mode 100644
index 0000000000000..0695462e3ae20
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5sinpif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5sinpiDv2_f(<2 x float> zeroinitializer)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5sinpiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5sinpiDv2_Dh(<2 x half> zeroinitializer)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5sinpid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sinpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sinpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5sinpiDv2_d(<2 x double> zeroinitializer)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5sinpif(float)
+declare <2 x float>  @_Z5sinpiDv2_f(<2 x float>)
+declare half         @_Z5sinpiDh(half)
+declare <2 x half>   @_Z5sinpiDv2_Dh(<2 x half>)
+declare double       @_Z5sinpid(double)
+declare <2 x double> @_Z5sinpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
new file mode 100644
index 0000000000000..df302072f48cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4sqrtf(float 1.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4sqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4sqrtDh(half 1.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4sqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4sqrtd(double 1.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_sqrt(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sqrt(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4sqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4sqrtf(float)
+declare <2 x float>  @_Z4sqrtDv2_f(<2 x float>)
+declare half         @_Z4sqrtDh(half)
+declare <2 x half>   @_Z4sqrtDv2_Dh(<2 x half>)
+declare double       @_Z4sqrtd(double)
+declare <2 x double> @_Z4sqrtDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
new file mode 100644
index 0000000000000..10d4c15f697cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z3tanf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z3tanDv2_f(<2 x float> zeroinitializer)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z3tanDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z3tanDv2_Dh(<2 x half> zeroinitializer)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z3tand(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tan(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tan(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z3tanDv2_d(<2 x double> zeroinitializer)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z3tanf(float)
+declare <2 x float>  @_Z3tanDv2_f(<2 x float>)
+declare half         @_Z3tanDh(half)
+declare <2 x half>   @_Z3tanDv2_Dh(<2 x half>)
+declare double       @_Z3tand(double)
+declare <2 x double> @_Z3tanDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
new file mode 100644
index 0000000000000..dfbf6d77ba664
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z4tanhf(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z4tanhDv2_f(<2 x float> zeroinitializer)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z4tanhDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z4tanhDv2_Dh(<2 x half> zeroinitializer)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z4tanhd(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tanh(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tanh(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z4tanhDv2_d(<2 x double> zeroinitializer)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z4tanhf(float)
+declare <2 x float>  @_Z4tanhDv2_f(<2 x float>)
+declare half         @_Z4tanhDh(half)
+declare <2 x half>   @_Z4tanhDv2_Dh(<2 x half>)
+declare double       @_Z4tanhd(double)
+declare <2 x double> @_Z4tanhDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
new file mode 100644
index 0000000000000..33d7e3199127d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z5tanpif(float 0.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z5tanpiDv2_f(<2 x float> zeroinitializer)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z5tanpiDh(half 0.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z5tanpiDv2_Dh(<2 x half> zeroinitializer)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z5tanpid(double 0.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tanpi(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tanpi(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z5tanpiDv2_d(<2 x double> zeroinitializer)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z5tanpif(float)
+declare <2 x float>  @_Z5tanpiDv2_f(<2 x float>)
+declare half         @_Z5tanpiDh(half)
+declare <2 x half>   @_Z5tanpiDv2_Dh(<2 x half>)
+declare double       @_Z5tanpid(double)
+declare <2 x double> @_Z5tanpiDv2_d(<2 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
new file mode 100644
index 0000000000000..8212c49738f9a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+define amdgpu_kernel void @test_tdo_scalar_f32_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call float @_Z6tgammaf(float 1.000000e+00)
+  store float %c, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f32_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x float> @_Z6tgammaDv2_f(<2 x float> <float 1.000000e+00, float 2.000000e+00>)
+  store <2 x float> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f16_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call half @_Z6tgammaDh(half 1.000000e+00)
+  store half %c, ptr addrspace(1) %out, align 2
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f16_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x half> @_Z6tgammaDv2_Dh(<2 x half> <half 1.000000e+00, half 2.000000e+00>)
+  store <2 x half> %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_scalar_f64_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call double @_Z6tgammad(double 1.000000e+00)
+  store double %c, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_kernel void @test_tdo_v2_f64_tgamma(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tgamma(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %c = call <2 x double> @_Z6tgammaDv2_d(<2 x double> <double 1.000000e+00, double 2.000000e+00>)
+  store <2 x double> %c, ptr addrspace(1) %out, align 16
+  ret void
+}
+
+declare float        @_Z6tgammaf(float)
+declare <2 x float>  @_Z6tgammaDv2_f(<2 x float>)
+declare half         @_Z6tgammaDh(half)
+declare <2 x half>   @_Z6tgammaDv2_Dh(<2 x half>)
+declare double       @_Z6tgammad(double)
+declare <2 x double> @_Z6tgammaDv2_d(<2 x double>)

>From b9eb7966a7e5b7e14cc44d8b4b7b6c949e6a9089 Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Thu, 29 Jan 2026 09:14:58 -0600
Subject: [PATCH 2/8] Address comments

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp     | 38 ++++++-----
 .../amdgpu-simplify-libcall-tdo-acos.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-acosh.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-acospi.ll     | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-asin.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-asinh.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-asinpi.ll     | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-atan.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-atanh.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-atanpi.ll     | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-cbrt.ll       | 66 +++++++------------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-cosh.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-cospi.ll      | 66 +++++++------------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-erfc.ll       | 66 +++++++------------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-exp10.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-exp2.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-expm1.ll      | 66 +++++++------------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-log.ll | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-log10.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-log2.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-rsqrt.ll      | 66 +++++++------------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-sinh.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-sinpi.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-sqrt.ll       | 66 +++++++------------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-tanh.ll       | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-tanpi.ll      | 66 +++++++------------
 .../amdgpu-simplify-libcall-tdo-tgamma.ll     | 66 +++++++------------
 32 files changed, 764 insertions(+), 1320 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 84b8ec22ff86a..b118d7fb50a98 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -743,16 +743,17 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   return false;
 }
 
-static Constant *
-_Z4coshdgetConstantFloatVectorForArgType(LLVMContext &Ctx,
-                                         AMDGPULibFunc::EType ArgType,
-                                         ArrayRef<double> Values, Type *Ty) {
+static Constant *getConstantFloatVectorForArgType(LLVMContext &Ctx,
+                                                  AMDGPULibFunc::EType ArgType,
+                                                  ArrayRef<double> Values,
+                                                  Type *Ty) {
   switch (ArgType) {
   case AMDGPULibFunc::F16: {
-    SmallVector<uint16_t, 0> HalfIntValues;
+    SmallVector<uint16_t, 4> HalfIntValues;
+    HalfIntValues.reserve(Values.size());
     for (double D : Values) {
       APFloat APF16 = APFloat(D);
-      [[maybe_unused]] bool Unused;
+      bool Unused;
       APF16.convert(llvm::APFloat::IEEEhalf(),
                     llvm::RoundingMode::NearestTiesToEven, &Unused);
       uint16_t APF16Int = APF16.bitcastToAPInt().getZExtValue();
@@ -762,7 +763,8 @@ _Z4coshdgetConstantFloatVectorForArgType(LLVMContext &Ctx,
     return ConstantDataVector::getFP(Ty->getScalarType(), Tmp);
   }
   case AMDGPULibFunc::F32: {
-    SmallVector<float, 0> FValues;
+    SmallVector<float, 4> FValues;
+    FValues.reserve(Values.size());
     for (double D : Values)
       FValues.push_back((float)D);
     ArrayRef<float> Tmp(FValues);
@@ -784,24 +786,24 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
   int const sz = (int)tr.size();
   Value *opr0 = CI->getArgOperand(0);
 
-  if (getVecSize(FInfo) > 1) {
+  int vecSize = getVecSize(FInfo);
+  if (vecSize > 1) {
     // Vector version
     Constant *CV = dyn_cast<Constant>(opr0);
     if (CV && CV->getType()->isVectorTy()) {
-      SmallVector<double, 0> DValues;
-      for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
+      SmallVector<double, 4> DValues(vecSize);
+      for (int eltNo = 0; eltNo < vecSize; ++eltNo) {
         ConstantFP *eltval =
-            dyn_cast<ConstantFP>(CV->getAggregateElement((unsigned)eltNo));
-        assert(eltval && "Non-FP arguments in math function!");
+            cast<ConstantFP>(CV->getAggregateElement((unsigned)eltNo));
         auto MatchingRow = std::find_if(
             tr.begin(), tr.end(), [eltval](const TableEntry &entry) {
               return eltval->isExactlyValue(entry.input);
             });
         if (MatchingRow == tr.end())
           return false;
-        DValues.push_back(MatchingRow->result);
+        DValues[eltNo] = MatchingRow->result;
       }
-      Constant *NewValues = _Z4coshdgetConstantFloatVectorForArgType(
+      Constant *NewValues = getConstantFloatVectorForArgType(
           CI->getContext(), getArgType(FInfo), DValues, CI->getType());
       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
       replaceCall(CI, NewValues);
@@ -1611,11 +1613,11 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     if (hasTwoResults)
       nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
   } else {
-    nval0 = _Z4coshdgetConstantFloatVectorForArgType(context, getArgType(FInfo),
-                                                     DVal0, aCI->getType());
+    nval0 = getConstantFloatVectorForArgType(context, getArgType(FInfo), DVal0,
+                                             aCI->getType());
     if (hasTwoResults)
-      nval1 = _Z4coshdgetConstantFloatVectorForArgType(
-          context, getArgType(FInfo), DVal1, aCI->getType());
+      nval1 = getConstantFloatVectorForArgType(context, getArgType(FInfo),
+                                               DVal1, aCI->getType());
   }
 
   if (hasTwoResults) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
index 64c8c8186b7ce..1fe6ccde628d5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_acos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_acos() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0x3FF921FB60000000, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0x3FF921FB60000000
 ;
 entry:
   %c = call float @_Z4acosf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_acos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_acos() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0x3FF921FB60000000, float 0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0x3FF921FB60000000, float 0.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z4acosDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_acos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_acos() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3E48, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3E48
 ;
 entry:
   %c = call half @_Z4acosDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_acos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_acos() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH3E48, half 0xH0000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH3E48, half 0xH0000>
 ;
 entry:
   %c = call <2 x half> @_Z4acosDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_acos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_acos() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0x3FF921FB54442D18, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0x3FF921FB54442D18
 ;
 entry:
   %c = call double @_Z4acosd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_acos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_acos() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0x3FF921FB54442D18, double 0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0x3FF921FB54442D18, double 0.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z4acosDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4acosf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
index 7c13788f5ee60..39f07af50dba0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acosh.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_acosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_acosh() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_acosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5acoshf(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_acosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_acosh() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_acosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z5acoshDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_acosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_acosh() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_acosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5acoshDh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_acosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_acosh() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_acosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z5acoshDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_acosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_acosh() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_acosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5acoshd(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_acosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_acosh() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_acosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z5acoshDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5acoshf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
index b350fe5957ac7..b279a1a4b5802 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_acospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_acospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_acospi() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 5.000000e-01, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 5.000000e-01
 ;
 entry:
   %c = call float @_Z6acospif(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_acospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_acospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_acospi() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 5.000000e-01, float 0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 5.000000e-01, float 0.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z6acospiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_acospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_acospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_acospi() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3800, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3800
 ;
 entry:
   %c = call half @_Z6acospiDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_acospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_acospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_acospi() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH3800, half 0xH0000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH3800, half 0xH0000>
 ;
 entry:
   %c = call <2 x half> @_Z6acospiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_acospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_acospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_acospi() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 5.000000e-01, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 5.000000e-01
 ;
 entry:
   %c = call double @_Z6acospid(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_acospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_acospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_acospi() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 5.000000e-01, double 0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 5.000000e-01, double 0.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z6acospiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z6acospif(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
index d94827e037dd4..ac69af487485b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_asin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_asin() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z4asinf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_asin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_asin() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 0x3FF921FB60000000>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 0x3FF921FB60000000>
 ;
 entry:
   %c = call <2 x float> @_Z4asinDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_asin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_asin() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z4asinDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_asin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_asin() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3E48>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3E48>
 ;
 entry:
   %c = call <2 x half> @_Z4asinDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_asin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_asin() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z4asind(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_asin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_asin() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 0x3FF921FB54442D18>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 0x3FF921FB54442D18>
 ;
 entry:
   %c = call <2 x double> @_Z4asinDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4asinf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
index a8b72c758927e..161e108f258bb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinh.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_asinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_asinh() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_asinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5asinhf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_asinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_asinh() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_asinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float -0.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z5asinhDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_asinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_asinh() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_asinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5asinhDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_asinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_asinh() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_asinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH8000>
 ;
 entry:
   %c = call <2 x half> @_Z5asinhDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_asinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_asinh() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_asinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5asinhd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_asinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_asinh() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_asinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double -0.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z5asinhDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5asinhf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
index ed2d4bdb8875c..c0efc8b469bd2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_asinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_asinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_asinpi() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z6asinpif(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_asinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_asinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_asinpi() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 5.000000e-01>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 5.000000e-01>
 ;
 entry:
   %c = call <2 x float> @_Z6asinpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_asinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_asinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_asinpi() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z6asinpiDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_asinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_asinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_asinpi() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3800>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3800>
 ;
 entry:
   %c = call <2 x half> @_Z6asinpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_asinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_asinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_asinpi() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z6asinpid(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_asinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_asinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_asinpi() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 5.000000e-01>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 5.000000e-01>
 ;
 entry:
   %c = call <2 x double> @_Z6asinpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z6asinpif(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
index aa74d2228fd1d..e863bcb8fe7f5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_atan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_atan() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z4atanf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_atan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_atan() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 0x3FE921FB60000000>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 0x3FE921FB60000000>
 ;
 entry:
   %c = call <2 x float> @_Z4atanDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_atan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_atan() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z4atanDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_atan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_atan() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3A48>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3A48>
 ;
 entry:
   %c = call <2 x half> @_Z4atanDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_atan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_atan() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z4atand(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_atan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_atan() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 0x3FE921FB54442D18>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 0x3FE921FB54442D18>
 ;
 entry:
   %c = call <2 x double> @_Z4atanDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4atanf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
index c1eac02bcb83d..3d93e335d1baa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanh.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_atanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_atanh() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_atanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5atanhf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_atanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_atanh() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_atanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float -0.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z5atanhDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_atanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_atanh() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_atanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5atanhDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_atanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_atanh() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_atanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH8000>
 ;
 entry:
   %c = call <2 x half> @_Z5atanhDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_atanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_atanh() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_atanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5atanhd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_atanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_atanh() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_atanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double -0.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z5atanhDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5atanhf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
index 0b985d07935c4..448f687a8a85e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_atanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_atanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_atanpi() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z6atanpif(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_atanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_atanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_atanpi() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 2.500000e-01>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 2.500000e-01>
 ;
 entry:
   %c = call <2 x float> @_Z6atanpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_atanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_atanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_atanpi() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z6atanpiDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_atanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_atanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_atanpi() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH3400>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3400>
 ;
 entry:
   %c = call <2 x half> @_Z6atanpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_atanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_atanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_atanpi() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z6atanpid(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_atanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_atanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_atanpi() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double 2.500000e-01>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 2.500000e-01>
 ;
 entry:
   %c = call <2 x double> @_Z6atanpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z6atanpif(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
index e99d5fee0f988..23efb6009d6b6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_cbrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cbrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_cbrt() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z4cbrtf(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_cbrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cbrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_cbrt() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float -1.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float -1.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z4cbrtDv2_f(<2 x float> <float 1.000000e+00, float -1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_cbrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cbrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_cbrt() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z4cbrtDh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_cbrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cbrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_cbrt() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xHBC00>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xHBC00>
 ;
 entry:
   %c = call <2 x half> @_Z4cbrtDv2_Dh(<2 x half> <half 1.000000e+00, half -1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_cbrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cbrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_cbrt() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z4cbrtd(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_cbrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cbrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_cbrt() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double -1.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double -1.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z4cbrtDv2_d(<2 x double> <double 1.000000e+00, double -1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4cbrtf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
index 0998ae8ab3b25..2d6b36b243a11 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cos.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_cos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_cos() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_cos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z3cosf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_cos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_cos() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_cos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z3cosDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_cos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_cos() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_cos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z3cosDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_cos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_cos() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_cos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z3cosDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_cos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_cos() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_cos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z3cosd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_cos(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cos(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_cos() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_cos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z3cosDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z3cosf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
index c151ed27ca197..6f0b4c042387d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cosh.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_cosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_cosh() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_cosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z4coshf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_cosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_cosh() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_cosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z4coshDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_cosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_cosh() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_cosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z4coshDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_cosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_cosh() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_cosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z4coshDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_cosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_cosh() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_cosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z4coshd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_cosh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cosh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_cosh() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_cosh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z4coshDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4coshf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
index 4dce71bc5a5df..886949b7b21e3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cospi.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_cospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_cospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_cospi() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_cospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z5cospif(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_cospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_cospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_cospi() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_cospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z5cospiDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_cospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_cospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_cospi() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_cospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z5cospiDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_cospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_cospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_cospi() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_cospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z5cospiDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_cospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_cospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_cospi() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_cospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z5cospid(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_cospi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_cospi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_cospi() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_cospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z5cospiDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5cospif(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
index ea54a59dcf907..3d1d6c9f644b2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erf.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_erf(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_erf(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_erf() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_erf() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z3erff(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_erf(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_erf(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_erf() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_erf() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float -0.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z3erfDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_erf(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_erf(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_erf() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_erf() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z3erfDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_erf(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_erf(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_erf() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_erf() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH8000>
 ;
 entry:
   %c = call <2 x half> @_Z3erfDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_erf(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_erf(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_erf() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_erf() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z3erfd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_erf(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_erf(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_erf() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_erf() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double -0.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z3erfDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z3erff(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
index 1b1c18940987e..0e0bb9efab575 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-erfc.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_erfc(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_erfc(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_erfc() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_erfc() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z4erfcf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_erfc(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_erfc(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_erfc() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_erfc() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z4erfcDv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_erfc(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_erfc(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_erfc() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_erfc() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z4erfcDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_erfc(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_erfc(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_erfc() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_erfc() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z4erfcDv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_erfc(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_erfc(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_erfc() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_erfc() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z4erfcd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_erfc(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_erfc(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_erfc() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_erfc() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z4erfcDv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4erfcf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
index de29931287665..ca7a8fbece135 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_exp(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_exp() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z3expf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_exp(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_exp() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 0x4005BF0A80000000>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float 0x4005BF0A80000000>
 ;
 entry:
   %c = call <2 x float> @_Z3expDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_exp(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_exp() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z3expDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_exp(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_exp() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4170>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xH4170>
 ;
 entry:
   %c = call <2 x half> @_Z3expDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_exp(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_exp() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z3expd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_exp(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_exp() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 0x4005BF0A8B145769>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 0x4005BF0A8B145769>
 ;
 entry:
   %c = call <2 x double> @_Z3expDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z3expf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
index 7aea2fd4763b0..0b1ce025c9eee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_exp10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_exp10() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z5exp10f(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_exp10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_exp10() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 1.000000e+01>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float 1.000000e+01>
 ;
 entry:
   %c = call <2 x float> @_Z5exp10Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_exp10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_exp10() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z5exp10Dh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_exp10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_exp10() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4900>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xH4900>
 ;
 entry:
   %c = call <2 x half> @_Z5exp10Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_exp10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_exp10() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z5exp10d(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_exp10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_exp10() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 1.000000e+01>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 1.000000e+01>
 ;
 entry:
   %c = call <2 x double> @_Z5exp10Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5exp10f(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
index 2021346d0cac1..707aa9ba60618 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_exp2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_exp2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_exp2() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z4exp2f(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_exp2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_exp2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_exp2() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 1.000000e+00, float 2.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float 2.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z4exp2Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_exp2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_exp2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_exp2() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z4exp2Dh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_exp2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_exp2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_exp2() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH3C00, half 0xH4000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xH4000>
 ;
 entry:
   %c = call <2 x half> @_Z4exp2Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_exp2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_exp2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_exp2() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z4exp2d(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_exp2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_exp2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_exp2() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 1.000000e+00, double 2.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 2.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z4exp2Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4exp2f(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
index 0c08241a9e575..f20312a875d7b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-expm1.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_expm1(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_expm1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_expm1() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_expm1() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5expm1f(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_expm1(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_expm1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_expm1() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_expm1() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float -0.000000e+00>, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float -0.000000e+00>
 ;
 entry:
   %c = call <2 x float> @_Z5expm1Dv2_f(<2 x float> <float 0.000000e+00, float -0.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_expm1(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_expm1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_expm1() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_expm1() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5expm1Dh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_expm1(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_expm1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_expm1() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_expm1() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> <half 0xH0000, half 0xH8000>, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH8000>
 ;
 entry:
   %c = call <2 x half> @_Z5expm1Dv2_Dh(<2 x half> <half 0.000000e+00, half -0.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_expm1(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_expm1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_expm1() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_expm1() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5expm1d(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_expm1(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_expm1(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_expm1() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_expm1() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> <double 0.000000e+00, double -0.000000e+00>, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double -0.000000e+00>
 ;
 entry:
   %c = call <2 x double> @_Z5expm1Dv2_d(<2 x double> <double 0.000000e+00, double -0.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5expm1f(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
index 305da227ad78f..f926e89e1b905 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_log(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_log() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_log() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z3logf(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_log(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_log() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_log() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z3logDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_log(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_log() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_log() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z3logDh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_log(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_log() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_log() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z3logDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_log(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_log() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_log() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z3logd(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_log(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_log() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_log() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z3logDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z3logf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
index 6fb830efb93c3..77656a5242000 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log10.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_log10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_log10() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_log10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5log10f(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_log10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_log10() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_log10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z5log10Dv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_log10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_log10() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_log10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5log10Dh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_log10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_log10() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_log10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z5log10Dv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_log10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_log10() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_log10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5log10d(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_log10(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log10(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_log10() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_log10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z5log10Dv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5log10f(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
index 39aad939a395f..317525c435f52 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-log2.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_log2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_log2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_log2() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_log2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z4log2f(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_log2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_log2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_log2() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_log2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z4log2Dv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_log2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_log2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_log2() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_log2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z4log2Dh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_log2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_log2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_log2() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_log2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z4log2Dv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_log2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_log2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_log2() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_log2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z4log2d(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_log2(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_log2(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_log2() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_log2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z4log2Dv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4log2f(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
index 87a2d67e48532..9d794ce5f46b0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-rsqrt.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_rsqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_rsqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_rsqrt() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_rsqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z5rsqrtf(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_rsqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_rsqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_rsqrt() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_rsqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_rsqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_rsqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_rsqrt() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_rsqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z5rsqrtDh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_rsqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_rsqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_rsqrt() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_rsqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z5rsqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_rsqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_rsqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_rsqrt() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_rsqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z5rsqrtd(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_rsqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_rsqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_rsqrt() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_rsqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z5rsqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5rsqrtf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
index 8016e93aadca8..c1c6f585ebd30 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sin.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_sin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_sin() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_sin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z3sinf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_sin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_sin() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_sin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z3sinDv2_f(<2 x float> zeroinitializer)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_sin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_sin() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_sin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z3sinDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_sin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_sin() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_sin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z3sinDv2_Dh(<2 x half> zeroinitializer)
-  store <2 x half> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_sin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_sin() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_sin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z3sind(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_sin(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sin(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_sin() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_sin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z3sinDv2_d(<2 x double> zeroinitializer)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z3sinf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
index 30674c94d6d4e..f45474ddee859 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinh.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_sinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_sinh() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_sinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z4sinhf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_sinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_sinh() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_sinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z4sinhDv2_f(<2 x float> zeroinitializer)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_sinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_sinh() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_sinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z4sinhDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_sinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_sinh() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_sinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z4sinhDv2_Dh(<2 x half> zeroinitializer)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_sinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_sinh() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_sinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z4sinhd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_sinh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sinh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_sinh() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_sinh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z4sinhDv2_d(<2 x double> zeroinitializer)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4sinhf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
index 0695462e3ae20..250abaa28f51d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sinpi.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_sinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_sinpi() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_sinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5sinpif(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_sinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_sinpi() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_sinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z5sinpiDv2_f(<2 x float> zeroinitializer)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_sinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_sinpi() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_sinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5sinpiDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_sinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_sinpi() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_sinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z5sinpiDv2_Dh(<2 x half> zeroinitializer)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_sinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_sinpi() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_sinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5sinpid(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_sinpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sinpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_sinpi() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_sinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z5sinpiDv2_d(<2 x double> zeroinitializer)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5sinpif(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
index df302072f48cc..ce495cde3df3f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_sqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_sqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_sqrt() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z4sqrtf(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_sqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_sqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_sqrt() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z4sqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_sqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_sqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_sqrt() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z4sqrtDh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_sqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_sqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_sqrt() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z4sqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_sqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_sqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_sqrt() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z4sqrtd(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_sqrt(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_sqrt(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_sqrt() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z4sqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4sqrtf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
index 10d4c15f697cc..2aa8ce4eae3f3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tan.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_tan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_tan() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_tan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z3tanf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_tan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_tan() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_tan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z3tanDv2_f(<2 x float> zeroinitializer)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_tan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_tan() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_tan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z3tanDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_tan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_tan() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_tan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z3tanDv2_Dh(<2 x half> zeroinitializer)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_tan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_tan() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_tan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z3tand(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_tan(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tan(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_tan() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_tan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z3tanDv2_d(<2 x double> zeroinitializer)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z3tanf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
index dfbf6d77ba664..ba20df929614a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanh.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_tanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_tanh() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_tanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z4tanhf(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_tanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_tanh() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_tanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z4tanhDv2_f(<2 x float> zeroinitializer)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_tanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_tanh() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_tanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z4tanhDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_tanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_tanh() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_tanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z4tanhDv2_Dh(<2 x half> zeroinitializer)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_tanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_tanh() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_tanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z4tanhd(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_tanh(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tanh(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_tanh() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_tanh() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z4tanhDv2_d(<2 x double> zeroinitializer)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z4tanhf(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
index 33d7e3199127d..72fbc1a82101a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tanpi.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_tanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_tanpi() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_tanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 0.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
 entry:
   %c = call float @_Z5tanpif(float 0.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_tanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_tanpi() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_tanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> zeroinitializer
 ;
 entry:
   %c = call <2 x float> @_Z5tanpiDv2_f(<2 x float> zeroinitializer)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_tanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_tanpi() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_tanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH0000, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH0000
 ;
 entry:
   %c = call half @_Z5tanpiDh(half 0.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_tanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_tanpi() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_tanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> zeroinitializer, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> zeroinitializer
 ;
 entry:
   %c = call <2 x half> @_Z5tanpiDv2_Dh(<2 x half> zeroinitializer)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_tanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_tanpi() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_tanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 0.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 0.000000e+00
 ;
 entry:
   %c = call double @_Z5tanpid(double 0.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_tanpi(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tanpi(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_tanpi() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_tanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> zeroinitializer, ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> zeroinitializer
 ;
 entry:
   %c = call <2 x double> @_Z5tanpiDv2_d(<2 x double> zeroinitializer)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z5tanpif(float)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
index 8212c49738f9a..ec32898911985 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
@@ -1,82 +1,64 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
 
-define amdgpu_kernel void @test_tdo_scalar_f32_tgamma(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f32_tgamma(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define float @test_tdo_scalar_f32_tgamma() {
+; CHECK-LABEL: define float @test_tdo_scalar_f32_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store float 1.000000e+00, ptr addrspace(1) [[OUT]], align 4
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret float 1.000000e+00
 ;
 entry:
   %c = call float @_Z6tgammaf(float 1.000000e+00)
-  store float %c, ptr addrspace(1) %out, align 4
-  ret void
+  ret float %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f32_tgamma(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f32_tgamma(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x float> @test_tdo_v2_f32_tgamma() {
+; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x float> splat (float 1.000000e+00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
 ;
 entry:
   %c = call <2 x float> @_Z6tgammaDv2_f(<2 x float> <float 1.000000e+00, float 2.000000e+00>)
-  store <2 x float> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x float> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f16_tgamma(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f16_tgamma(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define half @test_tdo_scalar_f16_tgamma() {
+; CHECK-LABEL: define half @test_tdo_scalar_f16_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store half 0xH3C00, ptr addrspace(1) [[OUT]], align 2
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret half 0xH3C00
 ;
 entry:
   %c = call half @_Z6tgammaDh(half 1.000000e+00)
-  store half %c, ptr addrspace(1) %out, align 2
-  ret void
+  ret half %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f16_tgamma(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f16_tgamma(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x half> @test_tdo_v2_f16_tgamma() {
+; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x half> splat (half 0xH3C00), ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
 ;
 entry:
   %c = call <2 x half> @_Z6tgammaDv2_Dh(<2 x half> <half 1.000000e+00, half 2.000000e+00>)
-  store <2 x half> %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret <2 x half> %c
 }
 
-define amdgpu_kernel void @test_tdo_scalar_f64_tgamma(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_scalar_f64_tgamma(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define double @test_tdo_scalar_f64_tgamma() {
+; CHECK-LABEL: define double @test_tdo_scalar_f64_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store double 1.000000e+00, ptr addrspace(1) [[OUT]], align 8
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret double 1.000000e+00
 ;
 entry:
   %c = call double @_Z6tgammad(double 1.000000e+00)
-  store double %c, ptr addrspace(1) %out, align 8
-  ret void
+  ret double %c
 }
 
-define amdgpu_kernel void @test_tdo_v2_f64_tgamma(ptr addrspace(1) %out) {
-; CHECK-LABEL: define amdgpu_kernel void @test_tdo_v2_f64_tgamma(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) {
+define <2 x double> @test_tdo_v2_f64_tgamma() {
+; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    store <2 x double> splat (double 1.000000e+00), ptr addrspace(1) [[OUT]], align 16
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
 ;
 entry:
   %c = call <2 x double> @_Z6tgammaDv2_d(<2 x double> <double 1.000000e+00, double 2.000000e+00>)
-  store <2 x double> %c, ptr addrspace(1) %out, align 16
-  ret void
+  ret <2 x double> %c
 }
 
 declare float        @_Z6tgammaf(float)

>From 7e2fff80317b35a174b3f38081727ab278ff1823 Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Tue, 3 Feb 2026 04:52:42 -0600
Subject: [PATCH 3/8] Use APFloat instead

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 308 +++++++++-------------
 1 file changed, 123 insertions(+), 185 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index b118d7fb50a98..43b72f5ea9aa8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -76,8 +76,8 @@ class AMDGPULibCalls {
   bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
 
   // evaluate calls if calls' arguments are constants.
-  bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
-                              Constant *copr0, Constant *copr1);
+  bool evaluateScalarMathFunc(const FuncInfo &FInfo, APFloat &Res0,
+                              APFloat &Res1, Constant *copr0, Constant *copr1);
   bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
 
   /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value
@@ -743,38 +743,24 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   return false;
 }
 
-static Constant *getConstantFloatVectorForArgType(LLVMContext &Ctx,
-                                                  AMDGPULibFunc::EType ArgType,
-                                                  ArrayRef<double> Values,
-                                                  Type *Ty) {
-  switch (ArgType) {
-  case AMDGPULibFunc::F16: {
-    SmallVector<uint16_t, 4> HalfIntValues;
-    HalfIntValues.reserve(Values.size());
-    for (double D : Values) {
-      APFloat APF16 = APFloat(D);
-      bool Unused;
-      APF16.convert(llvm::APFloat::IEEEhalf(),
-                    llvm::RoundingMode::NearestTiesToEven, &Unused);
-      uint16_t APF16Int = APF16.bitcastToAPInt().getZExtValue();
-      HalfIntValues.push_back(APF16Int);
-    }
-    ArrayRef<uint16_t> Tmp(HalfIntValues);
-    return ConstantDataVector::getFP(Ty->getScalarType(), Tmp);
-  }
-  case AMDGPULibFunc::F32: {
-    SmallVector<float, 4> FValues;
-    FValues.reserve(Values.size());
-    for (double D : Values)
-      FValues.push_back((float)D);
-    ArrayRef<float> Tmp(FValues);
-    return ConstantDataVector::get(Ty->getContext(), Tmp);
-  }
-  case AMDGPULibFunc::F64:
-    return ConstantDataVector::get(Ty->getContext(), Values);
-  default:
-    llvm_unreachable("Unsupported argument type");
+static Constant *
+getConstantFloatVectorForArgType(LLVMContext &Ctx, AMDGPULibFunc::EType ArgType,
+                                 const ArrayRef<APFloat> Values,
+                                 const Type *Ty) {
+  // Loop-invariant: ArgType alone selects the element semantics.
+  const auto &FltSem =
+      ArgType == AMDGPULibFunc::F16
+          ? APFloat::IEEEhalf()
+          : (ArgType == AMDGPULibFunc::F32 ? APFloat::IEEEsingle()
+                                           : APFloat::IEEEdouble());
+  SmallVector<Constant *, 4> ConstValues;
+  ConstValues.reserve(Values.size());
+  for (APFloat APF : Values) { // By value: convert() mutates in place.
+    bool Unused;
+    APF.convert(FltSem, APFloat::rmNearestTiesToEven, &Unused);
+    ConstValues.push_back(ConstantFP::get(Ty->getScalarType(), APF));
   }
+  return ConstantVector::get(ConstValues);
 }
 
 bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
@@ -791,7 +777,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
     // Vector version
     Constant *CV = dyn_cast<Constant>(opr0);
     if (CV && CV->getType()->isVectorTy()) {
-      SmallVector<double, 4> DValues(vecSize);
+      SmallVector<APFloat, 4> Values;
+      Values.reserve(vecSize);
       for (int eltNo = 0; eltNo < vecSize; ++eltNo) {
         ConstantFP *eltval =
             cast<ConstantFP>(CV->getAggregateElement((unsigned)eltNo));
@@ -801,10 +788,10 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
             });
         if (MatchingRow == tr.end())
           return false;
-        DValues[eltNo] = MatchingRow->result;
+        Values.push_back(APFloat(MatchingRow->result));
       }
       Constant *NewValues = getConstantFloatVectorForArgType(
-          CI->getContext(), getArgType(FInfo), DValues, CI->getType());
+          CI->getContext(), getArgType(FInfo), Values, CI->getType());
       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
       replaceCall(CI, NewValues);
       return true;
@@ -1400,9 +1387,9 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
   return true;
 }
 
-bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
-                                            double &Res1, Constant *copr0,
-                                            Constant *copr1) {
+bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
+                                            APFloat &Res0, APFloat &Res1,
+                                            Constant *copr0, Constant *copr1) {
   // By default, opr0/opr1/opr3 holds values of float/double type.
   // If they are not float/double, each function has to its
   // operand separately.
@@ -1421,148 +1408,97 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
              : (double)fpopr1->getValueAPF().convertToFloat();
   }
 
-  switch (FInfo.getId()) {
-  default : return false;
-
-  case AMDGPULibFunc::EI_ACOS:
-    Res0 = acos(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_ACOSH:
-    // acosh(x) == log(x + sqrt(x*x - 1))
-    Res0 = log(opr0 + sqrt(opr0*opr0 - 1.0));
-    return true;
-
-  case AMDGPULibFunc::EI_ACOSPI:
-    Res0 = acos(opr0) / MATH_PI;
-    return true;
-
-  case AMDGPULibFunc::EI_ASIN:
-    Res0 = asin(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_ASINH:
-    // asinh(x) == log(x + sqrt(x*x + 1))
-    Res0 = log(opr0 + sqrt(opr0*opr0 + 1.0));
-    return true;
-
-  case AMDGPULibFunc::EI_ASINPI:
-    Res0 = asin(opr0) / MATH_PI;
-    return true;
-
-  case AMDGPULibFunc::EI_ATAN:
-    Res0 = atan(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_ATANH:
-    // atanh(x) == (log(x+1) - log(x-1))/2;
-    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0))/2.0;
-    return true;
-
-  case AMDGPULibFunc::EI_ATANPI:
-    Res0 = atan(opr0) / MATH_PI;
-    return true;
-
-  case AMDGPULibFunc::EI_CBRT:
-    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0/3.0) : pow(opr0, 1.0/3.0);
-    return true;
-
-  case AMDGPULibFunc::EI_COS:
-    Res0 = cos(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_COSH:
-    Res0 = cosh(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_COSPI:
-    Res0 = cos(MATH_PI * opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_EXP:
-    Res0 = exp(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_EXP2:
-    Res0 = pow(2.0, opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_EXP10:
-    Res0 = pow(10.0, opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_LOG:
-    Res0 = log(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_LOG2:
-    Res0 = log(opr0) / log(2.0);
-    return true;
-
-  case AMDGPULibFunc::EI_LOG10:
-    Res0 = log(opr0) / log(10.0);
-    return true;
-
-  case AMDGPULibFunc::EI_RSQRT:
-    Res0 = 1.0 / sqrt(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_SIN:
-    Res0 = sin(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_SINH:
-    Res0 = sinh(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_SINPI:
-    Res0 = sin(MATH_PI * opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_TAN:
-    Res0 = tan(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_TANH:
-    Res0 = tanh(opr0);
-    return true;
-
-  case AMDGPULibFunc::EI_TANPI:
-    Res0 = tan(MATH_PI * opr0);
-    return true;
-
-  // two-arg functions
-  case AMDGPULibFunc::EI_POW:
-  case AMDGPULibFunc::EI_POWR:
-    Res0 = pow(opr0, opr1);
-    return true;
-
-  case AMDGPULibFunc::EI_POWN: {
-    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
-      double val = (double)iopr1->getSExtValue();
-      Res0 = pow(opr0, val);
-      return true;
+  auto Res = [&FInfo, opr0, opr1,
+              copr1]() -> std::optional<std::pair<double, double>> {
+    switch (FInfo.getId()) {
+    default:
+      return std::nullopt;
+    case AMDGPULibFunc::EI_ACOS:
+      return std::make_pair(acos(opr0), 0.0);
+    case AMDGPULibFunc::EI_ACOSH:
+      // acosh(x) == log(x + sqrt(x*x - 1))
+      return std::make_pair(log(opr0 + sqrt(opr0 * opr0 - 1.0)), 0.0);
+    case AMDGPULibFunc::EI_ACOSPI:
+      return std::make_pair(acos(opr0) / MATH_PI, 0.0);
+    case AMDGPULibFunc::EI_ASIN:
+      return std::make_pair(asin(opr0), 0.0);
+    case AMDGPULibFunc::EI_ASINH:
+      // asinh(x) == log(x + sqrt(x*x + 1))
+      return std::make_pair(log(opr0 + sqrt(opr0 * opr0 + 1.0)), 0.0);
+    case AMDGPULibFunc::EI_ASINPI:
+      return std::make_pair(asin(opr0) / MATH_PI, 0.0);
+    case AMDGPULibFunc::EI_ATAN:
+      return std::make_pair(atan(opr0), 0.0);
+    case AMDGPULibFunc::EI_ATANH:
+      // atanh(x) == (log(x+1) - log(x-1))/2;
+      return std::make_pair((log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0, 0.0);
+    case AMDGPULibFunc::EI_ATANPI:
+      return std::make_pair(atan(opr0) / MATH_PI, 0.0);
+    case AMDGPULibFunc::EI_CBRT:
+      return std::make_pair(
+          (opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0), 0.0);
+    case AMDGPULibFunc::EI_COS:
+      return std::make_pair(cos(opr0), 0.0);
+    case AMDGPULibFunc::EI_COSH:
+      return std::make_pair(cosh(opr0), 0.0);
+    case AMDGPULibFunc::EI_COSPI:
+      return std::make_pair(cos(MATH_PI * opr0), 0.0);
+    case AMDGPULibFunc::EI_EXP:
+      return std::make_pair(exp(opr0), 0.0);
+    case AMDGPULibFunc::EI_EXP2:
+      return std::make_pair(pow(2.0, opr0), 0.0);
+    case AMDGPULibFunc::EI_EXP10:
+      return std::make_pair(pow(10.0, opr0), 0.0);
+    case AMDGPULibFunc::EI_LOG:
+      return std::make_pair(log(opr0), 0.0);
+    case AMDGPULibFunc::EI_LOG2:
+      return std::make_pair(log(opr0) / log(2.0), 0.0);
+    case AMDGPULibFunc::EI_LOG10:
+      return std::make_pair(log(opr0) / log(10.0), 0.0);
+    case AMDGPULibFunc::EI_RSQRT:
+      return std::make_pair(1.0 / sqrt(opr0), 0.0);
+    case AMDGPULibFunc::EI_SIN:
+      return std::make_pair(sin(opr0), 0.0);
+    case AMDGPULibFunc::EI_SINH:
+      return std::make_pair(sinh(opr0), 0.0);
+    case AMDGPULibFunc::EI_SINPI:
+      return std::make_pair(sin(MATH_PI * opr0), 0.0);
+    case AMDGPULibFunc::EI_TAN:
+      return std::make_pair(tan(opr0), 0.0);
+    case AMDGPULibFunc::EI_TANH:
+      return std::make_pair(tanh(opr0), 0.0);
+    case AMDGPULibFunc::EI_TANPI:
+      return std::make_pair(tan(MATH_PI * opr0), 0.0);
+    // two-arg functions
+    case AMDGPULibFunc::EI_POW:
+    case AMDGPULibFunc::EI_POWR:
+      return std::make_pair(pow(opr0, opr1), 0.0);
+    case AMDGPULibFunc::EI_POWN: {
+      if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+        double val = (double)iopr1->getSExtValue();
+        return std::make_pair(pow(opr0, val), 0.0);
+      }
+      return std::nullopt;
     }
-    return false;
-  }
 
-  case AMDGPULibFunc::EI_ROOTN: {
-    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
-      double val = (double)iopr1->getSExtValue();
-      Res0 = pow(opr0, 1.0 / val);
-      return true;
+    case AMDGPULibFunc::EI_ROOTN: {
+      if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+        double val = (double)iopr1->getSExtValue();
+        return std::make_pair(pow(opr0, 1.0 / val), 0.0);
+      }
+      return std::nullopt;
     }
-    return false;
-  }
-
-  // with ptr arg
-  case AMDGPULibFunc::EI_SINCOS:
-    Res0 = sin(opr0);
-    Res1 = cos(opr0);
-    return true;
-  }
+    // with ptr arg
+    case AMDGPULibFunc::EI_SINCOS:
+      return std::make_pair(sin(opr0), cos(opr0));
+    }
+  }();
 
-  return false;
+  if (!Res.has_value())
+    return false;
+  Res0 = APFloat(Res->first);
+  Res1 = APFloat(Res->second);
+  return true;
 }
 
 bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
@@ -1587,11 +1523,12 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
   // At this point, all arguments to aCI are constants.
 
   // max vector size is 16, and sincos will generate two results.
-  double DVal0[16], DVal1[16];
+  SmallVector<APFloat, 16> Val0, Val1;
   int FuncVecSize = getVecSize(FInfo);
   bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
   if (FuncVecSize == 1) {
-    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
+    if (!evaluateScalarMathFunc(FInfo, Val0.emplace_back(0.0),
+                                Val1.emplace_back(0.0), copr0, copr1)) {
       return false;
     }
   } else {
@@ -1600,7 +1537,8 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     for (int i = 0; i < FuncVecSize; ++i) {
       Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
       Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
-      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
+      if (!evaluateScalarMathFunc(FInfo, Val0.emplace_back(0.0),
+                                  Val1.emplace_back(0.0), celt0, celt1)) {
         return false;
       }
     }
@@ -1609,15 +1547,15 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
   LLVMContext &context = aCI->getContext();
   Constant *nval0, *nval1;
   if (FuncVecSize == 1) {
-    nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
+    nval0 = ConstantFP::get(aCI->getType(), Val0[0]);
     if (hasTwoResults)
-      nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
+      nval1 = ConstantFP::get(aCI->getType(), Val1[0]);
   } else {
-    nval0 = getConstantFloatVectorForArgType(context, getArgType(FInfo), DVal0,
+    nval0 = getConstantFloatVectorForArgType(context, getArgType(FInfo), Val0,
                                              aCI->getType());
     if (hasTwoResults)
-      nval1 = getConstantFloatVectorForArgType(context, getArgType(FInfo),
-                                               DVal1, aCI->getType());
+      nval1 = getConstantFloatVectorForArgType(context, getArgType(FInfo), Val1,
+                                               aCI->getType());
   }
 
   if (hasTwoResults) {

>From d8c7f9ef3055938c0398dd47719c386f94844635 Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Tue, 3 Feb 2026 06:09:28 -0600
Subject: [PATCH 4/8] Make vector tests check all input values

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 .../amdgpu-simplify-libcall-tdo-acos.ll       | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-acospi.ll     | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-asin.ll       | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-asinpi.ll     | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-atan.ll       | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-atanpi.ll     | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-cbrt.ll       | 36 +++++++++----------
 .../AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-exp10.ll      | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-exp2.ll       | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-sqrt.ll       | 36 +++++++++----------
 .../amdgpu-simplify-libcall-tdo-tgamma.ll     | 36 +++++++++----------
 12 files changed, 216 insertions(+), 216 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
index 1fe6ccde628d5..843602d8fcad9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acos.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_acos() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_acos() {
+define <4 x float> @test_tdo_v2_f32_acos() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 0x3FF921FB60000000, float 0.000000e+00>
+; CHECK-NEXT:    ret <4 x float> <float 0x3FF921FB60000000, float 0x3FF921FB60000000, float 0.000000e+00, float 0x400921FB60000000>
 ;
 entry:
-  %c = call <2 x float> @_Z4acosDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z4acosDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_acos() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_acos() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_acos() {
+define <4 x half> @test_tdo_v2_f16_acos() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH3E48, half 0xH0000>
+; CHECK-NEXT:    ret <4 x half> <half 0xH3E48, half 0xH3E48, half 0xH0000, half 0xH4248>
 ;
 entry:
-  %c = call <2 x half> @_Z4acosDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z4acosDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_acos() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_acos() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_acos() {
+define <4 x double> @test_tdo_v2_f64_acos() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_acos() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 0x3FF921FB54442D18, double 0.000000e+00>
+; CHECK-NEXT:    ret <4 x double> <double 0x3FF921FB54442D18, double 0x3FF921FB54442D18, double 0.000000e+00, double 0x400921FB54442D18>
 ;
 entry:
-  %c = call <2 x double> @_Z4acosDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z4acosDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z4acosf(float)
-declare <2 x float>  @_Z4acosDv2_f(<2 x float>)
+declare <4 x float>  @_Z4acosDv4_f(<4 x float>)
 declare half         @_Z4acosDh(half)
-declare <2 x half>   @_Z4acosDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z4acosDv4_Dh(<4 x half>)
 declare double       @_Z4acosd(double)
-declare <2 x double> @_Z4acosDv2_d(<2 x double>)
+declare <4 x double> @_Z4acosDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
index b279a1a4b5802..23193b5b3c10e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-acospi.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_acospi() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_acospi() {
+define <4 x float> @test_tdo_v2_f32_acospi() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 5.000000e-01, float 0.000000e+00>
+; CHECK-NEXT:    ret <4 x float> <float 5.000000e-01, float 5.000000e-01, float 0.000000e+00, float 1.000000e+00>
 ;
 entry:
-  %c = call <2 x float> @_Z6acospiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z6acospiDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_acospi() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_acospi() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_acospi() {
+define <4 x half> @test_tdo_v2_f16_acospi() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH3800, half 0xH0000>
+; CHECK-NEXT:    ret <4 x half> <half 0xH3800, half 0xH3800, half 0xH0000, half 0xH3C00>
 ;
 entry:
-  %c = call <2 x half> @_Z6acospiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z6acospiDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_acospi() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_acospi() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_acospi() {
+define <4 x double> @test_tdo_v2_f64_acospi() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_acospi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 5.000000e-01, double 0.000000e+00>
+; CHECK-NEXT:    ret <4 x double> <double 5.000000e-01, double 5.000000e-01, double 0.000000e+00, double 1.000000e+00>
 ;
 entry:
-  %c = call <2 x double> @_Z6acospiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z6acospiDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z6acospif(float)
-declare <2 x float>  @_Z6acospiDv2_f(<2 x float>)
+declare <4 x float>  @_Z6acospiDv4_f(<4 x float>)
 declare half         @_Z6acospiDh(half)
-declare <2 x half>   @_Z6acospiDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z6acospiDv4_Dh(<4 x half>)
 declare double       @_Z6acospid(double)
-declare <2 x double> @_Z6acospiDv2_d(<2 x double>)
+declare <4 x double> @_Z6acospiDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
index ac69af487485b..56ab0ff089e4e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asin.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_asin() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_asin() {
+define <4 x float> @test_tdo_v2_f32_asin() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 0x3FF921FB60000000>
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float -0.000000e+00, float 0x3FF921FB60000000, float 0xBFF921FB60000000>
 ;
 entry:
-  %c = call <2 x float> @_Z4asinDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z4asinDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_asin() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_asin() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_asin() {
+define <4 x half> @test_tdo_v2_f16_asin() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3E48>
+; CHECK-NEXT:    ret <4 x half> <half 0xH0000, half 0xH8000, half 0xH3E48, half 0xHBE48>
 ;
 entry:
-  %c = call <2 x half> @_Z4asinDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z4asinDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_asin() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_asin() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_asin() {
+define <4 x double> @test_tdo_v2_f64_asin() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_asin() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 0x3FF921FB54442D18>
+; CHECK-NEXT:    ret <4 x double> <double 0.000000e+00, double -0.000000e+00, double 0x3FF921FB54442D18, double 0xBFF921FB54442D18>
 ;
 entry:
-  %c = call <2 x double> @_Z4asinDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z4asinDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z4asinf(float)
-declare <2 x float>  @_Z4asinDv2_f(<2 x float>)
+declare <4 x float>  @_Z4asinDv4_f(<4 x float>)
 declare half         @_Z4asinDh(half)
-declare <2 x half>   @_Z4asinDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z4asinDv4_Dh(<4 x half>)
 declare double       @_Z4asind(double)
-declare <2 x double> @_Z4asinDv2_d(<2 x double>)
+declare <4 x double> @_Z4asinDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
index c0efc8b469bd2..349da1b4f1beb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-asinpi.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_asinpi() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_asinpi() {
+define <4 x float> @test_tdo_v2_f32_asinpi() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 5.000000e-01>
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float -0.000000e+00, float 5.000000e-01, float -5.000000e-01>
 ;
 entry:
-  %c = call <2 x float> @_Z6asinpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z6asinpiDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_asinpi() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_asinpi() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_asinpi() {
+define <4 x half> @test_tdo_v2_f16_asinpi() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3800>
+; CHECK-NEXT:    ret <4 x half> <half 0xH0000, half 0xH8000, half 0xH3800, half 0xHB800>
 ;
 entry:
-  %c = call <2 x half> @_Z6asinpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z6asinpiDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_asinpi() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_asinpi() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_asinpi() {
+define <4 x double> @test_tdo_v2_f64_asinpi() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_asinpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 5.000000e-01>
+; CHECK-NEXT:    ret <4 x double> <double 0.000000e+00, double -0.000000e+00, double 5.000000e-01, double -5.000000e-01>
 ;
 entry:
-  %c = call <2 x double> @_Z6asinpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z6asinpiDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z6asinpif(float)
-declare <2 x float>  @_Z6asinpiDv2_f(<2 x float>)
+declare <4 x float>  @_Z6asinpiDv4_f(<4 x float>)
 declare half         @_Z6asinpiDh(half)
-declare <2 x half>   @_Z6asinpiDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z6asinpiDv4_Dh(<4 x half>)
 declare double       @_Z6asinpid(double)
-declare <2 x double> @_Z6asinpiDv2_d(<2 x double>)
+declare <4 x double> @_Z6asinpiDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
index e863bcb8fe7f5..a178c5ab24b3d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atan.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_atan() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_atan() {
+define <4 x float> @test_tdo_v2_f32_atan() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 0x3FE921FB60000000>
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float -0.000000e+00, float 0x3FE921FB60000000, float 0xBFE921FB60000000>
 ;
 entry:
-  %c = call <2 x float> @_Z4atanDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z4atanDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_atan() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_atan() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_atan() {
+define <4 x half> @test_tdo_v2_f16_atan() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3A48>
+; CHECK-NEXT:    ret <4 x half> <half 0xH0000, half 0xH8000, half 0xH3A48, half 0xHBA48>
 ;
 entry:
-  %c = call <2 x half> @_Z4atanDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z4atanDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_atan() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_atan() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_atan() {
+define <4 x double> @test_tdo_v2_f64_atan() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_atan() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 0x3FE921FB54442D18>
+; CHECK-NEXT:    ret <4 x double> <double 0.000000e+00, double -0.000000e+00, double 0x3FE921FB54442D18, double 0xBFE921FB54442D18>
 ;
 entry:
-  %c = call <2 x double> @_Z4atanDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z4atanDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z4atanf(float)
-declare <2 x float>  @_Z4atanDv2_f(<2 x float>)
+declare <4 x float>  @_Z4atanDv4_f(<4 x float>)
 declare half         @_Z4atanDh(half)
-declare <2 x half>   @_Z4atanDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z4atanDv4_Dh(<4 x half>)
 declare double       @_Z4atand(double)
-declare <2 x double> @_Z4atanDv2_d(<2 x double>)
+declare <4 x double> @_Z4atanDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
index 448f687a8a85e..e5aef437f8245 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-atanpi.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_atanpi() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_atanpi() {
+define <4 x float> @test_tdo_v2_f32_atanpi() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 0.000000e+00, float 2.500000e-01>
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float -0.000000e+00, float 2.500000e-01, float -2.500000e-01>
 ;
 entry:
-  %c = call <2 x float> @_Z6atanpiDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z6atanpiDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_atanpi() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_atanpi() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_atanpi() {
+define <4 x half> @test_tdo_v2_f16_atanpi() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH0000, half 0xH3400>
+; CHECK-NEXT:    ret <4 x half> <half 0xH0000, half 0xH8000, half 0xH3400, half 0xHB400>
 ;
 entry:
-  %c = call <2 x half> @_Z6atanpiDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z6atanpiDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_atanpi() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_atanpi() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_atanpi() {
+define <4 x double> @test_tdo_v2_f64_atanpi() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_atanpi() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 0.000000e+00, double 2.500000e-01>
+; CHECK-NEXT:    ret <4 x double> <double 0.000000e+00, double -0.000000e+00, double 2.500000e-01, double -2.500000e-01>
 ;
 entry:
-  %c = call <2 x double> @_Z6atanpiDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z6atanpiDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z6atanpif(float)
-declare <2 x float>  @_Z6atanpiDv2_f(<2 x float>)
+declare <4 x float>  @_Z6atanpiDv4_f(<4 x float>)
 declare half         @_Z6atanpiDh(half)
-declare <2 x half>   @_Z6atanpiDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z6atanpiDv4_Dh(<4 x half>)
 declare double       @_Z6atanpid(double)
-declare <2 x double> @_Z6atanpiDv2_d(<2 x double>)
+declare <4 x double> @_Z6atanpiDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
index 23efb6009d6b6..9b297ba8eae10 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-cbrt.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_cbrt() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_cbrt() {
+define <4 x float> @test_tdo_v2_f32_cbrt() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float -1.000000e+00>
+; CHECK-NEXT:    ret <4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>
 ;
 entry:
-  %c = call <2 x float> @_Z4cbrtDv2_f(<2 x float> <float 1.000000e+00, float -1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z4cbrtDv4_f(<4 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00, float -1.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_cbrt() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_cbrt() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_cbrt() {
+define <4 x half> @test_tdo_v2_f16_cbrt() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xHBC00>
+; CHECK-NEXT:    ret <4 x half> <half 0xH0000, half 0xH8000, half 0xH3C00, half 0xHBC00>
 ;
 entry:
-  %c = call <2 x half> @_Z4cbrtDv2_Dh(<2 x half> <half 1.000000e+00, half -1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z4cbrtDv4_Dh(<4 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00, half -1.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_cbrt() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_cbrt() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_cbrt() {
+define <4 x double> @test_tdo_v2_f64_cbrt() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_cbrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double -1.000000e+00>
+; CHECK-NEXT:    ret <4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>
 ;
 entry:
-  %c = call <2 x double> @_Z4cbrtDv2_d(<2 x double> <double 1.000000e+00, double -1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z4cbrtDv4_d(<4 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00, double -1.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z4cbrtf(float)
-declare <2 x float>  @_Z4cbrtDv2_f(<2 x float>)
+declare <4 x float>  @_Z4cbrtDv4_f(<4 x float>)
 declare half         @_Z4cbrtDh(half)
-declare <2 x half>   @_Z4cbrtDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z4cbrtDv4_Dh(<4 x half>)
 declare double       @_Z4cbrtd(double)
-declare <2 x double> @_Z4cbrtDv2_d(<2 x double>)
+declare <4 x double> @_Z4cbrtDv4_d(<4 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
index ca7a8fbece135..6c49a84ef27e3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_exp() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_exp() {
+define <3 x float> @test_tdo_v2_f32_exp() {
+; CHECK-LABEL: define <3 x float> @test_tdo_v2_f32_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float 0x4005BF0A80000000>
+; CHECK-NEXT:    ret <3 x float> <float 1.000000e+00, float 1.000000e+00, float 0x4005BF0A80000000>
 ;
 entry:
-  %c = call <2 x float> @_Z3expDv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <3 x float> @_Z3expDv3_f(<3 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00>)
+  ret <3 x float> %c
 }
 
 define half @test_tdo_scalar_f16_exp() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_exp() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_exp() {
+define <3 x half> @test_tdo_v2_f16_exp() {
+; CHECK-LABEL: define <3 x half> @test_tdo_v2_f16_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xH4170>
+; CHECK-NEXT:    ret <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH4170>
 ;
 entry:
-  %c = call <2 x half> @_Z3expDv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <3 x half> @_Z3expDv3_Dh(<3 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00>)
+  ret <3 x half> %c
 }
 
 define double @test_tdo_scalar_f64_exp() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_exp() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_exp() {
+define <3 x double> @test_tdo_v2_f64_exp() {
+; CHECK-LABEL: define <3 x double> @test_tdo_v2_f64_exp() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 0x4005BF0A8B145769>
+; CHECK-NEXT:    ret <3 x double> <double 1.000000e+00, double 1.000000e+00, double 0x4005BF0A8B145769>
 ;
 entry:
-  %c = call <2 x double> @_Z3expDv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <3 x double> @_Z3expDv3_d(<3 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00>)
+  ret <3 x double> %c
 }
 
 declare float        @_Z3expf(float)
-declare <2 x float>  @_Z3expDv2_f(<2 x float>)
+declare <3 x float>  @_Z3expDv3_f(<3 x float>)
 declare half         @_Z3expDh(half)
-declare <2 x half>   @_Z3expDv2_Dh(<2 x half>)
+declare <3 x half>   @_Z3expDv3_Dh(<3 x half>)
 declare double       @_Z3expd(double)
-declare <2 x double> @_Z3expDv2_d(<2 x double>)
+declare <3 x double> @_Z3expDv3_d(<3 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
index 0b1ce025c9eee..93c81f43ac93b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp10.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_exp10() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_exp10() {
+define <3 x float> @test_tdo_v2_f32_exp10() {
+; CHECK-LABEL: define <3 x float> @test_tdo_v2_f32_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float 1.000000e+01>
+; CHECK-NEXT:    ret <3 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+01>
 ;
 entry:
-  %c = call <2 x float> @_Z5exp10Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <3 x float> @_Z5exp10Dv3_f(<3 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00>)
+  ret <3 x float> %c
 }
 
 define half @test_tdo_scalar_f16_exp10() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_exp10() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_exp10() {
+define <3 x half> @test_tdo_v2_f16_exp10() {
+; CHECK-LABEL: define <3 x half> @test_tdo_v2_f16_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xH4900>
+; CHECK-NEXT:    ret <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH4900>
 ;
 entry:
-  %c = call <2 x half> @_Z5exp10Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <3 x half> @_Z5exp10Dv3_Dh(<3 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00>)
+  ret <3 x half> %c
 }
 
 define double @test_tdo_scalar_f64_exp10() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_exp10() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_exp10() {
+define <3 x double> @test_tdo_v2_f64_exp10() {
+; CHECK-LABEL: define <3 x double> @test_tdo_v2_f64_exp10() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 1.000000e+01>
+; CHECK-NEXT:    ret <3 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+01>
 ;
 entry:
-  %c = call <2 x double> @_Z5exp10Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <3 x double> @_Z5exp10Dv3_d(<3 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00>)
+  ret <3 x double> %c
 }
 
 declare float        @_Z5exp10f(float)
-declare <2 x float>  @_Z5exp10Dv2_f(<2 x float>)
+declare <3 x float>  @_Z5exp10Dv3_f(<3 x float>)
 declare half         @_Z5exp10Dh(half)
-declare <2 x half>   @_Z5exp10Dv2_Dh(<2 x half>)
+declare <3 x half>   @_Z5exp10Dv3_Dh(<3 x half>)
 declare double       @_Z5exp10d(double)
-declare <2 x double> @_Z5exp10Dv2_d(<2 x double>)
+declare <3 x double> @_Z5exp10Dv3_d(<3 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
index 707aa9ba60618..e1a645320cb7b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-exp2.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_exp2() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_exp2() {
+define <3 x float> @test_tdo_v2_f32_exp2() {
+; CHECK-LABEL: define <3 x float> @test_tdo_v2_f32_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> <float 1.000000e+00, float 2.000000e+00>
+; CHECK-NEXT:    ret <3 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
 ;
 entry:
-  %c = call <2 x float> @_Z4exp2Dv2_f(<2 x float> <float 0.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <3 x float> @_Z4exp2Dv3_f(<3 x float> <float 0.000000e+00, float -0.000000e+00, float 1.000000e+00>)
+  ret <3 x float> %c
 }
 
 define half @test_tdo_scalar_f16_exp2() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_exp2() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_exp2() {
+define <3 x half> @test_tdo_v2_f16_exp2() {
+; CHECK-LABEL: define <3 x half> @test_tdo_v2_f16_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> <half 0xH3C00, half 0xH4000>
+; CHECK-NEXT:    ret <3 x half> <half 0xH3C00, half 0xH3C00, half 0xH4000>
 ;
 entry:
-  %c = call <2 x half> @_Z4exp2Dv2_Dh(<2 x half> <half 0.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <3 x half> @_Z4exp2Dv3_Dh(<3 x half> <half 0.000000e+00, half -0.000000e+00, half 1.000000e+00>)
+  ret <3 x half> %c
 }
 
 define double @test_tdo_scalar_f64_exp2() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_exp2() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_exp2() {
+define <3 x double> @test_tdo_v2_f64_exp2() {
+; CHECK-LABEL: define <3 x double> @test_tdo_v2_f64_exp2() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> <double 1.000000e+00, double 2.000000e+00>
+; CHECK-NEXT:    ret <3 x double> <double 1.000000e+00, double 1.000000e+00, double 2.000000e+00>
 ;
 entry:
-  %c = call <2 x double> @_Z4exp2Dv2_d(<2 x double> <double 0.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <3 x double> @_Z4exp2Dv3_d(<3 x double> <double 0.000000e+00, double -0.000000e+00, double 1.000000e+00>)
+  ret <3 x double> %c
 }
 
 declare float        @_Z4exp2f(float)
-declare <2 x float>  @_Z4exp2Dv2_f(<2 x float>)
+declare <3 x float>  @_Z4exp2Dv3_f(<3 x float>)
 declare half         @_Z4exp2Dh(half)
-declare <2 x half>   @_Z4exp2Dv2_Dh(<2 x half>)
+declare <3 x half>   @_Z4exp2Dv3_Dh(<3 x half>)
 declare double       @_Z4exp2d(double)
-declare <2 x double> @_Z4exp2Dv2_d(<2 x double>)
+declare <3 x double> @_Z4exp2Dv3_d(<3 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
index ce495cde3df3f..04bdb0cabf004 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-sqrt.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_sqrt() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_sqrt() {
+define <3 x float> @test_tdo_v2_f32_sqrt() {
+; CHECK-LABEL: define <3 x float> @test_tdo_v2_f32_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
+; CHECK-NEXT:    ret <3 x float> <float 0.000000e+00, float 1.000000e+00, float 0x3FF6A09E60000000>
 ;
 entry:
-  %c = call <2 x float> @_Z4sqrtDv2_f(<2 x float> <float 1.000000e+00, float 1.000000e+00>)
-  ret <2 x float> %c
+  %c = call <3 x float> @_Z4sqrtDv3_f(<3 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00>)
+  ret <3 x float> %c
 }
 
 define half @test_tdo_scalar_f16_sqrt() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_sqrt() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_sqrt() {
+define <3 x half> @test_tdo_v2_f16_sqrt() {
+; CHECK-LABEL: define <3 x half> @test_tdo_v2_f16_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
+; CHECK-NEXT:    ret <3 x half> <half 0xH0000, half 0xH3C00, half 0xH3DA8>
 ;
 entry:
-  %c = call <2 x half> @_Z4sqrtDv2_Dh(<2 x half> <half 1.000000e+00, half 1.000000e+00>)
-  ret <2 x half> %c
+  %c = call <3 x half> @_Z4sqrtDv3_Dh(<3 x half> <half 0.000000e+00, half 1.000000e+00, half 2.000000e+00>)
+  ret <3 x half> %c
 }
 
 define double @test_tdo_scalar_f64_sqrt() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_sqrt() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_sqrt() {
+define <3 x double> @test_tdo_v2_f64_sqrt() {
+; CHECK-LABEL: define <3 x double> @test_tdo_v2_f64_sqrt() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
+; CHECK-NEXT:    ret <3 x double> <double 0.000000e+00, double 1.000000e+00, double 0x3FF6A09E667F3BCD>
 ;
 entry:
-  %c = call <2 x double> @_Z4sqrtDv2_d(<2 x double> <double 1.000000e+00, double 1.000000e+00>)
-  ret <2 x double> %c
+  %c = call <3 x double> @_Z4sqrtDv3_d(<3 x double> <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00>)
+  ret <3 x double> %c
 }
 
 declare float        @_Z4sqrtf(float)
-declare <2 x float>  @_Z4sqrtDv2_f(<2 x float>)
+declare <3 x float>  @_Z4sqrtDv3_f(<3 x float>)
 declare half         @_Z4sqrtDh(half)
-declare <2 x half>   @_Z4sqrtDv2_Dh(<2 x half>)
+declare <3 x half>   @_Z4sqrtDv3_Dh(<3 x half>)
 declare double       @_Z4sqrtd(double)
-declare <2 x double> @_Z4sqrtDv2_d(<2 x double>)
+declare <3 x double> @_Z4sqrtDv3_d(<3 x double>)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
index ec32898911985..a95aa6a5730c2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-tdo-tgamma.ll
@@ -11,14 +11,14 @@ entry:
   ret float %c
 }
 
-define <2 x float> @test_tdo_v2_f32_tgamma() {
-; CHECK-LABEL: define <2 x float> @test_tdo_v2_f32_tgamma() {
+define <4 x float> @test_tdo_v2_f32_tgamma() {
+; CHECK-LABEL: define <4 x float> @test_tdo_v2_f32_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x float> splat (float 1.000000e+00)
+; CHECK-NEXT:    ret <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 6.000000e+00>
 ;
 entry:
-  %c = call <2 x float> @_Z6tgammaDv2_f(<2 x float> <float 1.000000e+00, float 2.000000e+00>)
-  ret <2 x float> %c
+  %c = call <4 x float> @_Z6tgammaDv4_f(<4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>)
+  ret <4 x float> %c
 }
 
 define half @test_tdo_scalar_f16_tgamma() {
@@ -31,14 +31,14 @@ entry:
   ret half %c
 }
 
-define <2 x half> @test_tdo_v2_f16_tgamma() {
-; CHECK-LABEL: define <2 x half> @test_tdo_v2_f16_tgamma() {
+define <4 x half> @test_tdo_v2_f16_tgamma() {
+; CHECK-LABEL: define <4 x half> @test_tdo_v2_f16_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x half> splat (half 0xH3C00)
+; CHECK-NEXT:    ret <4 x half> <half 0xH3C00, half 0xH3C00, half 0xH4000, half 0xH4600>
 ;
 entry:
-  %c = call <2 x half> @_Z6tgammaDv2_Dh(<2 x half> <half 1.000000e+00, half 2.000000e+00>)
-  ret <2 x half> %c
+  %c = call <4 x half> @_Z6tgammaDv4_Dh(<4 x half> <half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00>)
+  ret <4 x half> %c
 }
 
 define double @test_tdo_scalar_f64_tgamma() {
@@ -51,19 +51,19 @@ entry:
   ret double %c
 }
 
-define <2 x double> @test_tdo_v2_f64_tgamma() {
-; CHECK-LABEL: define <2 x double> @test_tdo_v2_f64_tgamma() {
+define <4 x double> @test_tdo_v2_f64_tgamma() {
+; CHECK-LABEL: define <4 x double> @test_tdo_v2_f64_tgamma() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    ret <2 x double> splat (double 1.000000e+00)
+; CHECK-NEXT:    ret <4 x double> <double 1.000000e+00, double 1.000000e+00, double 2.000000e+00, double 6.000000e+00>
 ;
 entry:
-  %c = call <2 x double> @_Z6tgammaDv2_d(<2 x double> <double 1.000000e+00, double 2.000000e+00>)
-  ret <2 x double> %c
+  %c = call <4 x double> @_Z6tgammaDv4_d(<4 x double> <double 1.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00>)
+  ret <4 x double> %c
 }
 
 declare float        @_Z6tgammaf(float)
-declare <2 x float>  @_Z6tgammaDv2_f(<2 x float>)
+declare <4 x float>  @_Z6tgammaDv4_f(<4 x float>)
 declare half         @_Z6tgammaDh(half)
-declare <2 x half>   @_Z6tgammaDv2_Dh(<2 x half>)
+declare <4 x half>   @_Z6tgammaDv4_Dh(<4 x half>)
 declare double       @_Z6tgammad(double)
-declare <2 x double> @_Z6tgammaDv2_d(<2 x double>)
+declare <4 x double> @_Z6tgammaDv4_d(<4 x double>)

>From 9b56e11aad288c2fa012febdd9ba2e97565f6a7d Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Wed, 4 Feb 2026 06:58:46 -0600
Subject: [PATCH 5/8] Use getFltSemantics and simplify

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 43b72f5ea9aa8..3696643e54131 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -747,18 +747,15 @@ static Constant *
 getConstantFloatVectorForArgType(LLVMContext &Ctx, AMDGPULibFunc::EType ArgType,
                                  const ArrayRef<APFloat> Values,
                                  const Type *Ty) {
+  Type *ElemTy = Ty->getScalarType();
+  const fltSemantics &FltSem = ElemTy->getFltSemantics();
+
   SmallVector<Constant *, 4> ConstValues;
   ConstValues.reserve(Values.size());
-  for (const APFloat &APF : Values) {
-    APFloat APFCopy = APF;
-    const auto &FltSem =
-        ArgType == AMDGPULibFunc::F16
-            ? APFloat::IEEEhalf()
-            : (ArgType == AMDGPULibFunc::F32 ? APFloat::IEEEsingle()
-                                             : APFloat::IEEEdouble());
+  for (APFloat APF : Values) {
     bool Unused;
-    APFCopy.convert(FltSem, APFloat::rmNearestTiesToEven, &Unused);
-    ConstValues.push_back(ConstantFP::get(Ty->getScalarType(), APFCopy));
+    APF.convert(FltSem, APFloat::rmNearestTiesToEven, &Unused);
+    ConstValues.push_back(ConstantFP::get(ElemTy, APF));
   }
   return ConstantVector::get(ConstValues);
 }

>From 43a76b73ff4bf1258de91b22e11bbad5d415a10d Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Wed, 4 Feb 2026 07:01:10 -0600
Subject: [PATCH 6/8] Remove unused arguments

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 3696643e54131..271ba9e355441 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -743,10 +743,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) {
   return false;
 }
 
-static Constant *
-getConstantFloatVectorForArgType(LLVMContext &Ctx, AMDGPULibFunc::EType ArgType,
-                                 const ArrayRef<APFloat> Values,
-                                 const Type *Ty) {
+static Constant *getConstantFloatVector(const ArrayRef<APFloat> Values,
+                                        const Type *Ty) {
   Type *ElemTy = Ty->getScalarType();
   const fltSemantics &FltSem = ElemTy->getFltSemantics();
 
@@ -787,8 +785,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
           return false;
         Values.push_back(APFloat(MatchingRow->result));
       }
-      Constant *NewValues = getConstantFloatVectorForArgType(
-          CI->getContext(), getArgType(FInfo), Values, CI->getType());
+      Constant *NewValues = getConstantFloatVector(Values, CI->getType());
       LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *NewValues << "\n");
       replaceCall(CI, NewValues);
       return true;
@@ -1548,11 +1545,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     if (hasTwoResults)
       nval1 = ConstantFP::get(aCI->getType(), Val1[0]);
   } else {
-    nval0 = getConstantFloatVectorForArgType(context, getArgType(FInfo), Val0,
-                                             aCI->getType());
+    nval0 = getConstantFloatVector(Val0, aCI->getType());
     if (hasTwoResults)
-      nval1 = getConstantFloatVectorForArgType(context, getArgType(FInfo), Val1,
-                                               aCI->getType());
+      nval1 = getConstantFloatVector(Val1, aCI->getType());
   }
 
   if (hasTwoResults) {

>From 86a4229c20bed5c3be52babce0555f4d90356e31 Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Tue, 10 Feb 2026 06:42:46 -0600
Subject: [PATCH 7/8] Revert lambda

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 227 +++++++++++++---------
 1 file changed, 140 insertions(+), 87 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index bd42e1857671a..1e7b7adc81add 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1413,97 +1413,150 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
              : (double)fpopr1->getValueAPF().convertToFloat();
   }
 
-  auto Res = [&FInfo, opr0, opr1,
-              copr1]() -> std::optional<std::pair<double, double>> {
-    switch (FInfo.getId()) {
-    default:
-      return std::nullopt;
-    case AMDGPULibFunc::EI_ACOS:
-      return std::make_pair(acos(opr0), 0.0);
-    case AMDGPULibFunc::EI_ACOSH:
-      // acosh(x) == log(x + sqrt(x*x - 1))
-      return std::make_pair(log(opr0 + sqrt(opr0 * opr0 - 1.0)), 0.0);
-    case AMDGPULibFunc::EI_ACOSPI:
-      return std::make_pair(acos(opr0) / MATH_PI, 0.0);
-    case AMDGPULibFunc::EI_ASIN:
-      return std::make_pair(asin(opr0), 0.0);
-    case AMDGPULibFunc::EI_ASINH:
-      // asinh(x) == log(x + sqrt(x*x + 1))
-      return std::make_pair(log(opr0 + sqrt(opr0 * opr0 + 1.0)), 0.0);
-    case AMDGPULibFunc::EI_ASINPI:
-      return std::make_pair(asin(opr0) / MATH_PI, 0.0);
-    case AMDGPULibFunc::EI_ATAN:
-      return std::make_pair(atan(opr0), 0.0);
-    case AMDGPULibFunc::EI_ATANH:
-      // atanh(x) == (log(x+1) - log(x-1))/2;
-      return std::make_pair((log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0, 0.0);
-    case AMDGPULibFunc::EI_ATANPI:
-      return std::make_pair(atan(opr0) / MATH_PI, 0.0);
-    case AMDGPULibFunc::EI_CBRT:
-      return std::make_pair(
-          (opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0), 0.0);
-    case AMDGPULibFunc::EI_COS:
-      return std::make_pair(cos(opr0), 0.0);
-    case AMDGPULibFunc::EI_COSH:
-      return std::make_pair(cosh(opr0), 0.0);
-    case AMDGPULibFunc::EI_COSPI:
-      return std::make_pair(cos(MATH_PI * opr0), 0.0);
-    case AMDGPULibFunc::EI_EXP:
-      return std::make_pair(exp(opr0), 0.0);
-    case AMDGPULibFunc::EI_EXP2:
-      return std::make_pair(pow(2.0, opr0), 0.0);
-    case AMDGPULibFunc::EI_EXP10:
-      return std::make_pair(pow(10.0, opr0), 0.0);
-    case AMDGPULibFunc::EI_LOG:
-      return std::make_pair(log(opr0), 0.0);
-    case AMDGPULibFunc::EI_LOG2:
-      return std::make_pair(log(opr0) / log(2.0), 0.0);
-    case AMDGPULibFunc::EI_LOG10:
-      return std::make_pair(log(opr0) / log(10.0), 0.0);
-    case AMDGPULibFunc::EI_RSQRT:
-      return std::make_pair(1.0 / sqrt(opr0), 0.0);
-    case AMDGPULibFunc::EI_SIN:
-      return std::make_pair(sin(opr0), 0.0);
-    case AMDGPULibFunc::EI_SINH:
-      return std::make_pair(sinh(opr0), 0.0);
-    case AMDGPULibFunc::EI_SINPI:
-      return std::make_pair(sin(MATH_PI * opr0), 0.0);
-    case AMDGPULibFunc::EI_TAN:
-      return std::make_pair(tan(opr0), 0.0);
-    case AMDGPULibFunc::EI_TANH:
-      return std::make_pair(tanh(opr0), 0.0);
-    case AMDGPULibFunc::EI_TANPI:
-      return std::make_pair(tan(MATH_PI * opr0), 0.0);
-    // two-arg functions
-    case AMDGPULibFunc::EI_POW:
-    case AMDGPULibFunc::EI_POWR:
-      return std::make_pair(pow(opr0, opr1), 0.0);
-    case AMDGPULibFunc::EI_POWN: {
-      if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
-        double val = (double)iopr1->getSExtValue();
-        return std::make_pair(pow(opr0, val), 0.0);
-      }
-      return std::nullopt;
-    }
+  switch (FInfo.getId()) {
+  default:
+    return false;
 
-    case AMDGPULibFunc::EI_ROOTN: {
-      if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
-        double val = (double)iopr1->getSExtValue();
-        return std::make_pair(pow(opr0, 1.0 / val), 0.0);
-      }
-      return std::nullopt;
-    }
-    // with ptr arg
-    case AMDGPULibFunc::EI_SINCOS:
-      return std::make_pair(sin(opr0), cos(opr0));
+  case AMDGPULibFunc::EI_ACOS:
+    Res0 = APFloat{acos(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_ACOSH:
+    // acosh(x) == log(x + sqrt(x*x - 1))
+    Res0 = APFloat{log(opr0 + sqrt(opr0 * opr0 - 1.0))};
+    return true;
+
+  case AMDGPULibFunc::EI_ACOSPI:
+    Res0 = APFloat{acos(opr0) / MATH_PI};
+    return true;
+
+  case AMDGPULibFunc::EI_ASIN:
+    Res0 = APFloat{asin(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_ASINH:
+    // asinh(x) == log(x + sqrt(x*x + 1))
+    Res0 = APFloat{log(opr0 + sqrt(opr0 * opr0 + 1.0))};
+    return true;
+
+  case AMDGPULibFunc::EI_ASINPI:
+    Res0 = APFloat{asin(opr0) / MATH_PI};
+    return true;
+
+  case AMDGPULibFunc::EI_ATAN:
+    Res0 = APFloat{atan(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_ATANH:
+    // atanh(x) == (log(x+1) - log(x-1))/2;
+    Res0 = APFloat{(log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0};
+    return true;
+
+  case AMDGPULibFunc::EI_ATANPI:
+    Res0 = APFloat{atan(opr0) / MATH_PI};
+    return true;
+
+  case AMDGPULibFunc::EI_CBRT:
+    Res0 =
+        APFloat{(opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0)};
+    return true;
+
+  case AMDGPULibFunc::EI_COS:
+    Res0 = APFloat{cos(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_COSH:
+    Res0 = APFloat{cosh(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_COSPI:
+    Res0 = APFloat{cos(MATH_PI * opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_EXP:
+    Res0 = APFloat{exp(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_EXP2:
+    Res0 = APFloat{pow(2.0, opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_EXP10:
+    Res0 = APFloat{pow(10.0, opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_LOG:
+    Res0 = APFloat{log(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_LOG2:
+    Res0 = APFloat{log(opr0) / log(2.0)};
+    return true;
+
+  case AMDGPULibFunc::EI_LOG10:
+    Res0 = APFloat{log(opr0) / log(10.0)};
+    return true;
+
+  case AMDGPULibFunc::EI_RSQRT:
+    Res0 = APFloat{1.0 / sqrt(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_SIN:
+    Res0 = APFloat{sin(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_SINH:
+    Res0 = APFloat{sinh(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_SINPI:
+    Res0 = APFloat{sin(MATH_PI * opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_TAN:
+    Res0 = APFloat{tan(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_TANH:
+    Res0 = APFloat{tanh(opr0)};
+    return true;
+
+  case AMDGPULibFunc::EI_TANPI:
+    Res0 = APFloat{tan(MATH_PI * opr0)};
+    return true;
+
+  // two-arg functions
+  case AMDGPULibFunc::EI_POW:
+  case AMDGPULibFunc::EI_POWR:
+    Res0 = APFloat{pow(opr0, opr1)};
+    return true;
+
+  case AMDGPULibFunc::EI_POWN: {
+    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+      double val = (double)iopr1->getSExtValue();
+      Res0 = APFloat{pow(opr0, val)};
+      return true;
     }
-  }();
+    return false;
+  }
 
-  if (!Res.has_value())
+  case AMDGPULibFunc::EI_ROOTN: {
+    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
+      double val = (double)iopr1->getSExtValue();
+      Res0 = APFloat{pow(opr0, 1.0 / val)};
+      return true;
+    }
     return false;
-  Res0 = APFloat(Res->first);
-  Res1 = APFloat(Res->second);
-  return true;
+  }
+
+  // with ptr arg
+  case AMDGPULibFunc::EI_SINCOS:
+    Res0 = APFloat{sin(opr0)};
+    Res1 = APFloat{cos(opr0)};
+    return true;
+  }
+
+  return false;
 }
 
 bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {

>From 5a1d00f4f053df7533552c0dc91899dc5113149e Mon Sep 17 00:00:00 2001
From: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
Date: Tue, 10 Feb 2026 07:17:22 -0600
Subject: [PATCH 8/8] Remove unused context

Signed-off-by: Steffen Holst Larsen <HolstLarsen.Steffen at amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 1e7b7adc81add..baf4c10180b04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1602,7 +1602,6 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
     }
   }
 
-  LLVMContext &context = aCI->getContext();
   Constant *nval0, *nval1;
   if (FuncVecSize == 1) {
     nval0 = ConstantFP::get(aCI->getType(), Val0[0]);