[llvm] 54bda79 - AMDGPU: Simplify and improve sincos matching

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 2 14:48:40 PDT 2023


Author: Matt Arsenault
Date: 2023-08-02T17:48:35-04:00
New Revision: 54bda79335ba65b0ab739a97e24030fcd95165b7

URL: https://github.com/llvm/llvm-project/commit/54bda79335ba65b0ab739a97e24030fcd95165b7
DIFF: https://github.com/llvm/llvm-project/commit/54bda79335ba65b0ab739a97e24030fcd95165b7.diff

LOG: AMDGPU: Simplify and improve sincos matching

The first trivial example I tried failed to merge due to the user scan
logic. Remove the complicated scan of users handling with distance
thresholds, with a same block restriction. The actual expansion of
sincos is basically the same size as sin or cos individually. Copy the
technique the generic optimization uses, which is to just use the
input instruction as the insert point or just insert at the start of
the entry block.

https://reviews.llvm.org/D156706

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index b65b09fd207094..10ae460326d926 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -91,6 +91,9 @@ class AMDGPULibCalls {
   // sqrt
   bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
+  bool insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
+                    const FuncInfo &FInfo);
+
   // sin/cos
   bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
 
@@ -98,11 +101,6 @@ class AMDGPULibCalls {
   bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                             const FuncInfo &FInfo);
 
-  // Get insertion point at entry.
-  BasicBlock::iterator getEntryIns(CallInst * UI);
-  // Insert an Alloc instruction.
-  AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
-
   // Get a scalar native builtin single argument FP function
   FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
 
@@ -1153,6 +1151,71 @@ bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
   return false;
 }
 
+bool AMDGPULibCalls::insertSinCos(CallInst *Sin, CallInst *Cos, IRBuilder<> &B,
+                                  const FuncInfo &fInfo) {
+  Value *Arg = Sin->getOperand(0);
+  assert(Arg == Cos->getOperand(0));
+
+  Function *F = B.GetInsertBlock()->getParent();
+  Module *M = F->getParent();
+  // Merge the sin and cos.
+
+  // for OpenCL 2.0 we have only generic implementation of sincos
+  // function.
+  // FIXME: This is not true anymore
+  AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
+  nf.getLeads()[0].PtrKind =
+      AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
+  FunctionCallee Fsincos = getFunction(M, nf);
+  if (!Fsincos)
+    return false;
+
+  B.SetInsertPointPastAllocas(F);
+
+  DILocation *MergedDebugLoc =
+      DILocation::getMergedLocation(Sin->getDebugLoc(), Cos->getDebugLoc());
+  B.SetCurrentDebugLocation(MergedDebugLoc);
+
+  AllocaInst *Alloc = B.CreateAlloca(Sin->getType(), nullptr, "__sincos_");
+
+  if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+    // If the argument is an instruction, it must dominate all uses so put our
+    // sincos call there. Otherwise, right after the allocas works well enough
+    // if it's an argument or constant.
+
+    B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+    B.SetCurrentDebugLocation(MergedDebugLoc);
+  }
+
+  Value *P = Alloc;
+  Type *PTy = Fsincos.getFunctionType()->getParamType(1);
+  // The allocaInst allocates the memory in private address space. This need
+  // to be bitcasted to point to the address space of cos pointer type.
+  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
+  if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    P = B.CreateAddrSpaceCast(Alloc, PTy);
+
+  // Intersect the two sets of flags.
+  FastMathFlags FMF = cast<FPMathOperator>(Sin)->getFastMathFlags();
+  FMF &= cast<FPMathOperator>(Cos)->getFastMathFlags();
+  B.setFastMathFlags(FMF);
+
+  CallInst *Call = CreateCallEx2(B, Fsincos, Arg, P);
+  LoadInst *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
+  Reload->setDebugLoc(Cos->getDebugLoc());
+
+  LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *Sin << ", " << *Cos
+                    << ") with " << *Call << '\n');
+
+  Sin->replaceAllUsesWith(Call);
+  Sin->eraseFromParent();
+
+  Cos->replaceAllUsesWith(Reload);
+  Cos->eraseFromParent();
+
+  return true;
+}
+
 // fold sin, cos -> sincos.
 bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
                                  const FuncInfo &fInfo) {
@@ -1168,106 +1231,33 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
 
   Value *CArgVal = FPOp->getOperand(0);
   CallInst *CI = cast<CallInst>(FPOp);
-  BasicBlock * const CBB = CI->getParent();
-
-  int const MaxScan = 30;
   bool Changed = false;
 
-  Module *M = CI->getModule();
   FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                        fInfo);
   const std::string PairName = PartnerInfo.mangle();
 
   CallInst *UI = nullptr;
+
+  // TODO: Handle repeated uses, the generic implementation does.
   for (User* U : CArgVal->users()) {
-    CallInst *XI = dyn_cast_or_null<CallInst>(U);
-    if (!XI || XI == CI || XI->getParent() != CBB)
+    CallInst *XI = dyn_cast<CallInst>(U);
+    if (!XI || XI->isNoBuiltin())
       continue;
 
     Function *UCallee = XI->getCalledFunction();
-    if (!UCallee || !UCallee->getName().equals(PairName))
-      continue;
-
-    BasicBlock::iterator BBI = CI->getIterator();
-    if (BBI == CI->getParent()->begin())
-      break;
-    --BBI;
-    for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
-      if (cast<Instruction>(BBI) == XI) {
-        UI = XI;
-        break;
-      }
-    }
-    if (UI) break;
+    if (UCallee && UCallee->getName().equals(PairName))
+      UI = XI;
+    else if (UI)
+      return Changed;
   }
 
   if (!UI)
     return Changed;
 
-  // Merge the sin and cos.
-
-  // for OpenCL 2.0 we have only generic implementation of sincos
-  // function.
-  AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
-  nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
-  FunctionCallee Fsincos = getFunction(M, nf);
-  if (!Fsincos)
-    return Changed;
-
-  BasicBlock::iterator ItOld = B.GetInsertPoint();
-  AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
-  B.SetInsertPoint(UI);
-
-  Value *P = Alloc;
-  Type *PTy = Fsincos.getFunctionType()->getParamType(1);
-  // The allocaInst allocates the memory in private address space. This need
-  // to be bitcasted to point to the address space of cos pointer type.
-  // In OpenCL 2.0 this is generic, while in 1.2 that is private.
-  if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
-    P = B.CreateAddrSpaceCast(Alloc, PTy);
-  CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
-
-  LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
-                    << *Call << "\n");
-
-  if (!isSin) { // CI->cos, UI->sin
-    B.SetInsertPoint(&*ItOld);
-    UI->replaceAllUsesWith(&*Call);
-    Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
-    CI->replaceAllUsesWith(Reload);
-    UI->eraseFromParent();
-    CI->eraseFromParent();
-  } else { // CI->sin, UI->cos
-    Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
-    UI->replaceAllUsesWith(Reload);
-    CI->replaceAllUsesWith(Call);
-    UI->eraseFromParent();
-    CI->eraseFromParent();
-  }
-  return true;
-}
-
-// Get insertion point at entry.
-BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
-  Function * Func = UI->getParent()->getParent();
-  BasicBlock * BB = &Func->getEntryBlock();
-  assert(BB && "Entry block not found!");
-  BasicBlock::iterator ItNew = BB->begin();
-  return ItNew;
-}
-
-// Insert a AllocsInst at the beginning of function entry block.
-AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
-                                         const char *prefix) {
-  BasicBlock::iterator ItNew = getEntryIns(UI);
-  Function *UCallee = UI->getCalledFunction();
-  Type *RetType = UCallee->getReturnType();
-  B.SetInsertPoint(&*ItNew);
-  AllocaInst *Alloc =
-      B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
-  Alloc->setAlignment(
-      Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
-  return Alloc;
+  CallInst *Sin = isSin ? CI : UI;
+  CallInst *Cos = isSin ? UI : CI;
+  return insertSinCos(Sin, Cos, B, fInfo) || Changed;
 }
 
 bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll
index 87e1dc63e6588b..65cab49f2bbe4b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll
@@ -105,10 +105,12 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
 ; CHECK-LABEL: define void @sincos_f32
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -123,10 +125,12 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ
 ; CHECK-LABEL: define void @sincos_f32_value_is_same_constantfp
 ; CHECK-SAME: (ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float 4.200000e+01)
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01)
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float 4.200000e+01, ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -150,10 +154,12 @@ define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %
 ; CHECK-LABEL: define void @sincos_v2f32
 ; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <2 x float> @_Z3sinDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
index 2e4703b5f8cf4c..599c2ea4262ba5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.nobuiltin.ll
@@ -56,10 +56,11 @@ define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out,
 ; CHECK-LABEL: define void @sincos_f32
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[__SINCOS_]])
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
index 82b50ce9da0b6b..ba832c81aba3db 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
@@ -220,12 +220,14 @@ entry:
 
 define void @sincos_f32_nocontract(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_nocontract
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -239,12 +241,14 @@ entry:
 
 define void @sincos_v2f32_nocontract(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f32_nocontract
-; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call <2 x float> @_Z3sinDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -258,12 +262,14 @@ entry:
 
 define void @sincos_f32(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -276,12 +282,14 @@ entry:
 
 define void @sincos_f32_order1(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_order1
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -294,12 +302,14 @@ entry:
 
 define void @sincos_v2f32(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f32
-; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <2 x float> @_Z3sinDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -312,13 +322,15 @@ entry:
 
 define void @sincos_v3f32(<3 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v3f32
-; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<3 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <3 x float> @_Z3sinDv3_f(<3 x float> [[X]])
-; CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <3 x float> [[CALL]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <3 x float>, align 16, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <3 x float> @_Z6sincosDv3_fPU3AS0S_(<3 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <3 x float>, ptr addrspace(5) [[__SINCOS_]], align 16
+; CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 ; CHECK-NEXT:    store <4 x float> [[EXTRACTVEC2]], ptr addrspace(1) [[SIN_OUT]], align 16
-; CHECK-NEXT:    [[CALL5:%.*]] = tail call contract <3 x float> @_Z3cosDv3_f(<3 x float> [[X]])
-; CHECK-NEXT:    [[EXTRACTVEC6:%.*]] = shufflevector <3 x float> [[CALL5]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[EXTRACTVEC6:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 ; CHECK-NEXT:    store <4 x float> [[EXTRACTVEC6]], ptr addrspace(1) [[COS_OUT]], align 16
 ; CHECK-NEXT:    ret void
 ;
@@ -334,12 +346,14 @@ entry:
 
 define void @sincos_v4f32(<4 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v4f32
-; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<4 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <4 x float> @_Z3sinDv4_f(<4 x float> [[X]])
-; CHECK-NEXT:    store <4 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 16
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <4 x float> @_Z3cosDv4_f(<4 x float> [[X]])
-; CHECK-NEXT:    store <4 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 16
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <4 x float> @_Z6sincosDv4_fPU3AS0S_(<4 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr addrspace(5) [[__SINCOS_]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 16
+; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -352,12 +366,14 @@ entry:
 
 define void @sincos_v8f32(<8 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v8f32
-; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<8 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <8 x float> @_Z3sinDv8_f(<8 x float> [[X]])
-; CHECK-NEXT:    store <8 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 32
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <8 x float> @_Z3cosDv8_f(<8 x float> [[X]])
-; CHECK-NEXT:    store <8 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 32
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <8 x float>, align 32, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <8 x float> @_Z6sincosDv8_fPU3AS0S_(<8 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr addrspace(5) [[__SINCOS_]], align 32
+; CHECK-NEXT:    store <8 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 32
+; CHECK-NEXT:    store <8 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 32
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -370,12 +386,14 @@ entry:
 
 define void @sincos_v16f32(<16 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v16f32
-; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<16 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <16 x float> @_Z3sinDv16_f(<16 x float> [[X]])
-; CHECK-NEXT:    store <16 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 64
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <16 x float> @_Z3cosDv16_f(<16 x float> [[X]])
-; CHECK-NEXT:    store <16 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 64
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <16 x float>, align 64, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <16 x float> @_Z6sincosDv16_fPU3AS0S_(<16 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr addrspace(5) [[__SINCOS_]], align 64
+; CHECK-NEXT:    store <16 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 64
+; CHECK-NEXT:    store <16 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 64
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -388,12 +406,14 @@ entry:
 
 define void @sincos_f64_nocontract(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f64_nocontract
-; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @_Z3sind(double [[X]])
-; CHECK-NEXT:    store double [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call double @_Z3cosd(double [[X]])
-; CHECK-NEXT:    store double [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call double @_Z6sincosdPU3AS0d(double [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store double [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store double [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -407,12 +427,14 @@ entry:
 
 define void @sincos_v2f64_nocontract(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f64_nocontract
-; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call <2 x double> @_Z3sinDv2_d(<2 x double> [[X]])
-; CHECK-NEXT:    store <2 x double> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 16
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call <2 x double> @_Z3cosDv2_d(<2 x double> [[X]])
-; CHECK-NEXT:    store <2 x double> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 16
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @_Z6sincosDv2_dPU3AS0S_(<2 x double> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr addrspace(5) [[__SINCOS_]], align 16
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 16
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -425,12 +447,14 @@ entry:
 
 define void @sincos_f64(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f64
-; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract double @_Z3sind(double [[X]])
-; CHECK-NEXT:    store double [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract double @_Z3cosd(double [[X]])
-; CHECK-NEXT:    store double [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract double @_Z6sincosdPU3AS0d(double [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store double [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store double [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -443,12 +467,14 @@ entry:
 
 define void @sincos_f64_order1(double %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f64_order1
-; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (double [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract double @_Z3cosd(double [[X]])
-; CHECK-NEXT:    store double [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract double @_Z3sind(double [[X]])
-; CHECK-NEXT:    store double [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca double, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract double @_Z6sincosdPU3AS0d(double [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store double [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    store double [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -461,12 +487,14 @@ entry:
 
 define void @sincos_v2f64(<2 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f64
-; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<2 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <2 x double> @_Z3sinDv2_d(<2 x double> [[X]])
-; CHECK-NEXT:    store <2 x double> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 16
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x double> @_Z3cosDv2_d(<2 x double> [[X]])
-; CHECK-NEXT:    store <2 x double> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 16
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x double>, align 16, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x double> @_Z6sincosDv2_dPU3AS0S_(<2 x double> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr addrspace(5) [[__SINCOS_]], align 16
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 16
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -479,13 +507,15 @@ entry:
 
 define void @sincos_v3f64(<3 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v3f64
-; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<3 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <3 x double> @_Z3sinDv3_d(<3 x double> [[X]])
-; CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <3 x double> [[CALL]], <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <3 x double>, align 32, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <3 x double> @_Z6sincosDv3_dPU3AS0S_(<3 x double> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr addrspace(5) [[__SINCOS_]], align 32
+; CHECK-NEXT:    [[EXTRACTVEC2:%.*]] = shufflevector <3 x double> [[TMP1]], <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 ; CHECK-NEXT:    store <4 x double> [[EXTRACTVEC2]], ptr addrspace(1) [[SIN_OUT]], align 32
-; CHECK-NEXT:    [[CALL5:%.*]] = tail call contract <3 x double> @_Z3cosDv3_d(<3 x double> [[X]])
-; CHECK-NEXT:    [[EXTRACTVEC6:%.*]] = shufflevector <3 x double> [[CALL5]], <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[EXTRACTVEC6:%.*]] = shufflevector <3 x double> [[TMP2]], <3 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
 ; CHECK-NEXT:    store <4 x double> [[EXTRACTVEC6]], ptr addrspace(1) [[COS_OUT]], align 32
 ; CHECK-NEXT:    ret void
 ;
@@ -501,12 +531,14 @@ entry:
 
 define void @sincos_v4f64(<4 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v4f64
-; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<4 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <4 x double> @_Z3sinDv4_d(<4 x double> [[X]])
-; CHECK-NEXT:    store <4 x double> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 32
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <4 x double> @_Z3cosDv4_d(<4 x double> [[X]])
-; CHECK-NEXT:    store <4 x double> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 32
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <4 x double>, align 32, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <4 x double> @_Z6sincosDv4_dPU3AS0S_(<4 x double> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr addrspace(5) [[__SINCOS_]], align 32
+; CHECK-NEXT:    store <4 x double> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 32
+; CHECK-NEXT:    store <4 x double> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 32
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -519,12 +551,14 @@ entry:
 
 define void @sincos_v8f64(<8 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v8f64
-; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<8 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <8 x double> @_Z3sinDv8_d(<8 x double> [[X]])
-; CHECK-NEXT:    store <8 x double> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 64
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <8 x double> @_Z3cosDv8_d(<8 x double> [[X]])
-; CHECK-NEXT:    store <8 x double> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 64
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <8 x double>, align 64, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <8 x double> @_Z6sincosDv8_dPU3AS0S_(<8 x double> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr addrspace(5) [[__SINCOS_]], align 64
+; CHECK-NEXT:    store <8 x double> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 64
+; CHECK-NEXT:    store <8 x double> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 64
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -537,12 +571,14 @@ entry:
 
 define void @sincos_v16f64(<16 x double> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v16f64
-; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<16 x double> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <16 x double> @_Z3sinDv16_d(<16 x double> [[X]])
-; CHECK-NEXT:    store <16 x double> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 128
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <16 x double> @_Z3cosDv16_d(<16 x double> [[X]])
-; CHECK-NEXT:    store <16 x double> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 128
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <16 x double>, align 128, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <16 x double> @_Z6sincosDv16_dPU3AS0S_(<16 x double> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x double>, ptr addrspace(5) [[__SINCOS_]], align 128
+; CHECK-NEXT:    store <16 x double> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 128
+; CHECK-NEXT:    store <16 x double> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 128
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -557,18 +593,20 @@ entry:
 
 define void @sincos_f32_
diff erent_blocks(i1 %cond, float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_
diff erent_blocks
-; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK:       common.ret:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb0:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ;
 entry:
@@ -588,17 +626,19 @@ bb1:
 ; The sin and cos are in 
diff erent blocks but always execute
 define void @sincos_f32_
diff erent_blocks_dominating_always_execute(i1 %cond, float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr addrspace(1) %other) {
 ; CHECK-LABEL: define void @sincos_f32_
diff erent_blocks_dominating_always_execute
-; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[OTHER:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[OTHER:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    br i1 [[COND]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK:       bb0:
 ; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[OTHER]], align 4
 ; CHECK-NEXT:    br label [[BB1]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -619,12 +659,14 @@ bb1:
 ; sin dominates cos but cos doesn't always execute.
 define void @sincos_f32_
diff erent_blocks_dominating_conditional_execute(i1 %cond, float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out, ptr addrspace(1) %other) {
 ; CHECK-LABEL: define void @sincos_f32_
diff erent_blocks_dominating_conditional_execute
-; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr addrspace(1) nocapture readnone [[OTHER:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]], ptr addrspace(1) nocapture readnone [[OTHER:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], float [[CALL1]], float 0.000000e+00
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], float [[TMP2]], float 0.000000e+00
 ; CHECK-NEXT:    store float [[SPEC_SELECT]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -645,11 +687,13 @@ bb1:
 
 define float @select_sin_or_cos_f32(i1 %cond, float %x) {
 ; CHECK-LABEL: define float @select_sin_or_cos_f32
-; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: (i1 [[COND:%.*]], float [[X:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SIN:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    [[COS:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[SIN]], float [[COS]]
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    [[SELECT:%.*]] = select i1 [[COND]], float [[TMP1]], float [[TMP2]]
 ; CHECK-NEXT:    ret float [[SELECT]]
 ;
 entry:
@@ -665,13 +709,13 @@ define void @sincos_f32_value_is_instr(ptr addrspace(1) %value.ptr, ptr addrspac
 ; CHECK-LABEL: define void @sincos_f32_value_is_instr
 ; CHECK-SAME: (ptr addrspace(1) [[VALUE_PTR:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_CALL:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    tail call void @func(ptr addrspace(1) [[VALUE_PTR]])
 ; CHECK-NEXT:    [[X:%.*]] = load float, ptr addrspace(1) [[VALUE_PTR]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_CALL]] to ptr
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
 ; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_CALL]], align 4
 ; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
@@ -777,12 +821,14 @@ entry:
 
 define void @sincos_f32_flag_intersect0(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_flag_intersect0
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call nnan contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call ninf contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -795,12 +841,14 @@ entry:
 
 define void @sincos_f32_flag_intersect1(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_flag_intersect1
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call nnan nsz contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call nnan ninf contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -813,12 +861,14 @@ entry:
 
 define void @sincos_v2f32_flag_intersect1(<2 x float> %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_v2f32_flag_intersect1
-; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (<2 x float> [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call nnan ninf contract <2 x float> @_Z3sinDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call nnan nsz contract <2 x float> @_Z3cosDv2_f(<2 x float> [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call nnan contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -835,12 +885,12 @@ define void @sincos_f32_alloca_insertpt(float %x) {
 ; CHECK-LABEL: define void @sincos_f32_alloca_insertpt
 ; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_SIN:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[ALLOCA1:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_SIN]] to ptr
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_SIN]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
 ; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(5) [[ALLOCA0]], align 4
 ; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(5) [[ALLOCA1]], align 4
 ; CHECK-NEXT:    call void @use_stack_ptrs(ptr addrspace(5) [[ALLOCA0]], ptr addrspace(5) [[ALLOCA1]])
@@ -859,10 +909,10 @@ entry:
 
 define float @sincos_f32_unused_result_cos(float %x) {
 ; CHECK-LABEL: define float @sincos_f32_unused_result_cos
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_SIN:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_SIN]] to ptr
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
 ; CHECK-NEXT:    ret float [[TMP1]]
 ;
@@ -876,12 +926,12 @@ entry:
 
 define float @sincos_f32_unused_result_sin(float %x) {
 ; CHECK-LABEL: define float @sincos_f32_unused_result_sin
-; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR3]] {
+; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_DEAD:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_DEAD]] to ptr
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
 ; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_DEAD]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
 ; CHECK-NEXT:    ret float [[TMP2]]
 ;
 entry:
@@ -894,22 +944,15 @@ entry:
 
 define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_repeated_uses
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_SIN0:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[__SINCOS_SIN1:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_SIN0]] to ptr
-; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
-; CHECK-NEXT:    store volatile float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_SIN1]] to ptr
-; CHECK-NEXT:    [[TMP3:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP2]])
-; CHECK-NEXT:    store volatile float [[TMP3]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[__SINCOS_SIN1]], align 4
-; CHECK-NEXT:    store volatile float [[TMP4]], ptr addrspace(1) [[COS_OUT]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[__SINCOS_SIN0]], align 4
-; CHECK-NEXT:    store volatile float [[TMP5]], ptr addrspace(1) [[COS_OUT]], align 4
-; CHECK-NEXT:    [[SIN2:%.*]] = tail call contract float @_Z3sinf(float [[X]])
-; CHECK-NEXT:    store volatile float [[SIN2]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    [[SIN0:%.*]] = tail call contract float @_Z3sinf(float [[X]])
+; CHECK-NEXT:    store volatile float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store volatile float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    [[COS0:%.*]] = tail call contract float @_Z3cosf(float [[X]])
+; CHECK-NEXT:    store volatile float [[COS0]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    store volatile float [[COS0]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    store volatile float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -969,12 +1012,14 @@ entry:
 
 define void @sincos_f32_preserve_fpmath_0(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_preserve_fpmath_0
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]), !fpmath !5
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]), !fpmath !5
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -987,12 +1032,14 @@ entry:
 
 define void @sincos_f32_preserve_fpmath_1(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_preserve_fpmath_1
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]), !fpmath !5
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]), !fpmath !6
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1006,12 +1053,14 @@ entry:
 ; Should drop the metadata
 define void @sincos_f32_drop_fpmath(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) {
 ; CHECK-LABEL: define void @sincos_f32_drop_fpmath
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]), !fpmath !5
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1024,15 +1073,17 @@ entry:
 
 define void @sincos_f32_debuginfo(float %x, ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) !dbg !15 {
 ; CHECK-LABEL: define void @sincos_f32_debuginfo
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] !dbg [[DBG7:![0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] !dbg [[DBG5:![0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float [[X]]), !dbg [[DBG14:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[CALL]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14]]
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG15:![0-9]+]]
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float [[X]]), !dbg [[DBG16:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[CALL1]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG16]]
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT:    ret void, !dbg [[DBG18:![0-9]+]]
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr, !dbg [[DBG12]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]), !dbg [[DBG12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG13:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]]
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG15:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[TMP2]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG13]]
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG16:![0-9]+]]
+; CHECK-NEXT:    ret void, !dbg [[DBG17:![0-9]+]]
 ;
 entry:
   %call = tail call contract float @_Z3sinf(float %x), !dbg !19
@@ -1046,12 +1097,12 @@ entry:
 
 define float @sin_sincos_private_f32(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @sin_sincos_private_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !5
+; CHECK-NEXT:    [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !18
 ; CHECK-NEXT:    store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !6
+; CHECK-NEXT:    [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !19
 ; CHECK-NEXT:    [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4
 ; CHECK-NEXT:    store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret float [[SIN1]]
@@ -1068,13 +1119,13 @@ entry:
 
 define float @sin_sincos_generic_f32(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @sin_sincos_generic_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !5
+; CHECK-NEXT:    [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !18
 ; CHECK-NEXT:    store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    [[COS_TMP_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP]] to ptr
-; CHECK-NEXT:    [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !6
+; CHECK-NEXT:    [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !19
 ; CHECK-NEXT:    [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4
 ; CHECK-NEXT:    store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret float [[SIN1]]
@@ -1092,7 +1143,7 @@ entry:
 
 define float @cos_sincos_private_f32(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @cos_sincos_private_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS0:%.*]] = tail call contract float @_Z3cosf(float [[X]])
@@ -1114,7 +1165,7 @@ entry:
 
 define float @cos_sincos_generic_f32(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @cos_sincos_generic_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS0:%.*]] = tail call contract float @_Z3cosf(float [[X]])
@@ -1138,7 +1189,7 @@ entry:
 
 define float @sincos_private_f32_x2(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @sincos_private_f32_x2
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP0:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS_TMP1:%.*]] = alloca float, align 4, addrspace(5)
@@ -1166,7 +1217,7 @@ entry:
 
 define float @sincos_generic_f32_x2(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @sincos_generic_f32_x2
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP0:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS_TMP1:%.*]] = alloca float, align 4, addrspace(5)
@@ -1195,7 +1246,7 @@ entry:
 
 define float @sincos_generic_private_f32(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @sincos_generic_private_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture readnone [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP0:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS_TMP1:%.*]] = alloca float, align 4, addrspace(5)
@@ -1222,22 +1273,19 @@ entry:
 
 define float @sincos_mixed_sin_cos_generic_private_f32(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) {
 ; CHECK-LABEL: define float @sincos_mixed_sin_cos_generic_private_f32
-; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5]] {
+; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[__SINCOS_SIN1:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS_TMP0:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[COS_TMP1:%.*]] = alloca float, align 4, addrspace(5)
 ; CHECK-NEXT:    [[SIN0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP0]])
-; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_SIN1]] to ptr
-; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
-; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[COS0:%.*]] = load float, ptr addrspace(5) [[COS_TMP0]], align 4
-; CHECK-NEXT:    store float [[COS0]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[SIN1:%.*]] = call contract float @_Z3sinf(float [[X]])
+; CHECK-NEXT:    store float [[SIN1]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    [[COS_TMP1_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP1]] to ptr
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_SIN1]], align 4
-; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[COS1:%.*]] = call contract float @_Z3cosf(float [[X]])
+; CHECK-NEXT:    store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    [[SIN2:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP1_CAST]])
-; CHECK-NEXT:    store float [[COS0]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[COS2:%.*]] = load float, ptr addrspace(5) [[COS_TMP0]], align 4
+; CHECK-NEXT:    store float [[COS2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret float [[SIN2]]
 ;
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll
index 94d68fe7f6ac33..53fea4307d6457 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.nobuiltins.ll
@@ -13,10 +13,12 @@ define void @sincos_f32_nobuiltin(float noundef %x, ptr addrspace(1) nocapture n
 ; CHECK-LABEL: define void @sincos_f32_nobuiltin
 ; CHECK-SAME: (float noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float noundef [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -32,10 +34,12 @@ define void @sincos_v2f32_nobuiltin(<2 x float> noundef %x, ptr addrspace(1) noc
 ; CHECK-LABEL: define void @sincos_v2f32_nobuiltin
 ; CHECK-SAME: (<2 x float> noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <2 x float> @_Z3sinDv2_f(<2 x float> noundef [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -50,10 +54,12 @@ define void @sincos_f32_no_builtins(float noundef %x, ptr addrspace(1) nocapture
 ; CHECK-LABEL: define void @sincos_f32_no_builtins
 ; CHECK-SAME: (float noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract float @_Z3sinf(float noundef [[X]])
-; CHECK-NEXT:    store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract float @_Z3cosf(float noundef [[X]])
-; CHECK-NEXT:    store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -68,10 +74,12 @@ define void @sincos_v2f32_no_builtins(<2 x float> noundef %x, ptr addrspace(1) n
 ; CHECK-LABEL: define void @sincos_v2f32_no_builtins
 ; CHECK-SAME: (<2 x float> noundef [[X:%.*]], ptr addrspace(1) nocapture noundef writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture noundef writeonly [[COS_OUT:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call contract <2 x float> @_Z3sinDv2_f(<2 x float> noundef [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 8
-; CHECK-NEXT:    [[CALL1:%.*]] = tail call contract <2 x float> @_Z3cosDv2_f(<2 x float> noundef [[X]])
-; CHECK-NEXT:    store <2 x float> [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 8
+; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca <2 x float>, align 8, addrspace(5)
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract <2 x float> @_Z6sincosDv2_fPU3AS0S_(<2 x float> [[X]], ptr [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[__SINCOS_]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 8
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:


        


More information about the llvm-commits mailing list