[llvm] a7dafea - [SDAG] Allow folding stack slots into sincos/frexp in more cases (#118117)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 02:54:21 PST 2024
Author: Benjamin Maxwell
Date: 2024-12-17T10:54:17Z
New Revision: a7dafea384a519342b2fbe210ed101c1e67f3be7
URL: https://github.com/llvm/llvm-project/commit/a7dafea384a519342b2fbe210ed101c1e67f3be7
DIFF: https://github.com/llvm/llvm-project/commit/a7dafea384a519342b2fbe210ed101c1e67f3be7.diff
LOG: [SDAG] Allow folding stack slots into sincos/frexp in more cases (#118117)
This adds a new helper `canFoldStoreIntoLibCallOutputPointers()` to
check that it is safe to fold a store into a node that will expand to a
library call that takes output pointers. This requires checking for two
(independent) properties:
1. The store is not within a CALLSEQ_START..CALLSEQ_END pair
* If it is, the expansion would lead to nested call sequences (which is
invalid)
2. The node does not appear as a predecessor to the store
* If it does, attempting to merge the store into the call would result
in a cycle in the DAG
These two properties are checked as part of the same traversal in
`canFoldStoreIntoLibCallOutputPointers()`.
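For illustration (this example is not part of the commit message; it is adapted
from the new `can_fold_with_call_in_chain` tests added below), the separate
`llvm.sin.f32` and `llvm.cos.f32` calls here now lower to a single `sincosf`
library call that stores directly through %a and %b, even though the stores
share a chain with an earlier, unrelated call:

declare void @foo(ptr, ptr)

define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
entry:
  %sin = tail call float @llvm.sin.f32(float %x)
  %cos = tail call float @llvm.cos.f32(float %x)
  call void @foo(ptr %a, ptr %b)
  store float %sin, ptr %a, align 4
  store float %cos, ptr %b, align 4
  ret void
}

Previously only stores on the entry chain were considered, so the stores above
would have gone through temporary stack slots. Now any shared store chain is
accepted, provided the traversal proves the stores sit outside all call
sequences and that folding them into the expanded call creates no cycle.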
Added:
llvm/test/CodeGen/X86/llvm.sincos.ll
Modified:
llvm/include/llvm/CodeGen/SelectionDAG.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
llvm/test/CodeGen/PowerPC/f128-arith.ll
llvm/test/CodeGen/RISCV/llvm.frexp.ll
llvm/test/CodeGen/X86/llvm.frexp.ll
llvm/test/CodeGen/X86/sincos-stack-args.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index e97e01839f73b4..d63e1c559122b6 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -455,6 +455,9 @@ class SelectionDAG {
// Maximum depth for recursive analysis such as computeKnownBits, etc.
static constexpr unsigned MaxRecursionDepth = 6;
+ // Returns the maximum number of steps for SDNode::hasPredecessor()-like searches.
+ static unsigned getHasPredecessorMaxSteps();
+
explicit SelectionDAG(const TargetMachine &TM, CodeGenOptLevel);
SelectionDAG(const SelectionDAG &) = delete;
SelectionDAG &operator=(const SelectionDAG &) = delete;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4fe5e7416c7871..10fc8eecaff907 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -153,12 +153,6 @@ static cl::opt<bool> EnableVectorFCopySignExtendRound(
"combiner-vector-fcopysign-extend-round", cl::Hidden, cl::init(false),
cl::desc(
"Enable merging extends and rounds into FCOPYSIGN on vector types"));
-
-static cl::opt<unsigned int>
- MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
- cl::desc("DAG combiner limit number of steps when searching DAG "
- "for predecessor nodes"));
-
namespace {
class DAGCombiner {
@@ -18929,6 +18923,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
// can be folded with this one. We should do this to avoid having to keep
// a copy of the original base pointer.
SmallVector<SDNode *, 16> OtherUses;
+ unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
if (isa<ConstantSDNode>(Offset))
for (SDNode::use_iterator UI = BasePtr->use_begin(),
UE = BasePtr->use_end();
@@ -19093,6 +19088,7 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
return false;
SmallPtrSet<const SDNode *, 32> Visited;
+ unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
for (SDNode *Use : BasePtr->uses()) {
if (Use == Ptr.getNode())
continue;
@@ -19139,6 +19135,7 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
// 2) Op must be independent of N, i.e. Op is neither a predecessor
// nor a successor of N. Otherwise, if Op is folded that would
// create a cycle.
+ unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
for (SDNode *Op : Ptr->uses()) {
// Check for #1.
if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 34214550f3a12b..0fb5c4d5c4cb9b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -111,10 +111,17 @@ static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
cl::desc("Number limit for gluing ld/st of memcpy."),
cl::Hidden, cl::init(0));
+static cl::opt<unsigned>
+ MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
+ cl::desc("DAG combiner limit number of steps when searching DAG "
+ "for predecessor nodes"));
+
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
}
+unsigned SelectionDAG::getHasPredecessorMaxSteps() { return MaxSteps; }
+
//===----------------------------------------------------------------------===//
// ConstantFPSDNode Class
//===----------------------------------------------------------------------===//
@@ -2474,6 +2481,51 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
return Subvectors[0];
}
+/// Given a store node \p StoreNode, return true if it is safe to fold that node
+/// into \p FPNode, which expands to a library call with output pointers.
+static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode,
+ SDNode *FPNode) {
+ SmallVector<const SDNode *, 8> Worklist;
+ SmallVector<const SDNode *, 8> DeferredNodes;
+ SmallPtrSet<const SDNode *, 16> Visited;
+
+ // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode).
+ for (SDValue Op : StoreNode->ops())
+ if (Op.getNode() != FPNode)
+ Worklist.push_back(Op.getNode());
+
+ unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
+ while (!Worklist.empty()) {
+ const SDNode *Node = Worklist.pop_back_val();
+ auto [_, Inserted] = Visited.insert(Node);
+ if (!Inserted)
+ continue;
+
+ if (MaxSteps > 0 && Visited.size() >= MaxSteps)
+ return false;
+
+ // Reached the FPNode (would result in a cycle).
+ // OR Reached CALLSEQ_START (would result in nested call sequences).
+ if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
+ return false;
+
+ if (Node->getOpcode() == ISD::CALLSEQ_END) {
+ // Defer looking into call sequences (so we can check we're outside one).
+ // We still need to look through these for the predecessor check.
+ DeferredNodes.push_back(Node);
+ continue;
+ }
+
+ for (SDValue Op : Node->ops())
+ Worklist.push_back(Op.getNode());
+ }
+
+ // True if we're outside a call sequence and don't have the FPNode as a
+ // predecessor. No cycles or nested call sequences possible.
+ return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes,
+ MaxSteps);
+}
+
bool SelectionDAG::expandMultipleResultFPLibCall(
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
std::optional<unsigned> CallRetResNo) {
@@ -2502,11 +2554,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
// Find users of the node that store the results (and share input chains). The
// destination pointers can be used instead of creating stack allocations.
- // FIXME: This should allow stores with the same chains (not just the entry
- // chain), but there's a risk the store is within a (CALLSEQ_START,
- // CALLSEQ_END) pair, which after this expansion will lead to nested call
- // sequences.
- SDValue InChain = getEntryNode();
+ SDValue StoresInChain;
SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
for (SDNode *User : Node->uses()) {
if (!ISD::isNormalStore(User))
@@ -2514,14 +2562,27 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
auto *ST = cast<StoreSDNode>(User);
SDValue StoreValue = ST->getValue();
unsigned ResNo = StoreValue.getResNo();
+ // Ensure the store corresponds to an output pointer.
+ if (CallRetResNo == ResNo)
+ continue;
+ // Ensure the store is to the default address space and is not atomic or volatile.
+ if (!ST->isSimple() || ST->getAddressSpace() != 0)
+ continue;
+ // Ensure all store chains are the same (so they don't alias).
+ if (StoresInChain && ST->getChain() != StoresInChain)
+ continue;
+ // Ensure the store is properly aligned.
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
- if (CallRetResNo == ResNo || !ST->isSimple() ||
- ST->getAddressSpace() != 0 ||
- ST->getAlign() <
- getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
- ST->getChain() != InChain)
+ if (ST->getAlign() <
+ getDataLayout().getABITypeAlign(StoreType->getScalarType()))
+ continue;
+ // Avoid:
+ // 1. Creating cyclic dependencies.
+ // 2. Expanding the node to a call within a call sequence.
+ if (!canFoldStoreIntoLibCallOutputPointers(ST, Node))
continue;
ResultStores[ResNo] = ST;
+ StoresInChain = ST->getChain();
}
TargetLowering::ArgListTy Args;
@@ -2563,6 +2624,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
Type *RetType = CallRetResNo.has_value()
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
: Type::getVoidTy(Ctx);
+ SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
TLI->getPointerTy(getDataLayout()));
TargetLowering::CallLoweringInfo CLI(*this);
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index 8ef8b5d13b62d4..c5fef61c96af3a 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -253,3 +253,37 @@ entry:
store double %cos, ptr %out_cos, align 4
ret void
}
+
+declare void @foo(ptr, ptr)
+
+define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: can_fold_with_call_in_chain:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset b8, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: fmov s8, s0
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: fmov s0, s8
+; CHECK-NEXT: mov x0, x20
+; CHECK-NEXT: mov x1, x19
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ call void @foo(ptr %a, ptr %b)
+ store float %sin, ptr %a, align 4
+ store float %cos, ptr %b, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index 35e5d61947ead7..decc4a38f7ccd4 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
; CHECK-LABEL: qpFREXP:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mflr r0
-; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: stdu r1, -32(r1)
+; CHECK-NEXT: std r0, 48(r1)
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset lr, 16
-; CHECK-NEXT: .cfi_offset r30, -16
-; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT: stdu r1, -64(r1)
-; CHECK-NEXT: std r0, 80(r1)
-; CHECK-NEXT: addi r5, r1, 44
-; CHECK-NEXT: mr r30, r4
; CHECK-NEXT: lxv v2, 0(r3)
+; CHECK-NEXT: mr r5, r4
; CHECK-NEXT: bl frexpf128
; CHECK-NEXT: nop
-; CHECK-NEXT: lwz r3, 44(r1)
-; CHECK-NEXT: stw r3, 0(r30)
-; CHECK-NEXT: addi r1, r1, 64
+; CHECK-NEXT: addi r1, r1, 32
; CHECK-NEXT: ld r0, 16(r1)
-; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: mtlr r0
; CHECK-NEXT: blr
;
; CHECK-P8-LABEL: qpFREXP:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: mflr r0
-; CHECK-P8-NEXT: .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT: stdu r1, -32(r1)
+; CHECK-P8-NEXT: std r0, 48(r1)
+; CHECK-P8-NEXT: .cfi_def_cfa_offset 32
; CHECK-P8-NEXT: .cfi_offset lr, 16
-; CHECK-P8-NEXT: .cfi_offset r30, -16
-; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT: stdu r1, -64(r1)
-; CHECK-P8-NEXT: std r0, 80(r1)
-; CHECK-P8-NEXT: addi r5, r1, 44
-; CHECK-P8-NEXT: mr r30, r4
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT: mr r5, r4
; CHECK-P8-NEXT: xxswapd v2, vs0
; CHECK-P8-NEXT: bl frexpf128
; CHECK-P8-NEXT: nop
-; CHECK-P8-NEXT: lwz r3, 44(r1)
-; CHECK-P8-NEXT: stw r3, 0(r30)
-; CHECK-P8-NEXT: addi r1, r1, 64
+; CHECK-P8-NEXT: addi r1, r1, 32
; CHECK-P8-NEXT: ld r0, 16(r1)
-; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-P8-NEXT: mtlr r0
; CHECK-P8-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 74dec76a02e892..4a77b4d32cdda6 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -543,50 +543,42 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) nounwind {
define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwind {
; RV32IFD-LABEL: test_frexp_v4f32_v4i32:
; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: addi sp, sp, -64
-; RV32IFD-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: addi sp, sp, -48
+; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs1, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs2, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs3, 8(sp) # 8-byte Folded Spill
; RV32IFD-NEXT: fmv.s fs0, fa3
; RV32IFD-NEXT: fmv.s fs1, fa2
; RV32IFD-NEXT: fmv.s fs2, fa1
; RV32IFD-NEXT: mv s0, a0
-; RV32IFD-NEXT: addi a0, sp, 8
+; RV32IFD-NEXT: addi a0, a0, 16
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs3, fa0
-; RV32IFD-NEXT: addi a0, sp, 12
+; RV32IFD-NEXT: addi a0, s0, 20
; RV32IFD-NEXT: fmv.s fa0, fs2
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs2, fa0
-; RV32IFD-NEXT: addi a0, sp, 16
+; RV32IFD-NEXT: addi a0, s0, 24
; RV32IFD-NEXT: fmv.s fa0, fs1
; RV32IFD-NEXT: call frexpf
; RV32IFD-NEXT: fmv.s fs1, fa0
-; RV32IFD-NEXT: addi a0, sp, 20
+; RV32IFD-NEXT: addi a0, s0, 28
; RV32IFD-NEXT: fmv.s fa0, fs0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: lw a0, 8(sp)
-; RV32IFD-NEXT: lw a1, 12(sp)
-; RV32IFD-NEXT: lw a2, 16(sp)
-; RV32IFD-NEXT: lw a3, 20(sp)
-; RV32IFD-NEXT: sw a0, 16(s0)
-; RV32IFD-NEXT: sw a1, 20(s0)
-; RV32IFD-NEXT: sw a2, 24(s0)
-; RV32IFD-NEXT: sw a3, 28(s0)
; RV32IFD-NEXT: fsw fs3, 0(s0)
; RV32IFD-NEXT: fsw fs2, 4(s0)
; RV32IFD-NEXT: fsw fs1, 8(s0)
; RV32IFD-NEXT: fsw fa0, 12(s0)
-; RV32IFD-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: addi sp, sp, 64
+; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs1, 24(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs2, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs3, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: addi sp, sp, 48
; RV32IFD-NEXT: ret
;
; RV64IFD-LABEL: test_frexp_v4f32_v4i32:
@@ -639,52 +631,44 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
;
; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32IZFINXZDINX-NEXT: mv s0, a4
; RV32IZFINXZDINX-NEXT: mv s1, a3
; RV32IZFINXZDINX-NEXT: mv s2, a2
; RV32IZFINXZDINX-NEXT: mv a2, a1
; RV32IZFINXZDINX-NEXT: mv s3, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
+; RV32IZFINXZDINX-NEXT: addi a1, a0, 16
; RV32IZFINXZDINX-NEXT: mv a0, a2
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s4, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 20
; RV32IZFINXZDINX-NEXT: mv a0, s2
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s2, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 16
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 24
; RV32IZFINXZDINX-NEXT: mv a0, s1
; RV32IZFINXZDINX-NEXT: call frexpf
; RV32IZFINXZDINX-NEXT: mv s1, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 28
; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: lw a1, 8(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 16(sp)
-; RV32IZFINXZDINX-NEXT: lw a4, 20(sp)
-; RV32IZFINXZDINX-NEXT: sw a1, 16(s3)
-; RV32IZFINXZDINX-NEXT: sw a2, 20(s3)
-; RV32IZFINXZDINX-NEXT: sw a3, 24(s3)
-; RV32IZFINXZDINX-NEXT: sw a4, 28(s3)
; RV32IZFINXZDINX-NEXT: sw s4, 0(s3)
; RV32IZFINXZDINX-NEXT: sw s2, 4(s3)
; RV32IZFINXZDINX-NEXT: sw s1, 8(s3)
; RV32IZFINXZDINX-NEXT: sw a0, 12(s3)
-; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: addi sp, sp, 32
; RV32IZFINXZDINX-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
@@ -1096,41 +1080,34 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
; RV32IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; RV32IFD: # %bb.0:
-; RV32IFD-NEXT: addi sp, sp, -48
-; RV32IFD-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs1, 24(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fsd fs2, 16(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT: fmv.s fs0, fa3
-; RV32IFD-NEXT: fmv.s fs1, fa2
-; RV32IFD-NEXT: fmv.s fs2, fa1
+; RV32IFD-NEXT: addi sp, sp, -32
+; RV32IFD-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT: fmv.s fs0, fa2
+; RV32IFD-NEXT: fmv.s fs1, fa1
+; RV32IFD-NEXT: fmv.s fs2, fa0
; RV32IFD-NEXT: mv s0, a0
-; RV32IFD-NEXT: mv a0, sp
+; RV32IFD-NEXT: addi a0, a0, 12
+; RV32IFD-NEXT: fmv.s fa0, fa3
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 4
-; RV32IFD-NEXT: fmv.s fa0, fs2
+; RV32IFD-NEXT: addi a0, s0, 8
+; RV32IFD-NEXT: fmv.s fa0, fs0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 8
+; RV32IFD-NEXT: addi a0, s0, 4
; RV32IFD-NEXT: fmv.s fa0, fs1
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: addi a0, sp, 12
-; RV32IFD-NEXT: fmv.s fa0, fs0
+; RV32IFD-NEXT: fmv.s fa0, fs2
+; RV32IFD-NEXT: mv a0, s0
; RV32IFD-NEXT: call frexpf
-; RV32IFD-NEXT: lw a0, 0(sp)
-; RV32IFD-NEXT: lw a1, 4(sp)
-; RV32IFD-NEXT: lw a2, 8(sp)
-; RV32IFD-NEXT: lw a3, 12(sp)
-; RV32IFD-NEXT: sw a0, 0(s0)
-; RV32IFD-NEXT: sw a1, 4(s0)
-; RV32IFD-NEXT: sw a2, 8(s0)
-; RV32IFD-NEXT: sw a3, 12(s0)
-; RV32IFD-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs1, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: fld fs2, 16(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT: addi sp, sp, 48
+; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT: addi sp, sp, 32
; RV32IFD-NEXT: ret
;
; RV64IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
@@ -1174,43 +1151,34 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
;
; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT: mv s0, a4
-; RV32IZFINXZDINX-NEXT: mv s1, a3
-; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv a2, a1
+; RV32IZFINXZDINX-NEXT: addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT: mv s0, a3
+; RV32IZFINXZDINX-NEXT: mv s1, a2
+; RV32IZFINXZDINX-NEXT: mv s2, a1
; RV32IZFINXZDINX-NEXT: mv s3, a0
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
-; RV32IZFINXZDINX-NEXT: mv a0, a2
+; RV32IZFINXZDINX-NEXT: addi a1, a0, 12
+; RV32IZFINXZDINX-NEXT: mv a0, a4
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 16
-; RV32IZFINXZDINX-NEXT: mv a0, s2
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 8
+; RV32IZFINXZDINX-NEXT: mv a0, s0
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT: addi a1, s3, 4
; RV32IZFINXZDINX-NEXT: mv a0, s1
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: addi a1, sp, 24
-; RV32IZFINXZDINX-NEXT: mv a0, s0
+; RV32IZFINXZDINX-NEXT: mv a0, s2
+; RV32IZFINXZDINX-NEXT: mv a1, s3
; RV32IZFINXZDINX-NEXT: call frexpf
-; RV32IZFINXZDINX-NEXT: lw a0, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a1, 16(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 20(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 24(sp)
-; RV32IZFINXZDINX-NEXT: sw a0, 0(s3)
-; RV32IZFINXZDINX-NEXT: sw a1, 4(s3)
-; RV32IZFINXZDINX-NEXT: sw a2, 8(s3)
-; RV32IZFINXZDINX-NEXT: sw a3, 12(s3)
-; RV32IZFINXZDINX-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT: addi sp, sp, 32
; RV32IZFINXZDINX-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll
index cd560ad627de4c..96de34519556d0 100644
--- a/llvm/test/CodeGen/X86/llvm.frexp.ll
+++ b/llvm/test/CodeGen/X86/llvm.frexp.ll
@@ -325,28 +325,27 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) {
;
; WIN32-LABEL: test_frexp_v4f32_v4i32:
; WIN32: # %bb.0:
-; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $60, %esp
+; WIN32-NEXT: subl $44, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 24(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
; WIN32-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 20(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
; WIN32-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 16(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 28(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
@@ -361,22 +360,13 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) {
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl %edi, 28(%esi)
-; WIN32-NEXT: movl %edx, 24(%esi)
-; WIN32-NEXT: movl %ecx, 20(%esi)
-; WIN32-NEXT: movl %eax, 16(%esi)
; WIN32-NEXT: fstps 12(%esi)
; WIN32-NEXT: fstps 8(%esi)
; WIN32-NEXT: fstps 4(%esi)
; WIN32-NEXT: fstps (%esi)
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: addl $60, %esp
+; WIN32-NEXT: addl $44, %esp
; WIN32-NEXT: popl %esi
-; WIN32-NEXT: popl %edi
; WIN32-NEXT: retl
%result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
ret { <4 x float>, <4 x i32> } %result
@@ -499,46 +489,35 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) {
;
; WIN32-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
; WIN32: # %bb.0:
-; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: subl $28, %esp
+; WIN32-NEXT: subl $12, %esp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 8(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
; WIN32-NEXT: fstp %st(0)
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 4(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
; WIN32-NEXT: fstp %st(0)
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: leal 12(%esi), %eax
; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
; WIN32-NEXT: fstp %st(0)
-; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %esi, {{[0-9]+}}(%esp)
; WIN32-NEXT: flds {{[0-9]+}}(%esp)
; WIN32-NEXT: fstpl (%esp)
; WIN32-NEXT: calll _frexp
; WIN32-NEXT: fstp %st(0)
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl %edi, 12(%esi)
-; WIN32-NEXT: movl %edx, 8(%esi)
-; WIN32-NEXT: movl %ecx, 4(%esi)
-; WIN32-NEXT: movl %eax, (%esi)
; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: addl $28, %esp
+; WIN32-NEXT: addl $12, %esp
; WIN32-NEXT: popl %esi
-; WIN32-NEXT: popl %edi
; WIN32-NEXT: retl
%result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
%result.1 = extractvalue { <4 x float>, <4 x i32> } %result, 1
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.ll b/llvm/test/CodeGen/X86/llvm.sincos.ll
new file mode 100644
index 00000000000000..a429314630e56b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llvm.sincos.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+
+define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: test_sincos_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: subl $52, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset %esi, -12
+; CHECK-NEXT: .cfi_offset %edi, -8
+; CHECK-NEXT: movl 84(%esp), %esi
+; CHECK-NEXT: flds 76(%esp)
+; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: flds 64(%esp)
+; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: flds 72(%esp)
+; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: flds 68(%esp)
+; CHECK-NEXT: movl 80(%esp), %edi
+; CHECK-NEXT: leal 40(%esp), %eax
+; CHECK-NEXT: movl %eax, 8(%esp)
+; CHECK-NEXT: leal 4(%edi), %eax
+; CHECK-NEXT: movl %eax, 4(%esp)
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll sincosf
+; CHECK-NEXT: leal 44(%esp), %eax
+; CHECK-NEXT: movl %eax, 8(%esp)
+; CHECK-NEXT: leal 8(%edi), %eax
+; CHECK-NEXT: movl %eax, 4(%esp)
+; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll sincosf
+; CHECK-NEXT: leal 36(%esp), %eax
+; CHECK-NEXT: movl %eax, 8(%esp)
+; CHECK-NEXT: movl %edi, 4(%esp)
+; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll sincosf
+; CHECK-NEXT: leal 48(%esp), %eax
+; CHECK-NEXT: movl %eax, 8(%esp)
+; CHECK-NEXT: addl $12, %edi
+; CHECK-NEXT: movl %edi, 4(%esp)
+; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll sincosf
+; CHECK-NEXT: flds 36(%esp)
+; CHECK-NEXT: flds 40(%esp)
+; CHECK-NEXT: flds 44(%esp)
+; CHECK-NEXT: flds 48(%esp)
+; CHECK-NEXT: fstps 12(%esi)
+; CHECK-NEXT: fstps 8(%esi)
+; CHECK-NEXT: fstps 4(%esi)
+; CHECK-NEXT: fstps (%esi)
+; CHECK-NEXT: addl $52, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+ %result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
+ %result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
+ %result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
+ store <4 x float> %result.0, ptr %out_sin, align 4
+ store <4 x float> %result.1, ptr %out_cos, align 4
+ ret void
+}
+
+define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
+; CHECK-LABEL: test_sincos_v2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: subl $52, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: .cfi_offset %esi, -12
+; CHECK-NEXT: .cfi_offset %edi, -8
+; CHECK-NEXT: movl 84(%esp), %esi
+; CHECK-NEXT: fldl 72(%esp)
+; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; CHECK-NEXT: fldl 64(%esp)
+; CHECK-NEXT: movl 80(%esp), %edi
+; CHECK-NEXT: leal 24(%esp), %eax
+; CHECK-NEXT: movl %eax, 12(%esp)
+; CHECK-NEXT: movl %edi, 8(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: calll sincos
+; CHECK-NEXT: leal 32(%esp), %eax
+; CHECK-NEXT: movl %eax, 12(%esp)
+; CHECK-NEXT: addl $8, %edi
+; CHECK-NEXT: movl %edi, 8(%esp)
+; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: calll sincos
+; CHECK-NEXT: fldl 24(%esp)
+; CHECK-NEXT: fldl 32(%esp)
+; CHECK-NEXT: fstpl 8(%esi)
+; CHECK-NEXT: fstpl (%esi)
+; CHECK-NEXT: addl $52, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+ %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
+ %result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
+ %result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
+ store <2 x double> %result.0, ptr %out_sin, align 8
+ store <2 x double> %result.1, ptr %out_cos, align 8
+ ret void
+}
+
+declare void @foo(ptr, ptr)
+
+define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: can_fold_with_call_in_chain:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: subl $20, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset %esi, -12
+; CHECK-NEXT: .cfi_offset %edi, -8
+; CHECK-NEXT: flds 32(%esp)
+; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT: movl 36(%esp), %edi
+; CHECK-NEXT: movl 40(%esp), %esi
+; CHECK-NEXT: movl %esi, 4(%esp)
+; CHECK-NEXT: movl %edi, (%esp)
+; CHECK-NEXT: calll foo@PLT
+; CHECK-NEXT: leal 16(%esp), %eax
+; CHECK-NEXT: movl %eax, 8(%esp)
+; CHECK-NEXT: movl %edi, 4(%esp)
+; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll sincosf
+; CHECK-NEXT: flds 16(%esp)
+; CHECK-NEXT: fstps (%esi)
+; CHECK-NEXT: addl $20, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 12
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 4
+; CHECK-NEXT: retl
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ call void @foo(ptr %a, ptr %b)
+ store float %sin, ptr %a, align 4
+ store float %cos, ptr %b, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/sincos-stack-args.ll b/llvm/test/CodeGen/X86/sincos-stack-args.ll
index 9fb3a6769fda11..fe15330833334d 100644
--- a/llvm/test/CodeGen/X86/sincos-stack-args.ll
+++ b/llvm/test/CodeGen/X86/sincos-stack-args.ll
@@ -4,8 +4,11 @@
declare double @g(double, double)
-define double @f(double %a) {
-; CHECK-LABEL: f:
+; Though not visible within the IR, this will lower to an FSINCOS node whose
+; store users are within a (callseq_start, callseq_end) pair. In this case,
+; the stores cannot be folded into the sincos call.
+define double @negative_sincos_with_stores_within_call_sequence(double %a) {
+; CHECK-LABEL: negative_sincos_with_stores_within_call_sequence:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subl $44, %esp
; CHECK-NEXT: .cfi_def_cfa_offset 48