[llvm] [SDAG] Allow folding stack slots into sincos/frexp in more cases (PR #118117)

Tue Dec 3 08:16:29 PST 2024

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/118117

>From be1ca7edf527e9fb2d3d32db7272de48e9649f8f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 13 Nov 2024 16:06:55 +0000
Subject: [PATCH 1/2] [SDAG] Allow folding stack slots into sincos/frexp in
 more cases

This adds a new helper `CanFoldStoreIntoFPLibCall()` to check that it is
safe to fold a store into a node that will expand to a library call that
takes output pointers. This requires checking for two (independent)
properties:

1. The store is not within a CALLSEQ_START..CALLSEQ_END pair
 * If it is, the expansion would lead to nested call sequences (which is
   invalid)
2. The node does not appear as a predecessor to the store
 * If it does, attempting to merge the store into the call would result
   in a cycle in the DAG

These two properties are checked as part of the same traversal in
`CanFoldStoreIntoFPLibCall()`
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  62 +++++-
 .../CodeGen/AArch64/sincos-stack-slots.ll     |  34 ++++
 llvm/test/CodeGen/PowerPC/f128-arith.ll       |  32 +--
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         | 192 ++++++++----------
 llvm/test/CodeGen/X86/llvm.frexp.ll           |  45 ++--
 llvm/test/CodeGen/X86/sincos-stack-args.ll    |  45 ++++
 6 files changed, 236 insertions(+), 174 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 182529123ec6d8..8ddb4bcb8fb212 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2474,6 +2474,45 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
   return Subvectors[0];
 }
 
+/// Given a store node \p StoreNode, return true if it is safe to fold that node
+/// into \p FPNode, which expands to a library call with output pointers.
+static bool CanFoldStoreIntoFPLibCall(StoreSDNode *StoreNode, SDNode *FPNode) {
+  SmallVector<const SDNode *, 8> Worklist;
+  SmallVector<const SDNode *, 8> DeferredNodes;
+  SmallPtrSet<const SDNode *, 16> Visited;
+
+  // Skip FPNode use by StoreNode (that's the use we want to fold into FPNode).
+  for (SDValue Op : StoreNode->ops())
+    if (Op.getNode() != FPNode)
+      Worklist.push_back(Op.getNode());
+
+  while (!Worklist.empty()) {
+    const SDNode *Node = Worklist.pop_back_val();
+    auto [_, Inserted] = Visited.insert(Node);
+    if (!Inserted)
+      continue;
+
+    // Reached the FPNode (would result in a cycle).
+    // OR Reached CALLSEQ_START (would result in nested call sequences).
+    if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
+      return false;
+
+    if (Node->getOpcode() == ISD::CALLSEQ_END) {
+      // Defer looking into call sequences (so we can check we're outside one).
+      // We still need to look through these for the predecessor check.
+      DeferredNodes.push_back(Node);
+      continue;
+    }
+
+    for (SDValue Op : Node->ops())
+      Worklist.push_back(Op.getNode());
+  }
+
+  // True if we're outside a call sequence and don't have the FPNode as a
+  // predecessor. No cycles or nested call sequences possible.
+  return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes);
+}
+
 bool SelectionDAG::expandMultipleResultFPLibCall(
     RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
     std::optional<unsigned> CallRetResNo) {
@@ -2502,11 +2541,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
 
   // Find users of the node that store the results (and share input chains). The
   // destination pointers can be used instead of creating stack allocations.
-  // FIXME: This should allow stores with the same chains (not just the entry
-  // chain), but there's a risk the store is within a (CALLSEQ_START,
-  // CALLSEQ_END) pair, which after this expansion will lead to nested call
-  // sequences.
-  SDValue InChain = getEntryNode();
+  SDValue StoresInChain{};
   SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
   for (SDNode *User : Node->uses()) {
     if (!ISD::isNormalStore(User))
@@ -2515,13 +2550,25 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
     SDValue StoreValue = ST->getValue();
     unsigned ResNo = StoreValue.getResNo();
     Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
-    if (CallRetResNo == ResNo || !ST->isSimple() ||
+    if (
+        // Ensure the store corresponds to an output pointer.
+        CallRetResNo == ResNo ||
+        // Ensure the store is not atomic or volatile.
+        !ST->isSimple() ||
+        // Ensure the store is in the default address space.
         ST->getAddressSpace() != 0 ||
+        // Ensure the store is properly aligned.
         ST->getAlign() <
             getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
-        ST->getChain() != InChain)
+        // Ensure all store chains are the same (so they don't alias).
+        (StoresInChain && ST->getChain() != StoresInChain)
+        // Avoid:
+        //  1. Creating cyclic dependencies.
+        //  2. Expanding the node to a call within a call sequence.
+        || !CanFoldStoreIntoFPLibCall(ST, Node))
       continue;
     ResultStores[ResNo] = ST;
+    StoresInChain = ST->getChain();
   }
 
   TargetLowering::ArgListTy Args;
@@ -2563,6 +2610,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
   Type *RetType = CallRetResNo.has_value()
                       ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
                       : Type::getVoidTy(Ctx);
+  SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
   SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
                                      TLI->getPointerTy(getDataLayout()));
   TargetLowering::CallLoweringInfo CLI(*this);
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index 8ef8b5d13b62d4..c5fef61c96af3a 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -253,3 +253,37 @@ entry:
   store double %cos, ptr %out_cos, align 4
   ret void
 }
+
+declare void @foo(ptr, ptr)
+
+define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: can_fold_with_call_in_chain:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str d8, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset b8, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    fmov s8, s0
+; CHECK-NEXT:    bl foo
+; CHECK-NEXT:    fmov s0, s8
+; CHECK-NEXT:    mov x0, x20
+; CHECK-NEXT:    mov x1, x19
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr d8, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  call void @foo(ptr %a, ptr %b)
+  store float %sin, ptr %a, align 4
+  store float %cos, ptr %b, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index 35e5d61947ead7..decc4a38f7ccd4 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
 ; CHECK-LABEL: qpFREXP:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset r30, -16
-; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    stdu r1, -64(r1)
-; CHECK-NEXT:    std r0, 80(r1)
-; CHECK-NEXT:    addi r5, r1, 44
-; CHECK-NEXT:    mr r30, r4
 ; CHECK-NEXT:    lxv v2, 0(r3)
+; CHECK-NEXT:    mr r5, r4
 ; CHECK-NEXT:    bl frexpf128
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lwz r3, 44(r1)
-; CHECK-NEXT:    stw r3, 0(r30)
-; CHECK-NEXT:    addi r1, r1, 64
+; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
-; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mtlr r0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P8-LABEL: qpFREXP:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    stdu r1, -64(r1)
-; CHECK-P8-NEXT:    std r0, 80(r1)
-; CHECK-P8-NEXT:    addi r5, r1, 44
-; CHECK-P8-NEXT:    mr r30, r4
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    mr r5, r4
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl frexpf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    lwz r3, 44(r1)
-; CHECK-P8-NEXT:    stw r3, 0(r30)
-; CHECK-P8-NEXT:    addi r1, r1, 64
+; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 74dec76a02e892..4a77b4d32cdda6 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -543,50 +543,42 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) nounwind {
 define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwind {
 ; RV32IFD-LABEL: test_frexp_v4f32_v4i32:
 ; RV32IFD:       # %bb.0:
-; RV32IFD-NEXT:    addi sp, sp, -64
-; RV32IFD-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs0, 48(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs1, 40(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs2, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs3, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    addi sp, sp, -48
+; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs0, 32(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs1, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs2, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs3, 8(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    fmv.s fs0, fa3
 ; RV32IFD-NEXT:    fmv.s fs1, fa2
 ; RV32IFD-NEXT:    fmv.s fs2, fa1
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    addi a0, sp, 8
+; RV32IFD-NEXT:    addi a0, a0, 16
 ; RV32IFD-NEXT:    call frexpf
 ; RV32IFD-NEXT:    fmv.s fs3, fa0
-; RV32IFD-NEXT:    addi a0, sp, 12
+; RV32IFD-NEXT:    addi a0, s0, 20
 ; RV32IFD-NEXT:    fmv.s fa0, fs2
 ; RV32IFD-NEXT:    call frexpf
 ; RV32IFD-NEXT:    fmv.s fs2, fa0
-; RV32IFD-NEXT:    addi a0, sp, 16
+; RV32IFD-NEXT:    addi a0, s0, 24
 ; RV32IFD-NEXT:    fmv.s fa0, fs1
 ; RV32IFD-NEXT:    call frexpf
 ; RV32IFD-NEXT:    fmv.s fs1, fa0
-; RV32IFD-NEXT:    addi a0, sp, 20
+; RV32IFD-NEXT:    addi a0, s0, 28
 ; RV32IFD-NEXT:    fmv.s fa0, fs0
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    lw a0, 8(sp)
-; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 16(sp)
-; RV32IFD-NEXT:    lw a3, 20(sp)
-; RV32IFD-NEXT:    sw a0, 16(s0)
-; RV32IFD-NEXT:    sw a1, 20(s0)
-; RV32IFD-NEXT:    sw a2, 24(s0)
-; RV32IFD-NEXT:    sw a3, 28(s0)
 ; RV32IFD-NEXT:    fsw fs3, 0(s0)
 ; RV32IFD-NEXT:    fsw fs2, 4(s0)
 ; RV32IFD-NEXT:    fsw fs1, 8(s0)
 ; RV32IFD-NEXT:    fsw fa0, 12(s0)
-; RV32IFD-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    fld fs0, 48(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs1, 40(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs2, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs3, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    addi sp, sp, 64
+; RV32IFD-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    fld fs0, 32(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs1, 24(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs2, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs3, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    addi sp, sp, 48
 ; RV32IFD-NEXT:    ret
 ;
 ; RV64IFD-LABEL: test_frexp_v4f32_v4i32:
@@ -639,52 +631,44 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ;
 ; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
 ; RV32IZFINXZDINX:       # %bb.0:
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
 ; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
+; RV32IZFINXZDINX-NEXT:    addi a1, a0, 16
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    mv s4, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 12
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 20
 ; RV32IZFINXZDINX-NEXT:    mv a0, s2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    mv s2, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 16
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 24
 ; RV32IZFINXZDINX-NEXT:    mv a0, s1
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    mv s1, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 28
 ; RV32IZFINXZDINX-NEXT:    mv a0, s0
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    lw a1, 8(sp)
-; RV32IZFINXZDINX-NEXT:    lw a2, 12(sp)
-; RV32IZFINXZDINX-NEXT:    lw a3, 16(sp)
-; RV32IZFINXZDINX-NEXT:    lw a4, 20(sp)
-; RV32IZFINXZDINX-NEXT:    sw a1, 16(s3)
-; RV32IZFINXZDINX-NEXT:    sw a2, 20(s3)
-; RV32IZFINXZDINX-NEXT:    sw a3, 24(s3)
-; RV32IZFINXZDINX-NEXT:    sw a4, 28(s3)
 ; RV32IZFINXZDINX-NEXT:    sw s4, 0(s3)
 ; RV32IZFINXZDINX-NEXT:    sw s2, 4(s3)
 ; RV32IZFINXZDINX-NEXT:    sw s1, 8(s3)
 ; RV32IZFINXZDINX-NEXT:    sw a0, 12(s3)
-; RV32IZFINXZDINX-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, 32
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
@@ -1096,41 +1080,34 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
 ; RV32IFD:       # %bb.0:
-; RV32IFD-NEXT:    addi sp, sp, -48
-; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs0, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs1, 24(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs2, 16(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fmv.s fs0, fa3
-; RV32IFD-NEXT:    fmv.s fs1, fa2
-; RV32IFD-NEXT:    fmv.s fs2, fa1
+; RV32IFD-NEXT:    addi sp, sp, -32
+; RV32IFD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs1, 8(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs2, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fmv.s fs0, fa2
+; RV32IFD-NEXT:    fmv.s fs1, fa1
+; RV32IFD-NEXT:    fmv.s fs2, fa0
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    mv a0, sp
+; RV32IFD-NEXT:    addi a0, a0, 12
+; RV32IFD-NEXT:    fmv.s fa0, fa3
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    addi a0, sp, 4
-; RV32IFD-NEXT:    fmv.s fa0, fs2
+; RV32IFD-NEXT:    addi a0, s0, 8
+; RV32IFD-NEXT:    fmv.s fa0, fs0
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    addi a0, sp, 8
+; RV32IFD-NEXT:    addi a0, s0, 4
 ; RV32IFD-NEXT:    fmv.s fa0, fs1
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    addi a0, sp, 12
-; RV32IFD-NEXT:    fmv.s fa0, fs0
+; RV32IFD-NEXT:    fmv.s fa0, fs2
+; RV32IFD-NEXT:    mv a0, s0
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    lw a0, 0(sp)
-; RV32IFD-NEXT:    lw a1, 4(sp)
-; RV32IFD-NEXT:    lw a2, 8(sp)
-; RV32IFD-NEXT:    lw a3, 12(sp)
-; RV32IFD-NEXT:    sw a0, 0(s0)
-; RV32IFD-NEXT:    sw a1, 4(s0)
-; RV32IFD-NEXT:    sw a2, 8(s0)
-; RV32IFD-NEXT:    sw a3, 12(s0)
-; RV32IFD-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    fld fs0, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs1, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs2, 16(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    addi sp, sp, 48
+; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs1, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs2, 0(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
 ;
 ; RV64IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
@@ -1174,43 +1151,34 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ;
 ; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
 ; RV32IZFINXZDINX:       # %bb.0:
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    mv s0, a4
-; RV32IZFINXZDINX-NEXT:    mv s1, a3
-; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv a2, a1
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    mv s0, a3
+; RV32IZFINXZDINX-NEXT:    mv s1, a2
+; RV32IZFINXZDINX-NEXT:    mv s2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 12
-; RV32IZFINXZDINX-NEXT:    mv a0, a2
+; RV32IZFINXZDINX-NEXT:    addi a1, a0, 12
+; RV32IZFINXZDINX-NEXT:    mv a0, a4
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 16
-; RV32IZFINXZDINX-NEXT:    mv a0, s2
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 8
+; RV32IZFINXZDINX-NEXT:    mv a0, s0
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 4
 ; RV32IZFINXZDINX-NEXT:    mv a0, s1
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 24
-; RV32IZFINXZDINX-NEXT:    mv a0, s0
+; RV32IZFINXZDINX-NEXT:    mv a0, s2
+; RV32IZFINXZDINX-NEXT:    mv a1, s3
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    lw a0, 12(sp)
-; RV32IZFINXZDINX-NEXT:    lw a1, 16(sp)
-; RV32IZFINXZDINX-NEXT:    lw a2, 20(sp)
-; RV32IZFINXZDINX-NEXT:    lw a3, 24(sp)
-; RV32IZFINXZDINX-NEXT:    sw a0, 0(s3)
-; RV32IZFINXZDINX-NEXT:    sw a1, 4(s3)
-; RV32IZFINXZDINX-NEXT:    sw a2, 8(s3)
-; RV32IZFINXZDINX-NEXT:    sw a3, 12(s3)
-; RV32IZFINXZDINX-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, 32
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll
index cd560ad627de4c..96de34519556d0 100644
--- a/llvm/test/CodeGen/X86/llvm.frexp.ll
+++ b/llvm/test/CodeGen/X86/llvm.frexp.ll
@@ -325,28 +325,27 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) {
 ;
 ; WIN32-LABEL: test_frexp_v4f32_v4i32:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $60, %esp
+; WIN32-NEXT:    subl $44, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 24(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 20(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 16(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 28(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
@@ -361,22 +360,13 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) {
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %edi, 28(%esi)
-; WIN32-NEXT:    movl %edx, 24(%esi)
-; WIN32-NEXT:    movl %ecx, 20(%esi)
-; WIN32-NEXT:    movl %eax, 16(%esi)
 ; WIN32-NEXT:    fstps 12(%esi)
 ; WIN32-NEXT:    fstps 8(%esi)
 ; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    fstps (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    addl $60, %esp
+; WIN32-NEXT:    addl $44, %esp
 ; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    retl
   %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
   ret { <4 x float>, <4 x i32> } %result
@@ -499,46 +489,35 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) {
 ;
 ; WIN32-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $28, %esp
+; WIN32-NEXT:    subl $12, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 8(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 4(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 12(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %edi, 12(%esi)
-; WIN32-NEXT:    movl %edx, 8(%esi)
-; WIN32-NEXT:    movl %ecx, 4(%esi)
-; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    addl $28, %esp
+; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    retl
   %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
   %result.1 = extractvalue { <4 x float>, <4 x i32> } %result, 1
diff --git a/llvm/test/CodeGen/X86/sincos-stack-args.ll b/llvm/test/CodeGen/X86/sincos-stack-args.ll
index 9fb3a6769fda11..a76cdf54ab63f7 100644
--- a/llvm/test/CodeGen/X86/sincos-stack-args.ll
+++ b/llvm/test/CodeGen/X86/sincos-stack-args.ll
@@ -33,3 +33,48 @@ entry:
   %call = tail call double @g(double %add, double %0)
   ret double %call
 }
+
+declare void @foo(ptr, ptr)
+
+define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
+; CHECK-LABEL: can_fold_with_call_in_chain:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    subl $20, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset %esi, -12
+; CHECK-NEXT:    .cfi_offset %edi, -8
+; CHECK-NEXT:    flds 32(%esp)
+; CHECK-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    movl 36(%esp), %edi
+; CHECK-NEXT:    movl 40(%esp), %esi
+; CHECK-NEXT:    movl %esi, 4(%esp)
+; CHECK-NEXT:    movl %edi, (%esp)
+; CHECK-NEXT:    calll foo at PLT
+; CHECK-NEXT:    leal 16(%esp), %eax
+; CHECK-NEXT:    movl %eax, 8(%esp)
+; CHECK-NEXT:    movl %edi, 4(%esp)
+; CHECK-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; CHECK-NEXT:    fstps (%esp)
+; CHECK-NEXT:    calll sincosf
+; CHECK-NEXT:    flds 16(%esp)
+; CHECK-NEXT:    fstps (%esi)
+; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  call void @foo(ptr %a, ptr %b)
+  store float %sin, ptr %a, align 4
+  store float %cos, ptr %b, align 4
+  ret void
+}
+

>From b2c080c9440ba2d76555d84b0b808aa9a54b3564 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 3 Dec 2024 16:12:42 +0000
Subject: [PATCH 2/2] Rename function

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8ddb4bcb8fb212..9a2725cce4fadf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2476,7 +2476,8 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
 
 /// Given a store node \p StoreNode, return true if it is safe to fold that node
 /// into \p FPNode, which expands to a library call with output pointers.
-static bool CanFoldStoreIntoFPLibCall(StoreSDNode *StoreNode, SDNode *FPNode) {
+static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode,
+                                                  SDNode *FPNode) {
   SmallVector<const SDNode *, 8> Worklist;
   SmallVector<const SDNode *, 8> DeferredNodes;
   SmallPtrSet<const SDNode *, 16> Visited;
@@ -2561,11 +2562,11 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
         ST->getAlign() <
             getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
         // Ensure all store chains are the same (so they don't alias).
-        (StoresInChain && ST->getChain() != StoresInChain)
+        (StoresInChain && ST->getChain() != StoresInChain) ||
         // Avoid:
         //  1. Creating cyclic dependencies.
         //  2. Expanding the node to a call within a call sequence.
-        || !CanFoldStoreIntoFPLibCall(ST, Node))
+        !canFoldStoreIntoLibCallOutputPointers(ST, Node))
       continue;
     ResultStores[ResNo] = ST;
     StoresInChain = ST->getChain();