[llvm] [SDAG] Merge multiple-result libcall expansion into DAG.expandMultipleResultFPLibCall() (PR #114792)

Tue Nov 5 07:35:44 PST 2024

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/114792

>From 65858296963eeb5105dd14b9b148182b0636a9e5 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 1 Nov 2024 17:44:24 +0000
Subject: [PATCH 1/6] [SDAG] Merge multiple-result libcall expansion into
 DAG.expandMultipleResultFPLibCall()

This merges the logic for expanding both FFREXP and FSINCOS into one
method `DAG.expandMultipleResultFPLibCall()`. This reduces duplication
and also allows FFREXP to benefit from the stack slot elimination
implemented for FSINCOS. This method will also be used in future to
implement more multiple-result intrinsics (such as modf and sincospi).
---
 llvm/include/llvm/CodeGen/SelectionDAG.h      | 11 ++-
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 52 ++---------
 .../SelectionDAG/LegalizeVectorOps.cpp        | 18 +++-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 86 +++++++++++++------
 llvm/test/CodeGen/X86/llvm.frexp.ll           | 45 +++-------
 5 files changed, 101 insertions(+), 111 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index e03773f46ae092..9035aa3ea31278 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -34,6 +34,7 @@
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/RuntimeLibcalls.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/ArrayRecycler.h"
 #include "llvm/Support/CodeGen.h"
@@ -1595,8 +1596,14 @@ class SelectionDAG {
   SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
                               SDValue Op2);
 
-  /// Expand the specified \c ISD::FSINCOS node as the Legalize pass would.
-  bool expandFSINCOS(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+  /// Expands a node with multiple results to an FP or vector libcall. The
+  /// libcall is expected to take all the operands of the \p Node followed by
+  /// output pointers for each of the results. \p CallRetResNo can be optionally
+  /// set to indicate that one of the results comes from the libcall's return
+  /// value.
+  bool expandMultipleResultFPLibCall(RTLIB::Libcall LC, SDNode *Node,
+                                     SmallVectorImpl<SDValue> &Results,
+                                     std::optional<unsigned> CallRetResNo = {});
 
   /// Expand the specified \c ISD::VAARG node as the Legalize pass would.
   SDValue expandVAArg(SDNode *Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 142774ef4f2e40..6c160e6e90a1fb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -132,7 +132,6 @@ class SelectionDAGLegalize {
                         TargetLowering::ArgListTy &&Args, bool isSigned);
   std::pair<SDValue, SDValue> ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
 
-  void ExpandFrexpLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall LC,
                        SmallVectorImpl<SDValue> &Results);
   void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
@@ -2144,47 +2143,6 @@ std::pair<SDValue, SDValue> SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall L
   return ExpandLibCall(LC, Node, std::move(Args), isSigned);
 }
 
-void SelectionDAGLegalize::ExpandFrexpLibCall(
-    SDNode *Node, SmallVectorImpl<SDValue> &Results) {
-  SDLoc dl(Node);
-  EVT VT = Node->getValueType(0);
-  EVT ExpVT = Node->getValueType(1);
-
-  SDValue FPOp = Node->getOperand(0);
-
-  EVT ArgVT = FPOp.getValueType();
-  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
-  TargetLowering::ArgListEntry FPArgEntry;
-  FPArgEntry.Node = FPOp;
-  FPArgEntry.Ty = ArgTy;
-
-  SDValue StackSlot = DAG.CreateStackTemporary(ExpVT);
-  TargetLowering::ArgListEntry PtrArgEntry;
-  PtrArgEntry.Node = StackSlot;
-  PtrArgEntry.Ty = PointerType::get(*DAG.getContext(),
-                                    DAG.getDataLayout().getAllocaAddrSpace());
-
-  TargetLowering::ArgListTy Args = {FPArgEntry, PtrArgEntry};
-
-  RTLIB::Libcall LC = RTLIB::getFREXP(VT);
-  auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
-
-  // FIXME: Get type of int for libcall declaration and cast
-
-  int FrameIdx = cast<FrameIndexSDNode>(StackSlot)->getIndex();
-  auto PtrInfo =
-      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
-
-  SDValue LoadExp = DAG.getLoad(ExpVT, dl, Chain, StackSlot, PtrInfo);
-  SDValue OutputChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                                    LoadExp.getValue(1), DAG.getRoot());
-  DAG.setRoot(OutputChain);
-
-  Results.push_back(Call);
-  Results.push_back(LoadExp);
-}
-
 void SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
                                            RTLIB::Libcall LC,
                                            SmallVectorImpl<SDValue> &Results) {
@@ -4562,10 +4520,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     ExpandFPLibCall(Node, RTLIB::TANH_F32, RTLIB::TANH_F64, RTLIB::TANH_F80,
                     RTLIB::TANH_F128, RTLIB::TANH_PPCF128, Results);
     break;
-  case ISD::FSINCOS:
-    // Expand into sincos libcall.
-    (void)DAG.expandFSINCOS(Node, Results);
+  case ISD::FSINCOS: {
+    RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getValueType(0));
+    DAG.expandMultipleResultFPLibCall(LC, Node, Results);
     break;
+  }
   case ISD::FLOG:
   case ISD::STRICT_FLOG:
     ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80,
@@ -4649,7 +4608,8 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
                     RTLIB::LDEXP_F128, RTLIB::LDEXP_PPCF128, Results);
     break;
   case ISD::FFREXP: {
-    ExpandFrexpLibCall(Node, Results);
+    RTLIB::Libcall LC = RTLIB::getFREXP(Node->getValueType(0));
+    DAG.expandMultipleResultFPLibCall(LC, Node, Results, /*CallRetResNo=*/0);
     break;
   }
   case ISD::FPOWI:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 8403c98545187a..6630357410a99b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -27,6 +27,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -270,6 +271,15 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
   if (I != LegalizedNodes.end()) return I->second;
 
+  // Handle legalizing the root if it changes.
+  auto FixupRoot = make_scope_exit([&, OldRoot = DAG.getRoot()] {
+    SDValue Root = DAG.getRoot();
+    if (Root != OldRoot) {
+      if (SDValue LegalRoot = LegalizeOp(Root))
+        DAG.setRoot(LegalRoot);
+    }
+  });
+
   // Legalize the operands
   SmallVector<SDValue, 8> Ops;
   for (const SDValue &Oper : Op->op_values())
@@ -1192,11 +1202,13 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
 
     break;
-  case ISD::FSINCOS:
-    if (DAG.expandFSINCOS(Node, Results))
+  case ISD::FSINCOS: {
+    RTLIB::Libcall LC =
+        RTLIB::getFSINCOS(Node->getValueType(0).getVectorElementType());
+    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results))
       return;
-
     break;
+  }
   case ISD::VECTOR_COMPRESS:
     Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG));
     return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 166b6dbf46db87..610e274caaafe3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2484,13 +2484,12 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
   return Subvectors[0];
 }
 
-bool SelectionDAG::expandFSINCOS(SDNode *Node,
-                                 SmallVectorImpl<SDValue> &Results) {
+bool SelectionDAG::expandMultipleResultFPLibCall(
+    RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
+    std::optional<unsigned> CallRetResNo) {
+  LLVMContext &Ctx = *getContext();
   EVT VT = Node->getValueType(0);
-  LLVMContext *Ctx = getContext();
-  Type *Ty = VT.getTypeForEVT(*Ctx);
-  RTLIB::Libcall LC =
-      RTLIB::getFSINCOS(VT.isVector() ? VT.getVectorElementType() : VT);
+  unsigned NumResults = Node->getNumValues();
 
   const char *LCName = TLI->getLibcallName(LC);
   if (!LC || !LCName)
@@ -2506,6 +2505,7 @@ bool SelectionDAG::expandFSINCOS(SDNode *Node,
     return nullptr;
   };
 
+  // For vector types, we must find a vector mapping for the libcall.
   VecDesc const *VD = nullptr;
   if (VT.isVector() && !(VD = getVecDesc()))
     return false;
@@ -2513,71 +2513,103 @@ bool SelectionDAG::expandFSINCOS(SDNode *Node,
   // Find users of the node that store the results (and share input chains). The
   // destination pointers can be used instead of creating stack allocations.
   SDValue StoresInChain{};
-  std::array<StoreSDNode *, 2> ResultStores = {nullptr};
+  SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
   for (SDNode *User : Node->uses()) {
     if (!ISD::isNormalStore(User))
       continue;
     auto *ST = cast<StoreSDNode>(User);
-    if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
-        ST->getAlign() < getDataLayout().getABITypeAlign(Ty->getScalarType()) ||
+    SDValue StoreValue = ST->getValue();
+    unsigned ResNo = StoreValue.getResNo();
+    Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
+    if (CallRetResNo == ResNo || !ST->isSimple() ||
+        ST->getAddressSpace() != 0 ||
+        ST->getAlign() <
+            getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
         (StoresInChain && ST->getChain() != StoresInChain) ||
         Node->isPredecessorOf(ST->getChain().getNode()))
       continue;
-    ResultStores[ST->getValue().getResNo()] = ST;
+    ResultStores[ResNo] = ST;
     StoresInChain = ST->getChain();
   }
 
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry{};
 
-  // Pass the argument.
-  Entry.Node = Node->getOperand(0);
-  Entry.Ty = Ty;
-  Args.push_back(Entry);
+  // Pass the arguments (every operand of the node, in order).
+  for (const SDValue &Op : Node->op_values()) {
+    EVT ArgVT = Op.getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(Ctx);
+    Entry.Node = Op;
+    Entry.Ty = ArgTy;
+    Args.push_back(Entry);
+  }
 
-  // Pass the output pointers for sin and cos.
-  SmallVector<SDValue, 2> ResultPtrs{};
-  for (StoreSDNode *ST : ResultStores) {
-    SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(VT);
+  // Pass the output pointers.
+  SmallVector<SDValue, 2> ResultPtrs(NumResults);
+  for (auto [ResNo, ST] : llvm::enumerate(ResultStores)) {
+    if (ResNo == CallRetResNo)
+      continue;
+    EVT ResVT = Node->getValueType(ResNo);
+    SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(ResVT);
     Entry.Node = ResultPtr;
-    Entry.Ty = PointerType::getUnqual(Ty->getContext());
+    Entry.Ty = PointerType::getUnqual(Ctx);
+    ResultPtrs[ResNo] = ResultPtr;
     Args.push_back(Entry);
-    ResultPtrs.push_back(ResultPtr);
   }
 
   SDLoc DL(Node);
 
+  // Pass the vector mask (if required).
   if (VD && VD->isMasked()) {
-    EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *Ctx, VT);
+    EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT);
     Entry.Node = getBoolConstant(true, DL, MaskVT, VT);
-    Entry.Ty = MaskVT.getTypeForEVT(*Ctx);
+    Entry.Ty = MaskVT.getTypeForEVT(Ctx);
     Args.push_back(Entry);
   }
 
+  Type *RetType = CallRetResNo.has_value()
+                      ? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
+                      : Type::getVoidTy(Ctx);
   SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
   SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
                                      TLI->getPointerTy(getDataLayout()));
   TargetLowering::CallLoweringInfo CLI(*this);
   CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
-      TLI->getLibcallCallingConv(LC), Type::getVoidTy(*Ctx), Callee,
-      std::move(Args));
+      TLI->getLibcallCallingConv(LC), RetType, Callee, std::move(Args));
 
-  auto [Call, OutChain] = TLI->LowerCallTo(CLI);
+  auto [Call, CallChain] = TLI->LowerCallTo(CLI);
 
   for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
+    if (ResNo == CallRetResNo) {
+      Results.push_back(Call);
+      continue;
+    }
     MachinePointerInfo PtrInfo;
     if (StoreSDNode *ST = ResultStores[ResNo]) {
       // Replace store with the library call.
-      ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+      ReplaceAllUsesOfValueWith(SDValue(ST, 0), CallChain);
       PtrInfo = ST->getPointerInfo();
     } else {
       PtrInfo = MachinePointerInfo::getFixedStack(
           getMachineFunction(), cast<FrameIndexSDNode>(ResultPtr)->getIndex());
     }
-    SDValue LoadResult = getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
+    SDValue LoadResult =
+        getLoad(Node->getValueType(ResNo), DL, CallChain, ResultPtr, PtrInfo);
     Results.push_back(LoadResult);
   }
 
+  // FIXME: Find a way to avoid updating the root. This is needed for x86, which
+  // uses a floating-point stack. Suppose (for example) the node to be expanded
+  // has two results: one floating-point result returned by the call, and one
+  // integer result returned via an output pointer. If only the integer result
+  // is used then the `CopyFromReg` for the FP result may be optimized out. This
+  // prevents an FP stack pop from being emitted for it. Setting the root like
+  // this ensures there will be a use of the `CopyFromReg` chain, and ensures
+  // the FP pop will be emitted.
+  SDValue OutputChain =
+      getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
+  setRoot(OutputChain);
+
   return true;
 }
 
diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll
index cd560ad627de4c..96de34519556d0 100644
--- a/llvm/test/CodeGen/X86/llvm.frexp.ll
+++ b/llvm/test/CodeGen/X86/llvm.frexp.ll
@@ -325,28 +325,27 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) {
 ;
 ; WIN32-LABEL: test_frexp_v4f32_v4i32:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $60, %esp
+; WIN32-NEXT:    subl $44, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 24(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 20(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 16(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 28(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
@@ -361,22 +360,13 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) {
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %edi, 28(%esi)
-; WIN32-NEXT:    movl %edx, 24(%esi)
-; WIN32-NEXT:    movl %ecx, 20(%esi)
-; WIN32-NEXT:    movl %eax, 16(%esi)
 ; WIN32-NEXT:    fstps 12(%esi)
 ; WIN32-NEXT:    fstps 8(%esi)
 ; WIN32-NEXT:    fstps 4(%esi)
 ; WIN32-NEXT:    fstps (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    addl $60, %esp
+; WIN32-NEXT:    addl $44, %esp
 ; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    retl
   %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
   ret { <4 x float>, <4 x i32> } %result
@@ -499,46 +489,35 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) {
 ;
 ; WIN32-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
 ; WIN32:       # %bb.0:
-; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $28, %esp
+; WIN32-NEXT:    subl $12, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 8(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 4(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    leal 12(%esi), %eax
 ; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    flds {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    fstpl (%esp)
 ; WIN32-NEXT:    calll _frexp
 ; WIN32-NEXT:    fstp %st(0)
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT:    movl %edi, 12(%esi)
-; WIN32-NEXT:    movl %edx, 8(%esi)
-; WIN32-NEXT:    movl %ecx, 4(%esi)
-; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    addl $28, %esp
+; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    popl %esi
-; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    retl
   %result = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> %a)
   %result.1 = extractvalue { <4 x float>, <4 x i32> } %result, 1

>From b092e2b44fa7bc4142aa045b8748d3ae25b73855 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 4 Nov 2024 16:04:10 +0000
Subject: [PATCH 2/6] Update more tests

---
 llvm/test/CodeGen/PowerPC/f128-arith.ll |  32 ++--
 llvm/test/CodeGen/RISCV/llvm.frexp.ll   | 192 ++++++++++--------------
 2 files changed, 90 insertions(+), 134 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/f128-arith.ll b/llvm/test/CodeGen/PowerPC/f128-arith.ll
index 35e5d61947ead7..decc4a38f7ccd4 100644
--- a/llvm/test/CodeGen/PowerPC/f128-arith.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-arith.ll
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
 ; CHECK-LABEL: qpFREXP:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    mflr r0
-; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    stdu r1, -32(r1)
+; CHECK-NEXT:    std r0, 48(r1)
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset lr, 16
-; CHECK-NEXT:    .cfi_offset r30, -16
-; CHECK-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    stdu r1, -64(r1)
-; CHECK-NEXT:    std r0, 80(r1)
-; CHECK-NEXT:    addi r5, r1, 44
-; CHECK-NEXT:    mr r30, r4
 ; CHECK-NEXT:    lxv v2, 0(r3)
+; CHECK-NEXT:    mr r5, r4
 ; CHECK-NEXT:    bl frexpf128
 ; CHECK-NEXT:    nop
-; CHECK-NEXT:    lwz r3, 44(r1)
-; CHECK-NEXT:    stw r3, 0(r30)
-; CHECK-NEXT:    addi r1, r1, 64
+; CHECK-NEXT:    addi r1, r1, 32
 ; CHECK-NEXT:    ld r0, 16(r1)
-; CHECK-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-NEXT:    mtlr r0
 ; CHECK-NEXT:    blr
 ;
 ; CHECK-P8-LABEL: qpFREXP:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-P8-NEXT:    stdu r1, -32(r1)
+; CHECK-P8-NEXT:    std r0, 48(r1)
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    stdu r1, -64(r1)
-; CHECK-P8-NEXT:    std r0, 80(r1)
-; CHECK-P8-NEXT:    addi r5, r1, 44
-; CHECK-P8-NEXT:    mr r30, r4
 ; CHECK-P8-NEXT:    lxvd2x vs0, 0, r3
+; CHECK-P8-NEXT:    mr r5, r4
 ; CHECK-P8-NEXT:    xxswapd v2, vs0
 ; CHECK-P8-NEXT:    bl frexpf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    lwz r3, 44(r1)
-; CHECK-P8-NEXT:    stw r3, 0(r30)
-; CHECK-P8-NEXT:    addi r1, r1, 64
+; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index e85a7118f5ff83..3f615d23d3eaf6 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -543,50 +543,42 @@ define i32 @test_frexp_f32_i32_only_use_exp(float %a) nounwind {
 define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwind {
 ; RV32IFD-LABEL: test_frexp_v4f32_v4i32:
 ; RV32IFD:       # %bb.0:
-; RV32IFD-NEXT:    addi sp, sp, -64
-; RV32IFD-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs0, 48(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs1, 40(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs2, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs3, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    addi sp, sp, -48
+; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs0, 32(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs1, 24(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs2, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs3, 8(sp) # 8-byte Folded Spill
 ; RV32IFD-NEXT:    fmv.s fs0, fa3
 ; RV32IFD-NEXT:    fmv.s fs1, fa2
 ; RV32IFD-NEXT:    fmv.s fs2, fa1
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    addi a0, sp, 8
+; RV32IFD-NEXT:    addi a0, a0, 16
 ; RV32IFD-NEXT:    call frexpf
 ; RV32IFD-NEXT:    fmv.s fs3, fa0
-; RV32IFD-NEXT:    addi a0, sp, 12
+; RV32IFD-NEXT:    addi a0, s0, 20
 ; RV32IFD-NEXT:    fmv.s fa0, fs2
 ; RV32IFD-NEXT:    call frexpf
 ; RV32IFD-NEXT:    fmv.s fs2, fa0
-; RV32IFD-NEXT:    addi a0, sp, 16
+; RV32IFD-NEXT:    addi a0, s0, 24
 ; RV32IFD-NEXT:    fmv.s fa0, fs1
 ; RV32IFD-NEXT:    call frexpf
 ; RV32IFD-NEXT:    fmv.s fs1, fa0
-; RV32IFD-NEXT:    addi a0, sp, 20
+; RV32IFD-NEXT:    addi a0, s0, 28
 ; RV32IFD-NEXT:    fmv.s fa0, fs0
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    lw a0, 8(sp)
-; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 16(sp)
-; RV32IFD-NEXT:    lw a3, 20(sp)
-; RV32IFD-NEXT:    sw a0, 16(s0)
-; RV32IFD-NEXT:    sw a1, 20(s0)
-; RV32IFD-NEXT:    sw a2, 24(s0)
-; RV32IFD-NEXT:    sw a3, 28(s0)
 ; RV32IFD-NEXT:    fsw fs3, 0(s0)
 ; RV32IFD-NEXT:    fsw fs2, 4(s0)
 ; RV32IFD-NEXT:    fsw fs1, 8(s0)
 ; RV32IFD-NEXT:    fsw fa0, 12(s0)
-; RV32IFD-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    fld fs0, 48(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs1, 40(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs2, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs3, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    addi sp, sp, 64
+; RV32IFD-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    fld fs0, 32(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs1, 24(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs2, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs3, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    addi sp, sp, 48
 ; RV32IFD-NEXT:    ret
 ;
 ; RV64IFD-LABEL: test_frexp_v4f32_v4i32:
@@ -639,52 +631,44 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ;
 ; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
 ; RV32IZFINXZDINX:       # %bb.0:
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32IZFINXZDINX-NEXT:    mv s0, a4
 ; RV32IZFINXZDINX-NEXT:    mv s1, a3
 ; RV32IZFINXZDINX-NEXT:    mv s2, a2
 ; RV32IZFINXZDINX-NEXT:    mv a2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 8
+; RV32IZFINXZDINX-NEXT:    addi a1, a0, 16
 ; RV32IZFINXZDINX-NEXT:    mv a0, a2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    mv s4, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 12
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 20
 ; RV32IZFINXZDINX-NEXT:    mv a0, s2
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    mv s2, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 16
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 24
 ; RV32IZFINXZDINX-NEXT:    mv a0, s1
 ; RV32IZFINXZDINX-NEXT:    call frexpf
 ; RV32IZFINXZDINX-NEXT:    mv s1, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 28
 ; RV32IZFINXZDINX-NEXT:    mv a0, s0
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    lw a1, 8(sp)
-; RV32IZFINXZDINX-NEXT:    lw a2, 12(sp)
-; RV32IZFINXZDINX-NEXT:    lw a3, 16(sp)
-; RV32IZFINXZDINX-NEXT:    lw a4, 20(sp)
-; RV32IZFINXZDINX-NEXT:    sw a1, 16(s3)
-; RV32IZFINXZDINX-NEXT:    sw a2, 20(s3)
-; RV32IZFINXZDINX-NEXT:    sw a3, 24(s3)
-; RV32IZFINXZDINX-NEXT:    sw a4, 28(s3)
 ; RV32IZFINXZDINX-NEXT:    sw s4, 0(s3)
 ; RV32IZFINXZDINX-NEXT:    sw s2, 4(s3)
 ; RV32IZFINXZDINX-NEXT:    sw s1, 8(s3)
 ; RV32IZFINXZDINX-NEXT:    sw a0, 12(s3)
-; RV32IZFINXZDINX-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, 32
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32:
@@ -1096,41 +1080,34 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
 ; RV32IFD:       # %bb.0:
-; RV32IFD-NEXT:    addi sp, sp, -48
-; RV32IFD-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs0, 32(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs1, 24(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fsd fs2, 16(sp) # 8-byte Folded Spill
-; RV32IFD-NEXT:    fmv.s fs0, fa3
-; RV32IFD-NEXT:    fmv.s fs1, fa2
-; RV32IFD-NEXT:    fmv.s fs2, fa1
+; RV32IFD-NEXT:    addi sp, sp, -32
+; RV32IFD-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs0, 16(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs1, 8(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fsd fs2, 0(sp) # 8-byte Folded Spill
+; RV32IFD-NEXT:    fmv.s fs0, fa2
+; RV32IFD-NEXT:    fmv.s fs1, fa1
+; RV32IFD-NEXT:    fmv.s fs2, fa0
 ; RV32IFD-NEXT:    mv s0, a0
-; RV32IFD-NEXT:    mv a0, sp
+; RV32IFD-NEXT:    addi a0, a0, 12
+; RV32IFD-NEXT:    fmv.s fa0, fa3
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    addi a0, sp, 4
-; RV32IFD-NEXT:    fmv.s fa0, fs2
+; RV32IFD-NEXT:    addi a0, s0, 8
+; RV32IFD-NEXT:    fmv.s fa0, fs0
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    addi a0, sp, 8
+; RV32IFD-NEXT:    addi a0, s0, 4
 ; RV32IFD-NEXT:    fmv.s fa0, fs1
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    addi a0, sp, 12
-; RV32IFD-NEXT:    fmv.s fa0, fs0
+; RV32IFD-NEXT:    fmv.s fa0, fs2
+; RV32IFD-NEXT:    mv a0, s0
 ; RV32IFD-NEXT:    call frexpf
-; RV32IFD-NEXT:    lw a0, 0(sp)
-; RV32IFD-NEXT:    lw a1, 4(sp)
-; RV32IFD-NEXT:    lw a2, 8(sp)
-; RV32IFD-NEXT:    lw a3, 12(sp)
-; RV32IFD-NEXT:    sw a0, 0(s0)
-; RV32IFD-NEXT:    sw a1, 4(s0)
-; RV32IFD-NEXT:    sw a2, 8(s0)
-; RV32IFD-NEXT:    sw a3, 12(s0)
-; RV32IFD-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IFD-NEXT:    fld fs0, 32(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs1, 24(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    fld fs2, 16(sp) # 8-byte Folded Reload
-; RV32IFD-NEXT:    addi sp, sp, 48
+; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    fld fs0, 16(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs1, 8(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    fld fs2, 0(sp) # 8-byte Folded Reload
+; RV32IFD-NEXT:    addi sp, sp, 32
 ; RV32IFD-NEXT:    ret
 ;
 ; RV64IFD-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
@@ -1174,43 +1151,34 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ;
 ; RV32IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:
 ; RV32IZFINXZDINX:       # %bb.0:
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, -48
-; RV32IZFINXZDINX-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32IZFINXZDINX-NEXT:    mv s0, a4
-; RV32IZFINXZDINX-NEXT:    mv s1, a3
-; RV32IZFINXZDINX-NEXT:    mv s2, a2
-; RV32IZFINXZDINX-NEXT:    mv a2, a1
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, -32
+; RV32IZFINXZDINX-NEXT:    sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    mv s0, a3
+; RV32IZFINXZDINX-NEXT:    mv s1, a2
+; RV32IZFINXZDINX-NEXT:    mv s2, a1
 ; RV32IZFINXZDINX-NEXT:    mv s3, a0
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 12
-; RV32IZFINXZDINX-NEXT:    mv a0, a2
+; RV32IZFINXZDINX-NEXT:    addi a1, a0, 12
+; RV32IZFINXZDINX-NEXT:    mv a0, a4
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 16
-; RV32IZFINXZDINX-NEXT:    mv a0, s2
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 8
+; RV32IZFINXZDINX-NEXT:    mv a0, s0
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 20
+; RV32IZFINXZDINX-NEXT:    addi a1, s3, 4
 ; RV32IZFINXZDINX-NEXT:    mv a0, s1
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    addi a1, sp, 24
-; RV32IZFINXZDINX-NEXT:    mv a0, s0
+; RV32IZFINXZDINX-NEXT:    mv a0, s2
+; RV32IZFINXZDINX-NEXT:    mv a1, s3
 ; RV32IZFINXZDINX-NEXT:    call frexpf
-; RV32IZFINXZDINX-NEXT:    lw a0, 12(sp)
-; RV32IZFINXZDINX-NEXT:    lw a1, 16(sp)
-; RV32IZFINXZDINX-NEXT:    lw a2, 20(sp)
-; RV32IZFINXZDINX-NEXT:    lw a3, 24(sp)
-; RV32IZFINXZDINX-NEXT:    sw a0, 0(s3)
-; RV32IZFINXZDINX-NEXT:    sw a1, 4(s3)
-; RV32IZFINXZDINX-NEXT:    sw a2, 8(s3)
-; RV32IZFINXZDINX-NEXT:    sw a3, 12(s3)
-; RV32IZFINXZDINX-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
-; RV32IZFINXZDINX-NEXT:    addi sp, sp, 48
+; RV32IZFINXZDINX-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, 32
 ; RV32IZFINXZDINX-NEXT:    ret
 ;
 ; RV64IZFINXZDINX-LABEL: test_frexp_v4f32_v4i32_only_use_exp:

>From d234e52f91c067d3a1a5ea4a282e8e58c6942a5f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 4 Nov 2024 20:45:57 +0000
Subject: [PATCH 3/6] Use callback

---
 llvm/include/llvm/CodeGen/SelectionDAG.h           | 11 +++++++----
 .../lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 14 +++-----------
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp     |  5 +++--
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 9035aa3ea31278..03624336febb0c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/ilist.h"
@@ -1600,10 +1601,12 @@ class SelectionDAG {
   /// libcall is expected to take all the operands of the \p Node followed by
   /// output pointers for each of the results. \p CallRetResNo can be optionally
   /// set to indicate that one of the results comes from the libcall's return
-  /// value.
-  bool expandMultipleResultFPLibCall(RTLIB::Libcall LC, SDNode *Node,
-                                     SmallVectorImpl<SDValue> &Results,
-                                     std::optional<unsigned> CallRetResNo = {});
+  /// value. The optional \p LegalizeOp callback can be set to legalize
+  /// non-result values created by this expansion.
+  bool expandMultipleResultFPLibCall(
+      RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
+      std::optional<unsigned> CallRetResNo = {},
+      function_ref<SDValue(SDValue)> LegalizeOp = nullptr);
 
   /// Expand the specified \c ISD::VAARG node as the Legalize pass would.
   SDValue expandVAArg(SDNode *Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 6630357410a99b..80559cec386952 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -27,7 +27,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -271,15 +270,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
   if (I != LegalizedNodes.end()) return I->second;
 
-  // Handle legalizing the root if it changes.
-  auto FixupRoot = make_scope_exit([&, OldRoot = DAG.getRoot()] {
-    SDValue Root = DAG.getRoot();
-    if (Root != OldRoot) {
-      if (SDValue LegalRoot = LegalizeOp(Root))
-        DAG.setRoot(LegalRoot);
-    }
-  });
-
   // Legalize the operands
   SmallVector<SDValue, 8> Ops;
   for (const SDValue &Oper : Op->op_values())
@@ -1205,7 +1195,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::FSINCOS: {
     RTLIB::Libcall LC =
         RTLIB::getFSINCOS(Node->getValueType(0).getVectorElementType());
-    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results))
+    if (DAG.expandMultipleResultFPLibCall(
+            LC, Node, Results, /*CallRetResNo=*/std::nullopt,
+            [this](SDValue Op) { return LegalizeOp(Op); }))
       return;
     break;
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 610e274caaafe3..bf9f1d444ac8fb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2486,7 +2486,8 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
 
 bool SelectionDAG::expandMultipleResultFPLibCall(
     RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
-    std::optional<unsigned> CallRetResNo) {
+    std::optional<unsigned> CallRetResNo,
+    function_ref<SDValue(SDValue)> LegalizeOp) {
   LLVMContext &Ctx = *getContext();
   EVT VT = Node->getValueType(0);
   unsigned NumResults = Node->getNumValues();
@@ -2608,7 +2609,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
   // ensures the FP pop will be emitted.
   SDValue OutputChain =
       getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
-  setRoot(OutputChain);
+  setRoot(LegalizeOp ? LegalizeOp(OutputChain) : OutputChain);
 
   return true;
 }

>From b86b1781984f81d63f2eb13473ab242d0c1f4b79 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 5 Nov 2024 08:26:20 +0000
Subject: [PATCH 4/6] Remove callback

---
 llvm/include/llvm/CodeGen/SelectionDAG.h       | 11 ++++-------
 .../CodeGen/SelectionDAG/LegalizeVectorOps.cpp |  4 +---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++++++++++--------
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 03624336febb0c..9035aa3ea31278 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -18,7 +18,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/ilist.h"
@@ -1601,12 +1600,10 @@ class SelectionDAG {
   /// libcall is expected to take all the operands of the \p Node followed by
   /// output pointers for each of the results. \p CallRetResNo can be optionally
   /// set to indicate that one of the results comes from the libcall's return
-  /// value. The optional \p LegalizeOp callback can be set to legalize
-  /// non-result values created by this expansion.
-  bool expandMultipleResultFPLibCall(
-      RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
-      std::optional<unsigned> CallRetResNo = {},
-      function_ref<SDValue(SDValue)> LegalizeOp = nullptr);
+  /// value.
+  bool expandMultipleResultFPLibCall(RTLIB::Libcall LC, SDNode *Node,
+                                     SmallVectorImpl<SDValue> &Results,
+                                     std::optional<unsigned> CallRetResNo = {});
 
   /// Expand the specified \c ISD::VAARG node as the Legalize pass would.
   SDValue expandVAArg(SDNode *Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 80559cec386952..db21e708970648 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1195,9 +1195,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::FSINCOS: {
     RTLIB::Libcall LC =
         RTLIB::getFSINCOS(Node->getValueType(0).getVectorElementType());
-    if (DAG.expandMultipleResultFPLibCall(
-            LC, Node, Results, /*CallRetResNo=*/std::nullopt,
-            [this](SDValue Op) { return LegalizeOp(Op); }))
+    if (DAG.expandMultipleResultFPLibCall(LC, Node, Results))
       return;
     break;
   }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bf9f1d444ac8fb..47d64440d19578 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2486,8 +2486,7 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
 
 bool SelectionDAG::expandMultipleResultFPLibCall(
     RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
-    std::optional<unsigned> CallRetResNo,
-    function_ref<SDValue(SDValue)> LegalizeOp) {
+    std::optional<unsigned> CallRetResNo) {
   LLVMContext &Ctx = *getContext();
   EVT VT = Node->getValueType(0);
   unsigned NumResults = Node->getNumValues();
@@ -2602,14 +2601,17 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
   // FIXME: Find a way to avoid updating the root. This is needed for x86, which
   // uses a floating-point stack. If (for example) the node to be expanded has
   // two results one floating-point which is returned by the call, and one
-  // integer result, set returned via an output pointer. If only the integer
-  // result is used then the `CopyFromReg` for the FP result may be optimized
-  // out. This prevents an FP stack pop from being emitted for it. Setting the
-  // root like this ensures there will be a use of the `CopyFromReg` chain, and
-  // ensures the FP pop will be emitted.
+  // integer result, returned via an output pointer. If only the integer result
+  // is used then the `CopyFromReg` for the FP result may be optimized out. This
+  // prevents an FP stack pop from being emitted for it. Setting the root like
+  // this ensures there will be a use of the `CopyFromReg` chain, and ensures
+  // the FP pop will be emitted.
   SDValue OutputChain =
       getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
-  setRoot(LegalizeOp ? LegalizeOp(OutputChain) : OutputChain);
+  setRoot(OutputChain);
+
+  // Ensure the new root is reachable from the results.
+  Results[0] = getMergeValues({Results[0], OutputChain}, DL);
 
   return true;
 }

>From 4638ce5b440e166371f80ac4b1565d10b1523a49 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 5 Nov 2024 13:38:53 +0000
Subject: [PATCH 5/6] Fixups

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp |  9 ++++++--
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 21 ++++++++++---------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 6c160e6e90a1fb..1480bd98c685e1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4522,7 +4522,9 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   case ISD::FSINCOS: {
     RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getValueType(0));
-    DAG.expandMultipleResultFPLibCall(LC, Node, Results);
+    bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results);
+    if (!Expanded)
+      llvm_unreachable("Expected scalar FSINCOS to expand to libcall!");
     break;
   }
   case ISD::FLOG:
@@ -4609,7 +4611,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     break;
   case ISD::FFREXP: {
     RTLIB::Libcall LC = RTLIB::getFREXP(Node->getValueType(0));
-    DAG.expandMultipleResultFPLibCall(LC, Node, Results, /*CallRetResNo=*/0);
+    bool Expanded = DAG.expandMultipleResultFPLibCall(LC, Node, Results,
+                                                      /*CallRetResNo=*/0);
+    if (!Expanded)
+      llvm_unreachable("Expected scalar FFREXP to expand to libcall!");
     break;
   }
   case ISD::FPOWI:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 47d64440d19578..1beb9087830b99 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2533,28 +2533,30 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
   }
 
   TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry{};
+  auto AddArgListEntry = [&](SDValue Node, Type *Ty) {
+    TargetLowering::ArgListEntry Entry{};
+    Entry.Ty = Ty;
+    Entry.Node = Node;
+    Args.push_back(Entry);
+  };
 
   // Pass the arguments.
   for (const SDValue &Op : Node->op_values()) {
     EVT ArgVT = Op.getValueType();
     Type *ArgTy = ArgVT.getTypeForEVT(Ctx);
-    Entry.Node = Node->getOperand(0);
-    Entry.Ty = ArgTy;
-    Args.push_back(Entry);
+    AddArgListEntry(Op, ArgTy);
   }
 
   // Pass the output pointers.
   SmallVector<SDValue, 2> ResultPtrs(NumResults);
+  Type *PointerTy = PointerType::getUnqual(Ctx);
   for (auto [ResNo, ST] : llvm::enumerate(ResultStores)) {
     if (ResNo == CallRetResNo)
       continue;
     EVT ResVT = Node->getValueType(ResNo);
     SDValue ResultPtr = ST ? ST->getBasePtr() : CreateStackTemporary(ResVT);
-    Entry.Node = ResultPtr;
-    Entry.Ty = PointerType::getUnqual(Ctx);
     ResultPtrs[ResNo] = ResultPtr;
-    Args.push_back(Entry);
+    AddArgListEntry(ResultPtr, PointerTy);
   }
 
   SDLoc DL(Node);
@@ -2562,9 +2564,8 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
   // Pass the vector mask (if required).
   if (VD && VD->isMasked()) {
     EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), Ctx, VT);
-    Entry.Node = getBoolConstant(true, DL, MaskVT, VT);
-    Entry.Ty = MaskVT.getTypeForEVT(Ctx);
-    Args.push_back(Entry);
+    SDValue Mask = getBoolConstant(true, DL, MaskVT, VT);
+    AddArgListEntry(Mask, MaskVT.getTypeForEVT(Ctx));
   }
 
   Type *RetType = CallRetResNo.has_value()

>From e620edfa3961b55e59f18bc376f1b1e7367bf4cc Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 5 Nov 2024 15:34:23 +0000
Subject: [PATCH 6/6] Make root update conditional

---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1beb9087830b99..1659f7c506a9b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2599,20 +2599,21 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
     Results.push_back(LoadResult);
   }
 
-  // FIXME: Find a way to avoid updating the root. This is needed for x86, which
-  // uses a floating-point stack. If (for example) the node to be expanded has
-  // two results one floating-point which is returned by the call, and one
-  // integer result, returned via an output pointer. If only the integer result
-  // is used then the `CopyFromReg` for the FP result may be optimized out. This
-  // prevents an FP stack pop from being emitted for it. Setting the root like
-  // this ensures there will be a use of the `CopyFromReg` chain, and ensures
-  // the FP pop will be emitted.
-  SDValue OutputChain =
-      getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
-  setRoot(OutputChain);
-
-  // Ensure the new root is reachable from the results.
-  Results[0] = getMergeValues({Results[0], OutputChain}, DL);
+  if (CallRetResNo && !Node->hasAnyUseOfValue(*CallRetResNo)) {
+    // FIXME: Find a way to avoid updating the root. This is needed for x86,
+    // which uses a floating-point stack. Suppose (for example) the node to be
+    // expanded has two results: one floating-point, returned by the call, and
+    // one integer, returned via an output pointer. If only the integer result
+    // is used then the `CopyFromReg` for the FP result may be optimized out.
+    // This prevents an FP stack pop from being emitted for it. Setting the root
+    // like this ensures there will be a use of the `CopyFromReg` chain, and
+    // ensures the FP pop will be emitted.
+    SDValue NewRoot =
+        getNode(ISD::TokenFactor, DL, MVT::Other, getRoot(), CallChain);
+    setRoot(NewRoot);
+    // Ensure the new root is reachable from the results.
+    Results[0] = getMergeValues({Results[0], NewRoot}, DL);
+  }
 
   return true;
 }