[llvm] [SDAG] Avoid creating redundant stack slots when lowering FSINCOS (PR #108401)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 17 06:09:27 PDT 2024


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/108401

From adb2f4282d553a52b72c905fa220fc3093daa57b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 13:52:24 +0000
Subject: [PATCH 1/6] Precommit test for sincos stack slots

---
 .../CodeGen/AArch64/sincos-stack-slots.ll     | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sincos-stack-slots.ll

diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
new file mode 100644
index 00000000000000..9c362ba117fefa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define { float, float } @sincos_f32_value_return(float %x) {
+; CHECK-LABEL: sincos_f32_value_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  %ret_0 = insertvalue { float, float } poison, float %sin, 0
+  %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1
+  ret { float, float } %ret_1
+}
+
+define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_f32_ptr_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    str s0, [x20]
+; CHECK-NEXT:    str s1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store float %sin, ptr %out_sin, align 4
+  store float %cos, ptr %out_cos, align 4
+  ret void
+}
+
+define float @sincos_f32_mixed_return(float %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f32_mixed_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s0, s1, [sp, #8]
+; CHECK-NEXT:    str s1, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store float %sin, ptr %out_sin, align 4
+  ret float %cos
+}
+
+define { double, double } @sincos_f64_value_return(double %x) {
+; CHECK-LABEL: sincos_f64_value_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  %ret_0 = insertvalue { double, double } poison, double %sin, 0
+  %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1
+  ret { double, double } %ret_1
+}
+
+define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_f64_ptr_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str d0, [x20]
+; CHECK-NEXT:    str d1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 4
+  store double %cos, ptr %out_cos, align 4
+  ret void
+}
+
+define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f64_mixed_return:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    add x0, sp, #8
+; CHECK-NEXT:    mov x1, sp
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldp d0, d1, [sp]
+; CHECK-NEXT:    str d1, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 4
+  ret double %cos
+}

From c7decfcc4ad2bca873d9d73506d17d115b45e4be Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 13:55:25 +0000
Subject: [PATCH 2/6] [SDAG] Avoid creating redundant stack slots when lowering
 FSINCOS

When lowering `FSINCOS` to a library call (which takes output pointers),
we can avoid creating new stack allocations if the results of the
`FSINCOS` are being stored. Instead, we can take the destination
pointers from those stores and pass them to the library call.
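
For illustration, here is a rough C++ sketch of the effect (not taken from
the patch; the wrapper function names are made up, and sincosf is the libm
routine the AArch64 tests call):

  // GNU libm extension used by the generated code on AArch64 Linux.
  extern "C" void sincosf(float x, float *sinp, float *cosp);

  // Before: legalization created two stack temporaries, called sincosf
  // into them, then copied the results to the real destinations.
  void lower_sincos_before(float x, float *out_sin, float *out_cos) {
    float tmp_sin, tmp_cos;         // stack slots from CreateStackTemporary
    sincosf(x, &tmp_sin, &tmp_cos); // libcall writes to the temporaries
    *out_sin = tmp_sin;             // extra load + store per result
    *out_cos = tmp_cos;
  }

  // After: when each FSINCOS result only feeds a normal store, the store's
  // destination pointer is passed to the libcall directly, so no stack
  // slots or copies are needed.
  void lower_sincos_after(float x, float *out_sin, float *out_cos) {
    sincosf(x, out_sin, out_cos);
  }

This matches the test diff below for sincos_f32_ptr_return, where the extra
frame setup and the post-call str instructions disappear.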
---
 .../include/llvm/CodeGen/RuntimeLibcallUtil.h |  4 +
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 99 +++++++++----------
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  5 +
 .../CodeGen/AArch64/sincos-stack-slots.ll     | 73 ++++----------
 4 files changed, 75 insertions(+), 106 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 7a131645893921..045ec7d3653119 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT);
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getFREXP(EVT RetVT);
 
+/// getFSINCOS - Return the FSINCOS_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFSINCOS(EVT RetVT);
+
 /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
 /// UNKNOWN_LIBCALL if there is none.
 Libcall getSYNC(unsigned Opc, MVT VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f5fbc01cd95e96..7b0dc63c473d25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
 
 /// Return true if sincos libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
-  RTLIB::Libcall LC;
-  switch (Node->getSimpleValueType(0).SimpleTy) {
-  default: llvm_unreachable("Unexpected request for libcall!");
-  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
-  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
-  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
-  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
-  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
-  }
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy);
   return TLI.getLibcallName(LC) != nullptr;
 }
 
@@ -2355,68 +2347,73 @@ static bool useSinCos(SDNode *Node) {
 }
 
 /// Issue libcalls to sincos to compute sin / cos pairs.
-void
-SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
-                                          SmallVectorImpl<SDValue> &Results) {
-  RTLIB::Libcall LC;
-  switch (Node->getSimpleValueType(0).SimpleTy) {
-  default: llvm_unreachable("Unexpected request for libcall!");
-  case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
-  case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
-  case MVT::f80:     LC = RTLIB::SINCOS_F80; break;
-  case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
-  case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
-  }
-
-  // The input chain to this libcall is the entry node of the function.
-  // Legalizing the call will automatically add the previous call to the
-  // dependence.
-  SDValue InChain = DAG.getEntryNode();
-
+void SelectionDAGLegalize::ExpandSinCosLibCall(
+    SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   EVT RetVT = Node->getValueType(0);
   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
 
   TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
+  TargetLowering::ArgListEntry Entry{};
+
+  // Find users of the node that store the results. The destination pointers
+  // can be used instead of creating stack allocations.
+  StoreSDNode *SinST = nullptr;
+  StoreSDNode *CosST = nullptr;
+  for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
+       UI != UE; ++UI) {
+    SDUse &Use = UI.getUse();
+    SDNode *User = Use.getUser();
+    if (!ISD::isNormalStore(User))
+      continue;
+    auto *ST = cast<StoreSDNode>(User);
+    if (Use.getResNo() == 0)
+      SinST = ST;
+    if (Use.getResNo() == 1)
+      CosST = ST;
+  }
 
   // Pass the argument.
   Entry.Node = Node->getOperand(0);
   Entry.Ty = RetTy;
-  Entry.IsSExt = false;
-  Entry.IsZExt = false;
   Args.push_back(Entry);
 
+  auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
+    if (MaybeStore)
+      return std::make_pair(MaybeStore->getBasePtr(),
+                            MaybeStore->getPointerInfo());
+    SDValue StackSlot = DAG.CreateStackTemporary(RetVT);
+    auto PtrInfo = MachinePointerInfo::getFixedStack(
+        DAG.getMachineFunction(),
+        cast<FrameIndexSDNode>(StackSlot)->getIndex());
+    return std::make_pair(StackSlot, PtrInfo);
+  };
+
   // Pass the return address of sin.
-  SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
-  Entry.Node = SinPtr;
+  auto SinPtr = GetOrCreateOutPointer(SinST);
+  Entry.Node = SinPtr.first;
   Entry.Ty = PointerType::getUnqual(RetTy->getContext());
-  Entry.IsSExt = false;
-  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   // Also pass the return address of the cos.
-  SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
-  Entry.Node = CosPtr;
+  auto CosPtr = GetOrCreateOutPointer(CosST);
+  Entry.Node = CosPtr.first;
   Entry.Ty = PointerType::getUnqual(RetTy->getContext());
-  Entry.IsSExt = false;
-  Entry.IsZExt = false;
   Args.push_back(Entry);
 
-  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
-                                         TLI.getPointerTy(DAG.getDataLayout()));
-
-  SDLoc dl(Node);
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
-      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
-      std::move(Args));
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
+  auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
 
-  std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+  // Replace explicit stores with the library call.
+  for (StoreSDNode *ST : {SinST, CosST}) {
+    if (ST)
+      DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain);
+  }
 
-  Results.push_back(
-      DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
-  Results.push_back(
-      DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
+  SDLoc DL(Node);
+  for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
+    SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo);
+    Results.push_back(LoadExp);
+  }
 }
 
 SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index eb3190c7cd247a..8e5a0a0ca06082 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -398,6 +398,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) {
                       FREXP_PPCF128);
 }
 
+RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) {
+  return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128,
+                      SINCOS_PPCF128);
+}
+
 RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4],
                                              AtomicOrdering Order,
                                              uint64_t MemSize) {
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index 9c362ba117fefa..afd054a83a5014 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -24,22 +24,11 @@ entry:
 define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) {
 ; CHECK-LABEL: sincos_f32_ptr_return:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    add x0, sp, #12
-; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    bl sincosf
-; CHECK-NEXT:    ldp s1, s0, [sp, #8]
-; CHECK-NEXT:    str s0, [x20]
-; CHECK-NEXT:    str s1, [x19]
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %sin = tail call float @llvm.sin.f32(float %x)
@@ -52,19 +41,13 @@ entry:
 define float @sincos_f32_mixed_return(float %x, ptr %out_sin) {
 ; CHECK-LABEL: sincos_f32_mixed_return:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    add x0, sp, #12
-; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    add x1, sp, #12
 ; CHECK-NEXT:    bl sincosf
-; CHECK-NEXT:    ldp s0, s1, [sp, #8]
-; CHECK-NEXT:    str s1, [x19]
-; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ldr s0, [sp, #12]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %sin = tail call float @llvm.sin.f32(float %x)
@@ -99,25 +82,11 @@ entry:
 define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
 ; CHECK-LABEL: sincos_f64_ptr_return:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #48
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
-; CHECK-NEXT:    mov x19, x1
-; CHECK-NEXT:    mov x20, x0
-; CHECK-NEXT:    add x0, sp, #24
-; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    bl sincos
-; CHECK-NEXT:    ldr d0, [sp, #24]
-; CHECK-NEXT:    ldr d1, [sp, #8]
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    str d0, [x20]
-; CHECK-NEXT:    str d1, [x19]
-; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %sin = tail call double @llvm.sin.f64(double %x)
@@ -130,19 +99,13 @@ entry:
 define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
 ; CHECK-LABEL: sincos_f64_mixed_return:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    add x0, sp, #8
-; CHECK-NEXT:    mov x1, sp
+; CHECK-NEXT:    add x1, sp, #8
 ; CHECK-NEXT:    bl sincos
-; CHECK-NEXT:    ldp d0, d1, [sp]
-; CHECK-NEXT:    str d1, [x19]
-; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    ldr d0, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %sin = tail call double @llvm.sin.f64(double %x)

From 37f4aee204d56ffecf597203122bca66b40d60fd Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 18:16:46 +0000
Subject: [PATCH 3/6] Fixups

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 49 ++++++---
 .../CodeGen/AArch64/sincos-stack-slots.ll     | 99 ++++++++++++++++++-
 2 files changed, 131 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7b0dc63c473d25..aa91d064db48e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2351,9 +2351,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   EVT RetVT = Node->getValueType(0);
   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
-
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry{};
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
 
   // Find users of the node that store the results. The destination pointers
   // can be used instead of creating stack allocations.
@@ -2366,17 +2364,15 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     if (!ISD::isNormalStore(User))
       continue;
     auto *ST = cast<StoreSDNode>(User);
+    if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 ||
+        ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy))
+      continue;
     if (Use.getResNo() == 0)
       SinST = ST;
     if (Use.getResNo() == 1)
       CosST = ST;
   }
 
-  // Pass the argument.
-  Entry.Node = Node->getOperand(0);
-  Entry.Ty = RetTy;
-  Args.push_back(Entry);
-
   auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
     if (MaybeStore)
       return std::make_pair(MaybeStore->getBasePtr(),
@@ -2388,6 +2384,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     return std::make_pair(StackSlot, PtrInfo);
   };
 
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry{};
+
+  // Pass the argument.
+  Entry.Node = Node->getOperand(0);
+  Entry.Ty = RetTy;
+  Args.push_back(Entry);
+
   // Pass the return address of sin.
   auto SinPtr = GetOrCreateOutPointer(SinST);
   Entry.Node = SinPtr.first;
@@ -2400,18 +2404,35 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
   Entry.Ty = PointerType::getUnqual(RetTy->getContext());
   Args.push_back(Entry);
 
-  RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
-  auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
-
-  // Replace explicit stores with the library call.
+  // Combine any input chains from the stores.
+  SmallVector<SDValue, 2> InChains{};
   for (StoreSDNode *ST : {SinST, CosST}) {
     if (ST)
-      DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain);
+      InChains.push_back(ST->getChain());
   }
+  if (InChains.empty())
+    InChains.push_back(DAG.getEntryNode());
 
   SDLoc DL(Node);
+  SDValue InChain = DAG.getTokenFactor(DL, InChains);
+  SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                         TLI.getPointerTy(DAG.getDataLayout()));
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
+      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+      std::move(Args));
+
+  auto [Call, OutChain] = TLI.LowerCallTo(CLI);
+
+  // Replace the stores with the library call.
+  for (StoreSDNode *ST : {SinST, CosST}) {
+    if (!ST)
+      continue;
+    DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+  }
+
   for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
-    SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo);
+    SDValue LoadExp = DAG.getLoad(RetVT, DL, OutChain, Ptr, PtrInfo);
     Results.push_back(LoadExp);
   }
 }
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index afd054a83a5014..c32ac58a6a851e 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
 
+; This file tests eliding stack slots when lowering the FSINCOS ISD node.
+
 define { float, float } @sincos_f32_value_return(float %x) {
 ; CHECK-LABEL: sincos_f32_value_return:
 ; CHECK:       // %bb.0: // %entry
@@ -91,8 +93,8 @@ define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
 entry:
   %sin = tail call double @llvm.sin.f64(double %x)
   %cos = tail call double @llvm.cos.f64(double %x)
-  store double %sin, ptr %out_sin, align 4
-  store double %cos, ptr %out_cos, align 4
+  store double %sin, ptr %out_sin, align 8
+  store double %cos, ptr %out_cos, align 8
   ret void
 }
 
@@ -110,6 +112,97 @@ define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
 entry:
   %sin = tail call double @llvm.sin.f64(double %x)
   %cos = tail call double @llvm.cos.f64(double %x)
-  store double %sin, ptr %out_sin, align 4
+  store double %sin, ptr %out_sin, align 8
   ret double %cos
 }
+
+; Negative test. We can't fold volatile stores into the library call.
+define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: negative_fold_sincos_volatile_store:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldp s1, s0, [sp, #8]
+; CHECK-NEXT:    str s0, [x20]
+; CHECK-NEXT:    str s1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store volatile float %sin, ptr %out_sin, align 4
+  store volatile float %cos, ptr %out_cos, align 4
+  ret void
+}
+
+; Negative test. We can't fold atomic stores into the library call.
+define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: negative_fold_sincos_atomic_store:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #12
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincosf
+; CHECK-NEXT:    ldr w8, [sp, #12]
+; CHECK-NEXT:    str w8, [x20]
+; CHECK-NEXT:    ldr w8, [sp, #8]
+; CHECK-NEXT:    str w8, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call float @llvm.sin.f32(float %x)
+  %cos = tail call float @llvm.cos.f32(float %x)
+  store atomic float %sin, ptr %out_sin unordered, align 4
+  store atomic float %cos, ptr %out_cos unordered, align 4
+  ret void
+}
+
+; Negative test. We can't fold misaligned stores into the library call.
+define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: negative_sincos_bad_alignment:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    add x0, sp, #24
+; CHECK-NEXT:    add x1, sp, #8
+; CHECK-NEXT:    bl sincos
+; CHECK-NEXT:    ldr d0, [sp, #24]
+; CHECK-NEXT:    ldr d1, [sp, #8]
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str d0, [x20]
+; CHECK-NEXT:    str d1, [x19]
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+entry:
+  %sin = tail call double @llvm.sin.f64(double %x)
+  %cos = tail call double @llvm.cos.f64(double %x)
+  store double %sin, ptr %out_sin, align 4
+  store double %cos, ptr %out_cos, align 4
+  ret void
+}

From 09dfce4018bc50e68b63f74e916de9a5217ffd13 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 20:43:05 +0000
Subject: [PATCH 4/6] Avoid cycles in chains

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 83 ++++++++-----------
 .../CodeGen/AArch64/sincos-stack-slots.ll     |  6 +-
 2 files changed, 39 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index aa91d064db48e7..583859deb7ee7d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2355,8 +2355,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
 
   // Find users of the node that store the results. The destination pointers
   // can be used instead of creating stack allocations.
-  StoreSDNode *SinST = nullptr;
-  StoreSDNode *CosST = nullptr;
+  std::array<StoreSDNode *, 2> ResultStores = {nullptr};
   for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
        UI != UE; ++UI) {
     SDUse &Use = UI.getUse();
@@ -2367,22 +2366,18 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 ||
         ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy))
       continue;
-    if (Use.getResNo() == 0)
-      SinST = ST;
-    if (Use.getResNo() == 1)
-      CosST = ST;
-  }
-
-  auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
-    if (MaybeStore)
-      return std::make_pair(MaybeStore->getBasePtr(),
-                            MaybeStore->getPointerInfo());
-    SDValue StackSlot = DAG.CreateStackTemporary(RetVT);
-    auto PtrInfo = MachinePointerInfo::getFixedStack(
-        DAG.getMachineFunction(),
-        cast<FrameIndexSDNode>(StackSlot)->getIndex());
-    return std::make_pair(StackSlot, PtrInfo);
-  };
+    ResultStores[Use.getResNo()] = ST;
+  }
+
+  // Collect input chains (and avoid chains referring to one of the stores).
+  SmallVector<SDValue> InChains;
+  for (auto [ResNum, ST] : llvm::enumerate(ResultStores)) {
+    unsigned OtherResNum = ResNum == 0 ? 1 : 0;
+    if (ST && ST->getChain().getNode() != ResultStores[OtherResNum])
+      InChains.push_back(ST->getChain());
+  }
+  if (InChains.empty())
+    InChains.push_back(DAG.getEntryNode());
 
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry{};
@@ -2392,28 +2387,19 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
   Entry.Ty = RetTy;
   Args.push_back(Entry);
 
-  // Pass the return address of sin.
-  auto SinPtr = GetOrCreateOutPointer(SinST);
-  Entry.Node = SinPtr.first;
-  Entry.Ty = PointerType::getUnqual(RetTy->getContext());
-  Args.push_back(Entry);
-
-  // Also pass the return address of the cos.
-  auto CosPtr = GetOrCreateOutPointer(CosST);
-  Entry.Node = CosPtr.first;
-  Entry.Ty = PointerType::getUnqual(RetTy->getContext());
-  Args.push_back(Entry);
-
-  // Combine any input chains from the stores.
-  SmallVector<SDValue, 2> InChains{};
-  for (StoreSDNode *ST : {SinST, CosST}) {
-    if (ST)
-      InChains.push_back(ST->getChain());
+  // Pass the output pointers for sin and cos.
+  SmallVector<SDValue, 2> ResultPtrs{};
+  for (StoreSDNode *ST : ResultStores) {
+    SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(RetVT);
+    Entry.Node = ResultPtr;
+    Entry.Ty = PointerType::getUnqual(RetTy->getContext());
+    Args.push_back(Entry);
+    ResultPtrs.push_back(ResultPtr);
   }
-  if (InChains.empty())
-    InChains.push_back(DAG.getEntryNode());
 
   SDLoc DL(Node);
+
+  // Combine any input chains from the stores.
   SDValue InChain = DAG.getTokenFactor(DL, InChains);
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                          TLI.getPointerTy(DAG.getDataLayout()));
@@ -2424,16 +2410,19 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
 
   auto [Call, OutChain] = TLI.LowerCallTo(CLI);
 
-  // Replace the stores with the library call.
-  for (StoreSDNode *ST : {SinST, CosST}) {
-    if (!ST)
-      continue;
-    DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
-  }
-
-  for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
-    SDValue LoadExp = DAG.getLoad(RetVT, DL, OutChain, Ptr, PtrInfo);
-    Results.push_back(LoadExp);
+  for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) {
+    MachinePointerInfo PtrInfo;
+    if (StoreSDNode *ST = ResultStores[ResNo]) {
+      // Replace store with the library call.
+      DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+      PtrInfo = ST->getPointerInfo();
+    } else {
+      PtrInfo = MachinePointerInfo::getFixedStack(
+          DAG.getMachineFunction(),
+          cast<FrameIndexSDNode>(ResultPtr)->getIndex());
+    }
+    SDValue LoadResult = DAG.getLoad(RetVT, DL, OutChain, ResultPtr, PtrInfo);
+    Results.push_back(LoadResult);
   }
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index c32ac58a6a851e..697e9c3444f86a 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -118,7 +118,7 @@ entry:
 
 ; Negative test. We can't fold volatile stores into the library call.
 define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
-; CHECK-LABEL: negative_fold_sincos_volatile_store:
+; CHECK-LABEL: sincos_volatile_result_stores:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
@@ -147,7 +147,7 @@ entry:
 
 ; Negative test. We can't fold atomic stores into the library call.
 define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
-; CHECK-LABEL: negative_fold_sincos_atomic_store:
+; CHECK-LABEL: sincos_atomic_result_stores:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
@@ -177,7 +177,7 @@ entry:
 
 ; Negative test. We can't fold misaligned stores into the library call.
 define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) {
-; CHECK-LABEL: negative_sincos_bad_alignment:
+; CHECK-LABEL: sincos_misaligned_result_stores:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #48
 ; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill

From 74113f0b7cb9bc121ef5e4667da6972acee55f0c Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 16 Sep 2024 10:45:53 +0000
Subject: [PATCH 5/6] Fixups

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 583859deb7ee7d..e4a2cf5b382991 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2349,9 +2349,9 @@ static bool useSinCos(SDNode *Node) {
 /// Issue libcalls to sincos to compute sin / cos pairs.
 void SelectionDAGLegalize::ExpandSinCosLibCall(
     SDNode *Node, SmallVectorImpl<SDValue> &Results) {
-  EVT RetVT = Node->getValueType(0);
-  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
-  RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
+  EVT VT = Node->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+  RTLIB::Libcall LC = RTLIB::getFSINCOS(VT);
 
   // Find users of the node that store the results. The destination pointers
   // can be used instead of creating stack allocations.
@@ -2363,14 +2363,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
     if (!ISD::isNormalStore(User))
       continue;
     auto *ST = cast<StoreSDNode>(User);
-    if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 ||
-        ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy))
+    if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
+        ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty))
       continue;
     ResultStores[Use.getResNo()] = ST;
   }
 
   // Collect input chains (and avoid chains referring to one of the stores).
-  SmallVector<SDValue> InChains;
+  SmallVector<SDValue, 2> InChains;
   for (auto [ResNum, ST] : llvm::enumerate(ResultStores)) {
     unsigned OtherResNum = ResNum == 0 ? 1 : 0;
     if (ST && ST->getChain().getNode() != ResultStores[OtherResNum])
@@ -2384,15 +2384,15 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
 
   // Pass the argument.
   Entry.Node = Node->getOperand(0);
-  Entry.Ty = RetTy;
+  Entry.Ty = Ty;
   Args.push_back(Entry);
 
   // Pass the output pointers for sin and cos.
   SmallVector<SDValue, 2> ResultPtrs{};
   for (StoreSDNode *ST : ResultStores) {
-    SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(RetVT);
+    SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT);
     Entry.Node = ResultPtr;
-    Entry.Ty = PointerType::getUnqual(RetTy->getContext());
+    Entry.Ty = PointerType::getUnqual(Ty->getContext());
     Args.push_back(Entry);
     ResultPtrs.push_back(ResultPtr);
   }
@@ -2421,7 +2421,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
           DAG.getMachineFunction(),
           cast<FrameIndexSDNode>(ResultPtr)->getIndex());
     }
-    SDValue LoadResult = DAG.getLoad(RetVT, DL, OutChain, ResultPtr, PtrInfo);
+    SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo);
     Results.push_back(LoadResult);
   }
 }

From 451e4994b135df108b90904c3f1f51cac0fa5327 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 17 Sep 2024 13:09:01 +0000
Subject: [PATCH 6/6] Use for-range loop

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index e4a2cf5b382991..5b02218fc24721 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2356,17 +2356,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
   // Find users of the node that store the results. The destination pointers
   // can be used instead of creating stack allocations.
   std::array<StoreSDNode *, 2> ResultStores = {nullptr};
-  for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
-       UI != UE; ++UI) {
-    SDUse &Use = UI.getUse();
-    SDNode *User = Use.getUser();
+  for (SDNode *User : Node->uses()) {
     if (!ISD::isNormalStore(User))
       continue;
     auto *ST = cast<StoreSDNode>(User);
     if (!ST->isSimple() || ST->getAddressSpace() != 0 ||
         ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty))
       continue;
-    ResultStores[Use.getResNo()] = ST;
+    ResultStores[ST->getValue().getResNo()] = ST;
   }
 
   // Collect input chains (and avoid chains referring to one of the stores).


