[llvm] [SDAG] Avoid creating redundant stack slots when lowering FSINCOS (PR #108401)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 12 11:49:25 PDT 2024
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/108401
>From ce4e988ab90a064a6d6c5202a75f9226491e75e0 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 13:52:24 +0000
Subject: [PATCH 1/3] Precommit test for sincos stack slots
---
.../CodeGen/AArch64/sincos-stack-slots.ll | 152 ++++++++++++++++++
1 file changed, 152 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
new file mode 100644
index 00000000000000..9c362ba117fefa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define { float, float } @sincos_f32_value_return(float %x) {
+; CHECK-LABEL: sincos_f32_value_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp s1, s0, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ %ret_0 = insertvalue { float, float } poison, float %sin, 0
+ %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1
+ ret { float, float } %ret_1
+}
+
+define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_f32_ptr_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp s1, s0, [sp, #8]
+; CHECK-NEXT: str s0, [x20]
+; CHECK-NEXT: str s1, [x19]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ store float %sin, ptr %out_sin, align 4
+ store float %cos, ptr %out_cos, align 4
+ ret void
+}
+
+define float @sincos_f32_mixed_return(float %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f32_mixed_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp s0, s1, [sp, #8]
+; CHECK-NEXT: str s1, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ store float %sin, ptr %out_sin, align 4
+ ret float %cos
+}
+
+define { double, double } @sincos_f64_value_return(double %x) {
+; CHECK-LABEL: sincos_f64_value_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldr d0, [sp, #24]
+; CHECK-NEXT: ldr d1, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ %ret_0 = insertvalue { double, double } poison, double %sin, 0
+ %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1
+ ret { double, double } %ret_1
+}
+
+define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_f64_ptr_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldr d0, [sp, #24]
+; CHECK-NEXT: ldr d1, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: str d0, [x20]
+; CHECK-NEXT: str d1, [x19]
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ store double %sin, ptr %out_sin, align 4
+ store double %cos, ptr %out_cos, align 4
+ ret void
+}
+
+define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f64_mixed_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: add x0, sp, #8
+; CHECK-NEXT: mov x1, sp
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldp d0, d1, [sp]
+; CHECK-NEXT: str d1, [x19]
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ store double %sin, ptr %out_sin, align 4
+ ret double %cos
+}
>From 8033c5caf3a98504ced4f312d843ca5a5baa3ddd Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 13:55:25 +0000
Subject: [PATCH 2/3] [SDAG] Avoid creating redundant stack slots when lowering
FSINCOS
When lowering `FSINCOS` to a library call (that takes output pointers)
we can avoid creating new stack allocations if the results of the
`FSINCOS` are being stored. Instead, we can take the destination
pointers from the stores and pass those to the library call.
---
.../include/llvm/CodeGen/RuntimeLibcallUtil.h | 4 +
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 99 +++++++++----------
llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 +
.../CodeGen/AArch64/sincos-stack-slots.ll | 73 ++++----------
4 files changed, 75 insertions(+), 106 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 7a131645893921..045ec7d3653119 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT);
/// UNKNOWN_LIBCALL if there is none.
Libcall getFREXP(EVT RetVT);
+/// getFSINCOS - Return the SINCOS_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFSINCOS(EVT RetVT);
+
/// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
/// UNKNOWN_LIBCALL if there is none.
Libcall getSYNC(unsigned Opc, MVT VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f5fbc01cd95e96..7b0dc63c473d25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
/// Return true if sincos libcall is available.
static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
- RTLIB::Libcall LC;
- switch (Node->getSimpleValueType(0).SimpleTy) {
- default: llvm_unreachable("Unexpected request for libcall!");
- case MVT::f32: LC = RTLIB::SINCOS_F32; break;
- case MVT::f64: LC = RTLIB::SINCOS_F64; break;
- case MVT::f80: LC = RTLIB::SINCOS_F80; break;
- case MVT::f128: LC = RTLIB::SINCOS_F128; break;
- case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
- }
+ RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy);
return TLI.getLibcallName(LC) != nullptr;
}
@@ -2355,68 +2347,73 @@ static bool useSinCos(SDNode *Node) {
}
/// Issue libcalls to sincos to compute sin / cos pairs.
-void
-SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
- SmallVectorImpl<SDValue> &Results) {
- RTLIB::Libcall LC;
- switch (Node->getSimpleValueType(0).SimpleTy) {
- default: llvm_unreachable("Unexpected request for libcall!");
- case MVT::f32: LC = RTLIB::SINCOS_F32; break;
- case MVT::f64: LC = RTLIB::SINCOS_F64; break;
- case MVT::f80: LC = RTLIB::SINCOS_F80; break;
- case MVT::f128: LC = RTLIB::SINCOS_F128; break;
- case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
- }
-
- // The input chain to this libcall is the entry node of the function.
- // Legalizing the call will automatically add the previous call to the
- // dependence.
- SDValue InChain = DAG.getEntryNode();
-
+void SelectionDAGLegalize::ExpandSinCosLibCall(
+ SDNode *Node, SmallVectorImpl<SDValue> &Results) {
EVT RetVT = Node->getValueType(0);
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
+ TargetLowering::ArgListEntry Entry{};
+
+ // Find users of the node that store the results. The destination pointers
+ // can be used instead of creating stack allocations.
+ StoreSDNode *SinST = nullptr;
+ StoreSDNode *CosST = nullptr;
+ for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ SDNode *User = Use.getUser();
+ if (!ISD::isNormalStore(User))
+ continue;
+ auto *ST = cast<StoreSDNode>(User);
+ if (Use.getResNo() == 0)
+ SinST = ST;
+ if (Use.getResNo() == 1)
+ CosST = ST;
+ }
// Pass the argument.
Entry.Node = Node->getOperand(0);
Entry.Ty = RetTy;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
Args.push_back(Entry);
+ auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
+ if (MaybeStore)
+ return std::make_pair(MaybeStore->getBasePtr(),
+ MaybeStore->getPointerInfo());
+ SDValue StackSlot = DAG.CreateStackTemporary(RetVT);
+ auto PtrInfo = MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ cast<FrameIndexSDNode>(StackSlot)->getIndex());
+ return std::make_pair(StackSlot, PtrInfo);
+ };
+
// Pass the return address of sin.
- SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
- Entry.Node = SinPtr;
+ auto SinPtr = GetOrCreateOutPointer(SinST);
+ Entry.Node = SinPtr.first;
Entry.Ty = PointerType::getUnqual(RetTy->getContext());
- Entry.IsSExt = false;
- Entry.IsZExt = false;
Args.push_back(Entry);
// Also pass the return address of the cos.
- SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
- Entry.Node = CosPtr;
+ auto CosPtr = GetOrCreateOutPointer(CosST);
+ Entry.Node = CosPtr.first;
Entry.Ty = PointerType::getUnqual(RetTy->getContext());
- Entry.IsSExt = false;
- Entry.IsZExt = false;
Args.push_back(Entry);
- SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy(DAG.getDataLayout()));
-
- SDLoc dl(Node);
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
- TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
- std::move(Args));
+ RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
+ auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
- std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+ // Replace explict stores with the library call.
+ for (StoreSDNode *ST : {SinST, CosST}) {
+ if (ST)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain);
+ }
- Results.push_back(
- DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
- Results.push_back(
- DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
+ SDLoc DL(Node);
+ for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
+ SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo);
+ Results.push_back(LoadExp);
+ }
}
SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index eb3190c7cd247a..8e5a0a0ca06082 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -398,6 +398,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) {
FREXP_PPCF128);
}
+RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) {
+ return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128,
+ SINCOS_PPCF128);
+}
+
RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4],
AtomicOrdering Order,
uint64_t MemSize) {
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index 9c362ba117fefa..afd054a83a5014 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -24,22 +24,11 @@ entry:
define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) {
; CHECK-LABEL: sincos_f32_ptr_return:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: mov x19, x1
-; CHECK-NEXT: mov x20, x0
-; CHECK-NEXT: add x0, sp, #12
-; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl sincosf
-; CHECK-NEXT: ldp s1, s0, [sp, #8]
-; CHECK-NEXT: str s0, [x20]
-; CHECK-NEXT: str s1, [x19]
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%sin = tail call float @llvm.sin.f32(float %x)
@@ -52,19 +41,13 @@ entry:
define float @sincos_f32_mixed_return(float %x, ptr %out_sin) {
; CHECK-LABEL: sincos_f32_mixed_return:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #12
-; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: add x1, sp, #12
; CHECK-NEXT: bl sincosf
-; CHECK-NEXT: ldp s0, s1, [sp, #8]
-; CHECK-NEXT: str s1, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%sin = tail call float @llvm.sin.f32(float %x)
@@ -99,25 +82,11 @@ entry:
define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
; CHECK-LABEL: sincos_f64_ptr_return:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #48
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: mov x19, x1
-; CHECK-NEXT: mov x20, x0
-; CHECK-NEXT: add x0, sp, #24
-; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl sincos
-; CHECK-NEXT: ldr d0, [sp, #24]
-; CHECK-NEXT: ldr d1, [sp, #8]
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: str d0, [x20]
-; CHECK-NEXT: str d1, [x19]
-; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%sin = tail call double @llvm.sin.f64(double %x)
@@ -130,19 +99,13 @@ entry:
define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
; CHECK-LABEL: sincos_f64_mixed_return:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: mov x1, sp
+; CHECK-NEXT: add x1, sp, #8
; CHECK-NEXT: bl sincos
-; CHECK-NEXT: ldp d0, d1, [sp]
-; CHECK-NEXT: str d1, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
%sin = tail call double @llvm.sin.f64(double %x)
>From f46d132f1f1d383a5dc838e256c344ce43d5e52d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 12 Sep 2024 18:16:46 +0000
Subject: [PATCH 3/3] Fixups
---
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 49 ++++++---
.../CodeGen/AArch64/sincos-stack-slots.ll | 99 ++++++++++++++++++-
2 files changed, 131 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7b0dc63c473d25..aa91d064db48e7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2351,9 +2351,7 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
SDNode *Node, SmallVectorImpl<SDValue> &Results) {
EVT RetVT = Node->getValueType(0);
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
-
- TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry{};
+ RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
// Find users of the node that store the results. The destination pointers
// can be used instead of creating stack allocations.
@@ -2366,17 +2364,15 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
if (!ISD::isNormalStore(User))
continue;
auto *ST = cast<StoreSDNode>(User);
+ if (!ST->isSimple() || ST->getPointerInfo().getAddrSpace() != 0 ||
+ ST->getAlign() < DAG.getDataLayout().getABITypeAlign(RetTy))
+ continue;
if (Use.getResNo() == 0)
SinST = ST;
if (Use.getResNo() == 1)
CosST = ST;
}
- // Pass the argument.
- Entry.Node = Node->getOperand(0);
- Entry.Ty = RetTy;
- Args.push_back(Entry);
-
auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
if (MaybeStore)
return std::make_pair(MaybeStore->getBasePtr(),
@@ -2388,6 +2384,14 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
return std::make_pair(StackSlot, PtrInfo);
};
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry{};
+
+ // Pass the argument.
+ Entry.Node = Node->getOperand(0);
+ Entry.Ty = RetTy;
+ Args.push_back(Entry);
+
// Pass the return address of sin.
auto SinPtr = GetOrCreateOutPointer(SinST);
Entry.Node = SinPtr.first;
@@ -2400,18 +2404,35 @@ void SelectionDAGLegalize::ExpandSinCosLibCall(
Entry.Ty = PointerType::getUnqual(RetTy->getContext());
Args.push_back(Entry);
- RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
- auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
-
- // Replace explict stores with the library call.
+ // Combine any input chains from the stores.
+ SmallVector<SDValue, 2> InChains{};
for (StoreSDNode *ST : {SinST, CosST}) {
if (ST)
- DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain);
+ InChains.push_back(ST->getChain());
}
+ if (InChains.empty())
+ InChains.push_back(DAG.getEntryNode());
SDLoc DL(Node);
+ SDValue InChain = DAG.getTokenFactor(DL, InChains);
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(InChain).setLibCallee(
+ TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+ std::move(Args));
+
+ auto [Call, OutChain] = TLI.LowerCallTo(CLI);
+
+ // Replace the stores with the library call.
+ for (StoreSDNode *ST : {SinST, CosST}) {
+ if (!ST)
+ continue;
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain);
+ }
+
for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
- SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo);
+ SDValue LoadExp = DAG.getLoad(RetVT, DL, OutChain, Ptr, PtrInfo);
Results.push_back(LoadExp);
}
}
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
index afd054a83a5014..c32ac58a6a851e 100644
--- a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+; This file tests eliding stack slots when lowering the FSINCOS ISD node.
+
define { float, float } @sincos_f32_value_return(float %x) {
; CHECK-LABEL: sincos_f32_value_return:
; CHECK: // %bb.0: // %entry
@@ -91,8 +93,8 @@ define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
entry:
%sin = tail call double @llvm.sin.f64(double %x)
%cos = tail call double @llvm.cos.f64(double %x)
- store double %sin, ptr %out_sin, align 4
- store double %cos, ptr %out_cos, align 4
+ store double %sin, ptr %out_sin, align 8
+ store double %cos, ptr %out_cos, align 8
ret void
}
@@ -110,6 +112,97 @@ define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
entry:
%sin = tail call double @llvm.sin.f64(double %x)
%cos = tail call double @llvm.cos.f64(double %x)
- store double %sin, ptr %out_sin, align 4
+ store double %sin, ptr %out_sin, align 8
ret double %cos
}
+
+; Negative test. We can't fold volatile stores into the library call.
+define void @sincos_volatile_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_volatile_result_stores:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp s1, s0, [sp, #8]
+; CHECK-NEXT: str s0, [x20]
+; CHECK-NEXT: str s1, [x19]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ store volatile float %sin, ptr %out_sin, align 4
+ store volatile float %cos, ptr %out_cos, align 4
+ ret void
+}
+
+; Negative test. We can't fold atomic stores into the library call.
+define void @sincos_atomic_result_stores(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_atomic_result_stores:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldr w8, [sp, #12]
+; CHECK-NEXT: str w8, [x20]
+; CHECK-NEXT: ldr w8, [sp, #8]
+; CHECK-NEXT: str w8, [x19]
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ store atomic float %sin, ptr %out_sin unordered, align 4
+ store atomic float %cos, ptr %out_cos unordered, align 4
+ ret void
+}
+
+; Negative test. We can't fold misaligned stores into the library call.
+define void @sincos_misaligned_result_stores(double %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_misaligned_result_stores:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: mov x19, x1
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldr d0, [sp, #24]
+; CHECK-NEXT: ldr d1, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: str d0, [x20]
+; CHECK-NEXT: str d1, [x19]
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ store double %sin, ptr %out_sin, align 4
+ store double %cos, ptr %out_cos, align 4
+ ret void
+}
More information about the llvm-commits
mailing list