[llvm] 436b875 - [SDAG] avoid libcalls to fmin/fmax for soft-float targets
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 30 08:22:16 PDT 2022
Author: Sanjay Patel
Date: 2022-03-30T11:22:03-04:00
New Revision: 436b875e49ec05f24f7d7660a82fc5c5780221a0
URL: https://github.com/llvm/llvm-project/commit/436b875e49ec05f24f7d7660a82fc5c5780221a0
DIFF: https://github.com/llvm/llvm-project/commit/436b875e49ec05f24f7d7660a82fc5c5780221a0.diff
LOG: [SDAG] avoid libcalls to fmin/fmax for soft-float targets
This is an extension of D70965 to avoid creating a mathlib
call where it did not exist in the original source. Also see
D70852 for discussion about an alternative proposal that was
abandoned.
In the motivating bug report:
https://github.com/llvm/llvm-project/issues/54554
...we also have a more general issue about handling "no-builtin" options.
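For reference, a reduced illustration (hypothetical IR, not taken from the report): the function below contains no math-library call, but InstCombine canonicalizes the fcmp+select pattern into @llvm.maxnum.f32, which the soft-float legalizer previously softened into a call to fmaxf:

    define float @max_select(float %x, float %y) {
      ; no libm call anywhere in the original code
      %cmp = fcmp nnan ogt float %x, %y
      %max = select nnan i1 %cmp, float %x, float %y
      ret float %max
    }

With this change, the nnan flag on the canonicalized node lets legalization expand it back to a compare/select, so no new libm dependency is introduced.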
Differential Revision: https://reviews.llvm.org/D122610
Added:

Modified:
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
    llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
    llvm/test/CodeGen/RISCV/fmax-fmin.ll

Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7d77caea808a6..677007d35de41 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4481,6 +4481,13 @@ class TargetLowering : public TargetLoweringBase {
return SDValue();
}
+ /// Try to convert the fminnum/fmaxnum to a compare/select sequence. This is
+ /// required for correctness since InstCombine might have canonicalized a
+ /// fcmp+select sequence to a FMINNUM/FMAXNUM intrinsic. If we were to fall
+ /// through to the default expansion/soften to libcall, we might introduce a
+ /// link-time dependency on libm into a file that originally did not have one.
+ SDValue createSelectForFMINNUM_FMAXNUM(SDNode *Node, SelectionDAG &DAG) const;
+
/// Return a reciprocal estimate value for the input operand.
/// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
/// 'Enabled' as set by a potential default override attribute.
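When the no-NaNs flag is present, the new hook rewrites the node as a SELECT_CC: FMAXNUM uses SETGT, FMINNUM uses SETLT, and the no-signed-zeros flag is added because it is implied by FMINNUM/FMAXNUM semantics. An IR-level sketch of the equivalent rewrite (illustrative values, not the exact DAG):

    %r = call nnan float @llvm.maxnum.f32(float %a, float %b)
    ; becomes, conceptually:
    %gt = fcmp nnan nsz ogt float %a, %b
    %r2 = select nnan nsz i1 %gt, float %a, float %b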
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6bf38d7296a8d..12f4118ff9bc9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -273,6 +273,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
+ if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG))
+ return SoftenFloatRes_SELECT_CC(SelCC.getNode());
return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
RTLIB::FMIN_F32,
RTLIB::FMIN_F64,
@@ -282,6 +284,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
+ if (SDValue SelCC = TLI.createSelectForFMINNUM_FMAXNUM(N, DAG))
+ return SoftenFloatRes_SELECT_CC(SelCC.getNode());
return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
RTLIB::FMAX_F32,
RTLIB::FMAX_F64,
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0da7edcc084d1..4190b13cd4968 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7259,6 +7259,30 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
return true;
}
+SDValue
+TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
+ SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getOpcode();
+ assert((Opcode == ISD::FMINNUM || Opcode == ISD::FMAXNUM ||
+ Opcode == ISD::STRICT_FMINNUM || Opcode == ISD::STRICT_FMAXNUM) &&
+ "Wrong opcode");
+
+ if (Node->getFlags().hasNoNaNs()) {
+ ISD::CondCode Pred = Opcode == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT;
+ SDValue Op1 = Node->getOperand(0);
+ SDValue Op2 = Node->getOperand(1);
+ SDValue SelCC = DAG.getSelectCC(SDLoc(Node), Op1, Op2, Op1, Op2, Pred);
+ // Copy FMF flags, but always set the no-signed-zeros flag
+ // as this is implied by the FMINNUM/FMAXNUM semantics.
+ SDNodeFlags Flags = Node->getFlags();
+ Flags.setNoSignedZeros(true);
+ SelCC->setFlags(Flags);
+ return SelCC;
+ }
+
+ return SDValue();
+}
+
SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
SelectionDAG &DAG) const {
SDLoc dl(Node);
@@ -7301,25 +7325,8 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
}
}
- // If none of the above worked, but there are no NaNs, then expand to
- // a compare/select sequence. This is required for correctness since
- // InstCombine might have canonicalized a fcmp+select sequence to a
- // FMINNUM/FMAXNUM node. If we were to fall through to the default
- // expansion to libcall, we might introduce a link-time dependency
- // on libm into a file that originally did not have one.
- if (Node->getFlags().hasNoNaNs()) {
- ISD::CondCode Pred =
- Node->getOpcode() == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT;
- SDValue Op1 = Node->getOperand(0);
- SDValue Op2 = Node->getOperand(1);
- SDValue SelCC = DAG.getSelectCC(dl, Op1, Op2, Op1, Op2, Pred);
- // Copy FMF flags, but always set the no-signed-zeros flag
- // as this is implied by the FMINNUM/FMAXNUM semantics.
- SDNodeFlags Flags = Node->getFlags();
- Flags.setNoSignedZeros(true);
- SelCC->setFlags(Flags);
+ if (SDValue SelCC = createSelectForFMINNUM_FMAXNUM(Node, DAG))
return SelCC;
- }
return SDValue();
}
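On soft-float targets, the compare in that select sequence is then softened to a comparison routine from the compiler runtime (__aeabi_fcmpgt/__aeabi_fcmplt and __aeabi_dcmpgt/__aeabi_dcmplt on ARM EABI; __gtsf2, __gtdf2, __gttf2 and the __lt* variants elsewhere), as the updated tests below show. A rough sketch of a softened f32 maxnum (the bitcasts and names are illustrative):

    %a.i = bitcast float %a to i32
    %b.i = bitcast float %b to i32
    %c = call i32 @__gtsf2(i32 %a.i, i32 %b.i)  ; returns > 0 iff %a > %b
    %gt = icmp sgt i32 %c, 0
    %r.i = select i1 %gt, i32 %a.i, i32 %b.i

These routines live in libgcc/compiler-rt, so they do not reintroduce a link-time dependency on libm.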
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
index 48968ee7ba771..8cfcdbd3b4467 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmax-legalization-soft-float.ll
@@ -9,33 +9,44 @@ declare fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128>)
define half @test_v4f16(<4 x half> %a) nounwind {
; CHECK-LABEL: test_v4f16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: mov r4, #255
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: orr r4, r4, #65280
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: and r0, r3, r4
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: and r0, r5, r4
-; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: and r0, r7, r4
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: mov r9, #255
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: orr r9, r9, #65280
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: and r0, r0, r9
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: bl __aeabi_h2f
; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: and r0, r6, r4
+; CHECK-NEXT: and r0, r5, r9
; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r1, r0
+; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: bl fmaxf
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: bl fmaxf
-; CHECK-NEXT: mov r1, r8
-; CHECK-NEXT: bl fmaxf
+; CHECK-NEXT: bl __aeabi_fcmpgt
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: and r0, r6, r9
+; CHECK-NEXT: bl __aeabi_h2f
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: movne r5, r7
+; CHECK-NEXT: mov r1, r6
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_fcmpgt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: and r0, r8, r9
+; CHECK-NEXT: moveq r5, r6
+; CHECK-NEXT: bl __aeabi_h2f
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: bl __aeabi_fcmpgt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: moveq r5, r4
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: bl __aeabi_f2h
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
ret half %b
@@ -44,16 +55,27 @@ define half @test_v4f16(<4 x half> %a) nounwind {
define float @test_v4f32(<4 x float> %a) nounwind {
; CHECK-LABEL: test_v4f32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r11, lr}
-; CHECK-NEXT: push {r4, r5, r11, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: bl fmaxf
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: bl fmaxf
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: bl __aeabi_fcmpgt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r1, r6
+; CHECK-NEXT: movne r5, r7
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_fcmpgt
+; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl fmaxf
-; CHECK-NEXT: pop {r4, r5, r11, lr}
+; CHECK-NEXT: moveq r5, r6
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_fcmpgt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: moveq r5, r4
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
ret float %b
@@ -62,10 +84,26 @@ define float @test_v4f32(<4 x float> %a) nounwind {
define double @test_v2f64(<2 x double> %a) nounwind {
; CHECK-LABEL: test_v2f64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r11, lr}
-; CHECK-NEXT: push {r11, lr}
-; CHECK-NEXT: bl fmax
-; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: bl __aeabi_dcmpgt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r5, r6
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: movne r5, r7
+; CHECK-NEXT: bl __aeabi_dcmpgt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: movne r4, r8
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
ret double %b
@@ -74,21 +112,65 @@ define double @test_v2f64(<2 x double> %a) nounwind {
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-LABEL: test_v2f128:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r11, lr}
-; CHECK-NEXT: push {r11, lr}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: ldr r12, [sp, #36]
-; CHECK-NEXT: str r12, [sp, #12]
-; CHECK-NEXT: ldr r12, [sp, #32]
-; CHECK-NEXT: str r12, [sp, #8]
-; CHECK-NEXT: ldr r12, [sp, #28]
-; CHECK-NEXT: str r12, [sp, #4]
-; CHECK-NEXT: ldr r12, [sp, #24]
-; CHECK-NEXT: str r12, [sp]
-; CHECK-NEXT: bl fmaxl
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #28
+; CHECK-NEXT: sub sp, sp, #28
+; CHECK-NEXT: ldr r5, [sp, #76]
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: ldr r6, [sp, #72]
+; CHECK-NEXT: mov r9, r2
+; CHECK-NEXT: ldr r4, [sp, #68]
+; CHECK-NEXT: mov r10, r1
+; CHECK-NEXT: ldr r7, [sp, #64]
+; CHECK-NEXT: mov r11, r0
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: str r6, [sp, #8]
+; CHECK-NEXT: str r4, [sp, #4]
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: bl __gttf2
+; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r1, r10
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: stmib sp, {r4, r6}
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: bl __gttf2
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r1, r10
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: stmib sp, {r4, r6}
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: bl __gttf2
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: movgt r7, r11
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: stmib sp, {r4, r6}
+; CHECK-NEXT: movgt r4, r10
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r1, r10
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: movgt r6, r9
+; CHECK-NEXT: bl __gttf2
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: movgt r5, r8
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: add sp, sp, #28
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a)
ret fp128 %b
diff --git a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
index 1252085ef948d..70c569e4f4781 100644
--- a/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
+++ b/llvm/test/CodeGen/ARM/vecreduce-fmin-legalization-soft-float.ll
@@ -9,33 +9,44 @@ declare fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128>)
define half @test_v4f16(<4 x half> %a) nounwind {
; CHECK-LABEL: test_v4f16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: mov r4, #255
-; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: orr r4, r4, #65280
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: and r0, r3, r4
-; CHECK-NEXT: mov r6, r1
-; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r8, r0
-; CHECK-NEXT: and r0, r5, r4
-; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: and r0, r7, r4
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT: mov r9, #255
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: orr r9, r9, #65280
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: and r0, r0, r9
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: bl __aeabi_h2f
; CHECK-NEXT: mov r7, r0
-; CHECK-NEXT: and r0, r6, r4
+; CHECK-NEXT: and r0, r5, r9
; CHECK-NEXT: bl __aeabi_h2f
-; CHECK-NEXT: mov r1, r0
+; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: bl fminf
; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: bl fminf
-; CHECK-NEXT: mov r1, r8
-; CHECK-NEXT: bl fminf
+; CHECK-NEXT: bl __aeabi_fcmplt
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: and r0, r6, r9
+; CHECK-NEXT: bl __aeabi_h2f
+; CHECK-NEXT: cmp r4, #0
+; CHECK-NEXT: mov r6, r0
+; CHECK-NEXT: movne r5, r7
+; CHECK-NEXT: mov r1, r6
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_fcmplt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: and r0, r8, r9
+; CHECK-NEXT: moveq r5, r6
+; CHECK-NEXT: bl __aeabi_h2f
+; CHECK-NEXT: mov r4, r0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: bl __aeabi_fcmplt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: moveq r5, r4
+; CHECK-NEXT: mov r0, r5
; CHECK-NEXT: bl __aeabi_f2h
-; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r11, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
ret half %b
@@ -44,16 +55,27 @@ define half @test_v4f16(<4 x half> %a) nounwind {
define float @test_v4f32(<4 x float> %a) nounwind {
; CHECK-LABEL: test_v4f32:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r11, lr}
-; CHECK-NEXT: push {r4, r5, r11, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: mov r4, r3
-; CHECK-NEXT: mov r5, r2
-; CHECK-NEXT: bl fminf
-; CHECK-NEXT: mov r1, r5
-; CHECK-NEXT: bl fminf
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r5, r1
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: bl __aeabi_fcmplt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r1, r6
+; CHECK-NEXT: movne r5, r7
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_fcmplt
+; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mov r1, r4
-; CHECK-NEXT: bl fminf
-; CHECK-NEXT: pop {r4, r5, r11, lr}
+; CHECK-NEXT: moveq r5, r6
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: bl __aeabi_fcmplt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: moveq r5, r4
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
ret float %b
@@ -62,10 +84,26 @@ define float @test_v4f32(<4 x float> %a) nounwind {
define double @test_v2f64(<2 x double> %a) nounwind {
; CHECK-LABEL: test_v2f64:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r11, lr}
-; CHECK-NEXT: push {r11, lr}
-; CHECK-NEXT: bl fmin
-; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: mov r4, r3
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: mov r7, r0
+; CHECK-NEXT: bl __aeabi_dcmplt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r5, r6
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: mov r1, r8
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: movne r5, r7
+; CHECK-NEXT: bl __aeabi_dcmplt
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: movne r4, r8
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
ret double %b
@@ -74,21 +112,65 @@ define double @test_v2f64(<2 x double> %a) nounwind {
define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-LABEL: test_v2f128:
; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r11, lr}
-; CHECK-NEXT: push {r11, lr}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: ldr r12, [sp, #36]
-; CHECK-NEXT: str r12, [sp, #12]
-; CHECK-NEXT: ldr r12, [sp, #32]
-; CHECK-NEXT: str r12, [sp, #8]
-; CHECK-NEXT: ldr r12, [sp, #28]
-; CHECK-NEXT: str r12, [sp, #4]
-; CHECK-NEXT: ldr r12, [sp, #24]
-; CHECK-NEXT: str r12, [sp]
-; CHECK-NEXT: bl fminl
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: pop {r11, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #28
+; CHECK-NEXT: sub sp, sp, #28
+; CHECK-NEXT: ldr r5, [sp, #76]
+; CHECK-NEXT: mov r8, r3
+; CHECK-NEXT: ldr r6, [sp, #72]
+; CHECK-NEXT: mov r9, r2
+; CHECK-NEXT: ldr r4, [sp, #68]
+; CHECK-NEXT: mov r10, r1
+; CHECK-NEXT: ldr r7, [sp, #64]
+; CHECK-NEXT: mov r11, r0
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: str r6, [sp, #8]
+; CHECK-NEXT: str r4, [sp, #4]
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r1, r10
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: stmib sp, {r4, r6}
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r1, r10
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: stmib sp, {r4, r6}
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: str r7, [sp]
+; CHECK-NEXT: movmi r7, r11
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: stmib sp, {r4, r6}
+; CHECK-NEXT: movmi r4, r10
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r1, r10
+; CHECK-NEXT: mov r2, r9
+; CHECK-NEXT: mov r3, r8
+; CHECK-NEXT: str r5, [sp, #12]
+; CHECK-NEXT: movmi r6, r9
+; CHECK-NEXT: bl __lttf2
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: movmi r5, r8
+; CHECK-NEXT: mov r1, r4
+; CHECK-NEXT: mov r2, r6
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: add sp, sp, #28
+; CHECK-NEXT: pop {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: mov pc, lr
%b = call fast fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a)
ret fp128 %b
diff --git a/llvm/test/CodeGen/RISCV/fmax-fmin.ll b/llvm/test/CodeGen/RISCV/fmax-fmin.ll
index 4be3e9e55d7c4..8a1ab85a1b336 100644
--- a/llvm/test/CodeGen/RISCV/fmax-fmin.ll
+++ b/llvm/test/CodeGen/RISCV/fmax-fmin.ll
@@ -29,18 +29,40 @@ define float @maxnum_f32_fast(float %x, float %y) nounwind {
; R32: # %bb.0:
; R32-NEXT: addi sp, sp, -16
; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; R32-NEXT: call fmaxf@plt
+; R32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; R32-NEXT: mv s1, a1
+; R32-NEXT: mv s0, a0
+; R32-NEXT: call __gtsf2@plt
+; R32-NEXT: bgtz a0, .LBB1_2
+; R32-NEXT: # %bb.1:
+; R32-NEXT: mv s0, s1
+; R32-NEXT: .LBB1_2:
+; R32-NEXT: mv a0, s0
; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; R32-NEXT: addi sp, sp, 16
; R32-NEXT: ret
;
; R64-LABEL: maxnum_f32_fast:
; R64: # %bb.0:
-; R64-NEXT: addi sp, sp, -16
-; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; R64-NEXT: call fmaxf@plt
-; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; R64-NEXT: addi sp, sp, 16
+; R64-NEXT: addi sp, sp, -32
+; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; R64-NEXT: mv s1, a1
+; R64-NEXT: mv s0, a0
+; R64-NEXT: call __gtsf2@plt
+; R64-NEXT: bgtz a0, .LBB1_2
+; R64-NEXT: # %bb.1:
+; R64-NEXT: mv s0, s1
+; R64-NEXT: .LBB1_2:
+; R64-NEXT: mv a0, s0
+; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; R64-NEXT: addi sp, sp, 32
; R64-NEXT: ret
%r = call fast float @llvm.maxnum.f32(float %x, float %y)
ret float %r
@@ -71,20 +93,61 @@ define double @maxnum_f64(double %x, double %y) nounwind {
define double @maxnum_f64_nnan(double %x, double %y) nounwind {
; R32-LABEL: maxnum_f64_nnan:
; R32: # %bb.0:
-; R32-NEXT: addi sp, sp, -16
-; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; R32-NEXT: call fmax@plt
-; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; R32-NEXT: addi sp, sp, 16
+; R32-NEXT: addi sp, sp, -32
+; R32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
+; R32-NEXT: mv s1, a3
+; R32-NEXT: mv s2, a2
+; R32-NEXT: mv s0, a1
+; R32-NEXT: mv s4, a0
+; R32-NEXT: call __gtdf2@plt
+; R32-NEXT: mv s3, s4
+; R32-NEXT: bgtz a0, .LBB3_2
+; R32-NEXT: # %bb.1:
+; R32-NEXT: mv s3, s2
+; R32-NEXT: .LBB3_2:
+; R32-NEXT: mv a0, s4
+; R32-NEXT: mv a1, s0
+; R32-NEXT: mv a2, s2
+; R32-NEXT: mv a3, s1
+; R32-NEXT: call __gtdf2@plt
+; R32-NEXT: bgtz a0, .LBB3_4
+; R32-NEXT: # %bb.3:
+; R32-NEXT: mv s0, s1
+; R32-NEXT: .LBB3_4:
+; R32-NEXT: mv a0, s3
+; R32-NEXT: mv a1, s0
+; R32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; R32-NEXT: addi sp, sp, 32
; R32-NEXT: ret
;
; R64-LABEL: maxnum_f64_nnan:
; R64: # %bb.0:
-; R64-NEXT: addi sp, sp, -16
-; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; R64-NEXT: call fmax@plt
-; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; R64-NEXT: addi sp, sp, 16
+; R64-NEXT: addi sp, sp, -32
+; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; R64-NEXT: mv s1, a1
+; R64-NEXT: mv s0, a0
+; R64-NEXT: call __gtdf2@plt
+; R64-NEXT: bgtz a0, .LBB3_2
+; R64-NEXT: # %bb.1:
+; R64-NEXT: mv s0, s1
+; R64-NEXT: .LBB3_2:
+; R64-NEXT: mv a0, s0
+; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; R64-NEXT: addi sp, sp, 32
; R64-NEXT: ret
%r = call nnan double @llvm.maxnum.f64(double %x, double %y)
ret double %r
@@ -117,18 +180,40 @@ define float @minnum_f32_nnan(float %x, float %y) nounwind {
; R32: # %bb.0:
; R32-NEXT: addi sp, sp, -16
; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; R32-NEXT: call fminf@plt
+; R32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill
+; R32-NEXT: mv s1, a1
+; R32-NEXT: mv s0, a0
+; R32-NEXT: call __ltsf2@plt
+; R32-NEXT: bltz a0, .LBB5_2
+; R32-NEXT: # %bb.1:
+; R32-NEXT: mv s0, s1
+; R32-NEXT: .LBB5_2:
+; R32-NEXT: mv a0, s0
; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s1, 4(sp) # 4-byte Folded Reload
; R32-NEXT: addi sp, sp, 16
; R32-NEXT: ret
;
; R64-LABEL: minnum_f32_nnan:
; R64: # %bb.0:
-; R64-NEXT: addi sp, sp, -16
-; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; R64-NEXT: call fminf@plt
-; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; R64-NEXT: addi sp, sp, 16
+; R64-NEXT: addi sp, sp, -32
+; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; R64-NEXT: mv s1, a1
+; R64-NEXT: mv s0, a0
+; R64-NEXT: call __ltsf2@plt
+; R64-NEXT: bltz a0, .LBB5_2
+; R64-NEXT: # %bb.1:
+; R64-NEXT: mv s0, s1
+; R64-NEXT: .LBB5_2:
+; R64-NEXT: mv a0, s0
+; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; R64-NEXT: addi sp, sp, 32
; R64-NEXT: ret
%r = call nnan float @llvm.minnum.f32(float %x, float %y)
ret float %r
@@ -159,20 +244,61 @@ define double @minnum_f64(double %x, double %y) nounwind {
define double @minnum_f64_fast(double %x, double %y) nounwind {
; R32-LABEL: minnum_f64_fast:
; R32: # %bb.0:
-; R32-NEXT: addi sp, sp, -16
-; R32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; R32-NEXT: call fmin@plt
-; R32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; R32-NEXT: addi sp, sp, 16
+; R32-NEXT: addi sp, sp, -32
+; R32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; R32-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
+; R32-NEXT: mv s1, a3
+; R32-NEXT: mv s2, a2
+; R32-NEXT: mv s0, a1
+; R32-NEXT: mv s4, a0
+; R32-NEXT: call __ltdf2@plt
+; R32-NEXT: mv s3, s4
+; R32-NEXT: bltz a0, .LBB7_2
+; R32-NEXT: # %bb.1:
+; R32-NEXT: mv s3, s2
+; R32-NEXT: .LBB7_2:
+; R32-NEXT: mv a0, s4
+; R32-NEXT: mv a1, s0
+; R32-NEXT: mv a2, s2
+; R32-NEXT: mv a3, s1
+; R32-NEXT: call __ltdf2@plt
+; R32-NEXT: bltz a0, .LBB7_4
+; R32-NEXT: # %bb.3:
+; R32-NEXT: mv s0, s1
+; R32-NEXT: .LBB7_4:
+; R32-NEXT: mv a0, s3
+; R32-NEXT: mv a1, s0
+; R32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; R32-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
+; R32-NEXT: addi sp, sp, 32
; R32-NEXT: ret
;
; R64-LABEL: minnum_f64_fast:
; R64: # %bb.0:
-; R64-NEXT: addi sp, sp, -16
-; R64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; R64-NEXT: call fmin@plt
-; R64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; R64-NEXT: addi sp, sp, 16
+; R64-NEXT: addi sp, sp, -32
+; R64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; R64-NEXT: sd s1, 8(sp) # 8-byte Folded Spill
+; R64-NEXT: mv s1, a1
+; R64-NEXT: mv s0, a0
+; R64-NEXT: call __ltdf2@plt
+; R64-NEXT: bltz a0, .LBB7_2
+; R64-NEXT: # %bb.1:
+; R64-NEXT: mv s0, s1
+; R64-NEXT: .LBB7_2:
+; R64-NEXT: mv a0, s0
+; R64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; R64-NEXT: ld s1, 8(sp) # 8-byte Folded Reload
+; R64-NEXT: addi sp, sp, 32
; R64-NEXT: ret
%r = call fast double @llvm.minnum.f64(double %x, double %y)
ret double %r
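Note that the unflagged variants in this file (e.g. maxnum_f64, minnum_f64) are unchanged: without nnan or fast, FMINNUM/FMAXNUM must still honor NaN semantics, so those cases continue to use the fmin/fmax libcalls. Roughly (hypothetical IR):

    ; still softened to a libm call:
    %r1 = call double @llvm.maxnum.f64(double %x, double %y)
    ; expanded to compare/select by this patch:
    %r2 = call nnan double @llvm.maxnum.f64(double %x, double %y)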