[llvm] [DAGCombiner] Eliminate fp casts if we have the right fast math flags (PR #131345)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 14 08:37:18 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: John Brawn (john-brawn-arm)
Changes:
When floating-point operations are legalized to operations of a higher precision (e.g. an f16 fadd being legalized to an f32 fadd), we end up with a narrowing cast followed by a widening cast between each pair of operations. With the appropriate fast math flags (nnan ninf contract) we can eliminate these casts.
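For illustration, the f16 case looks roughly like this at the IR level (a minimal sketch mirroring the nnan_ninf_contract_fadd_sequence test added by this patch; the legalization description in the comments is schematic):

```llvm
; Two chained f16 fadds with the required flags. On a target without native
; f16 arithmetic, each fadd is legalized to f32, inserting an FP_ROUND
; (f32 -> f16) followed by an FP_EXTEND (f16 -> f32) between the two adds.
define half @fadd_sequence(half %x, half %y, half %z) {
  %add1 = fadd nnan ninf contract half %x, %y
  %add2 = fadd nnan ninf contract half %add1, %z
  ret half %add2
}
; With nnan+ninf on the widening node and contract on both casts, the
; combine replaces extend(round(v)) with v, so only the one final
; f32 -> f16 rounding of the result remains.
```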
---
Patch is 63.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131345.diff
15 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+45)
- (modified) llvm/test/CodeGen/AArch64/f16-instructions.ll (+11-5)
- (modified) llvm/test/CodeGen/AArch64/fmla.ll (+2-5)
- (modified) llvm/test/CodeGen/AArch64/fp16_fast_math.ll (+109)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll (+61-111)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fadd.ll (+78-138)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fmul.ll (+54-94)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+4-24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+4-24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (-20)
- (modified) llvm/test/CodeGen/ARM/fp16_fast_math.ll (+143-6)
- (modified) llvm/test/CodeGen/Thumb2/bf16-instructions.ll (+8-17)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c35838601cc9c..c16e2f0bc865b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18455,7 +18455,45 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
return SDValue();
}
+// Eliminate a floating-point widening of a narrowed value if the fast math
+// flags allow it.
+static SDValue eliminateFPCastPair(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned NarrowingOp;
+ switch (N->getOpcode()) {
+ case ISD::FP16_TO_FP:
+ NarrowingOp = ISD::FP_TO_FP16;
+ break;
+ case ISD::BF16_TO_FP:
+ NarrowingOp = ISD::FP_TO_BF16;
+ break;
+ case ISD::FP_EXTEND:
+ NarrowingOp = ISD::FP_ROUND;
+ break;
+ default:
+ llvm_unreachable("Expected widening FP cast");
+ }
+
+ if (N0.getOpcode() == NarrowingOp && N0.getOperand(0).getValueType() == VT) {
+ const SDNodeFlags SrcFlags = N0->getFlags();
+ const SDNodeFlags DstFlags = N->getFlags();
+ // Narrowing can introduce inf and change the encoding of a nan, so the
+ // destination must have the nnan and ninf flags to indicate that we don't
+ // need to care about that. We are also removing a rounding step, and that
+ // requires both the source and destination to allow contraction.
+ if (DstFlags.hasNoNaNs() && DstFlags.hasNoInfs() &&
+ SrcFlags.hasAllowContract() && DstFlags.hasAllowContract()) {
+ return N0.getOperand(0);
+ }
+ }
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
SDLoc DL(N);
@@ -18507,6 +18545,9 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
return NewVSel;
+ if (SDValue CastEliminated = eliminateFPCastPair(N))
+ return CastEliminated;
+
return SDValue();
}
@@ -27209,6 +27250,7 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
}
SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N);
auto Op = N->getOpcode();
assert((Op == ISD::FP16_TO_FP || Op == ISD::BF16_TO_FP) &&
"opcode should be FP16_TO_FP or BF16_TO_FP.");
@@ -27223,6 +27265,9 @@ SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
}
}
+ if (SDValue CastEliminated = eliminateFPCastPair(N))
+ return CastEliminated;
+
// Sometimes constants manage to survive very late in the pipeline, e.g.,
// because they are wrapped inside the <1 x f16> type. Try one last time to
// get rid of them.
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 5460a376931a5..adc536da26f26 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -84,11 +84,8 @@ define half @test_fmadd(half %a, half %b, half %c) #0 {
; CHECK-CVT-SD: // %bb.0:
; CHECK-CVT-SD-NEXT: fcvt s1, h1
; CHECK-CVT-SD-NEXT: fcvt s0, h0
-; CHECK-CVT-SD-NEXT: fmul s0, s0, s1
-; CHECK-CVT-SD-NEXT: fcvt s1, h2
-; CHECK-CVT-SD-NEXT: fcvt h0, s0
-; CHECK-CVT-SD-NEXT: fcvt s0, h0
-; CHECK-CVT-SD-NEXT: fadd s0, s0, s1
+; CHECK-CVT-SD-NEXT: fcvt s2, h2
+; CHECK-CVT-SD-NEXT: fmadd s0, s0, s1, s2
; CHECK-CVT-SD-NEXT: fcvt h0, s0
; CHECK-CVT-SD-NEXT: ret
;
@@ -1248,6 +1245,15 @@ define half @test_atan(half %a) #0 {
}
define half @test_atan2(half %a, half %b) #0 {
+; CHECK-LABEL: test_atan2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl atan2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
%r = call half @llvm.atan2.f16(half %a, half %b)
ret half %r
}
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 7bcaae5a77eac..a37aabb0b5384 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -1114,11 +1114,8 @@ define half @fmul_f16(half %a, half %b, half %c) {
; CHECK-SD-NOFP16: // %bb.0: // %entry
; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h2
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
; CHECK-SD-NOFP16-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/fp16_fast_math.ll b/llvm/test/CodeGen/AArch64/fp16_fast_math.ll
index b7d2de708a110..7d9654d1ff8c0 100644
--- a/llvm/test/CodeGen/AArch64/fp16_fast_math.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_fast_math.ll
@@ -88,3 +88,112 @@ entry:
%add = fadd ninf half %x, %y
ret half %add
}
+
+; Check that when we have the right fast math flags the converts in between the
+; two fadds are removed.
+
+define half @normal_fadd_sequence(half %x, half %y, half %z) {
+ ; CHECK-CVT-LABEL: name: normal_fadd_sequence
+ ; CHECK-CVT: bb.0.entry:
+ ; CHECK-CVT-NEXT: liveins: $h0, $h1, $h2
+ ; CHECK-CVT-NEXT: {{ $}}
+ ; CHECK-CVT-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
+ ; CHECK-CVT-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
+ ; CHECK-CVT-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
+ ; CHECK-CVT-NEXT: [[FCVTSHr:%[0-9]+]]:fpr32 = nofpexcept FCVTSHr [[COPY1]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr1:%[0-9]+]]:fpr32 = nofpexcept FCVTSHr [[COPY2]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[FCVTSHr1]], killed [[FCVTSHr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTHSr:%[0-9]+]]:fpr16 = nofpexcept FCVTHSr killed [[FADDSrr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr2:%[0-9]+]]:fpr32 = nofpexcept FCVTSHr killed [[FCVTHSr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr3:%[0-9]+]]:fpr32 = nofpexcept FCVTSHr [[COPY]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[FCVTSHr2]], killed [[FCVTSHr3]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTHSr1:%[0-9]+]]:fpr16 = nofpexcept FCVTHSr killed [[FADDSrr1]], implicit $fpcr
+ ; CHECK-CVT-NEXT: $h0 = COPY [[FCVTHSr1]]
+ ; CHECK-CVT-NEXT: RET_ReallyLR implicit $h0
+ ;
+ ; CHECK-FP16-LABEL: name: normal_fadd_sequence
+ ; CHECK-FP16: bb.0.entry:
+ ; CHECK-FP16-NEXT: liveins: $h0, $h1, $h2
+ ; CHECK-FP16-NEXT: {{ $}}
+ ; CHECK-FP16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
+ ; CHECK-FP16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
+ ; CHECK-FP16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
+ ; CHECK-FP16-NEXT: [[FADDHrr:%[0-9]+]]:fpr16 = nofpexcept FADDHrr [[COPY2]], [[COPY1]], implicit $fpcr
+ ; CHECK-FP16-NEXT: [[FADDHrr1:%[0-9]+]]:fpr16 = nofpexcept FADDHrr killed [[FADDHrr]], [[COPY]], implicit $fpcr
+ ; CHECK-FP16-NEXT: $h0 = COPY [[FADDHrr1]]
+ ; CHECK-FP16-NEXT: RET_ReallyLR implicit $h0
+entry:
+ %add1 = fadd half %x, %y
+ %add2 = fadd half %add1, %z
+ ret half %add2
+}
+
+define half @nnan_ninf_contract_fadd_sequence(half %x, half %y, half %z) {
+ ; CHECK-CVT-LABEL: name: nnan_ninf_contract_fadd_sequence
+ ; CHECK-CVT: bb.0.entry:
+ ; CHECK-CVT-NEXT: liveins: $h0, $h1, $h2
+ ; CHECK-CVT-NEXT: {{ $}}
+ ; CHECK-CVT-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
+ ; CHECK-CVT-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
+ ; CHECK-CVT-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
+ ; CHECK-CVT-NEXT: [[FCVTSHr:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FCVTSHr [[COPY1]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FCVTSHr [[COPY2]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[FCVTSHr1]], killed [[FCVTSHr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr2:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FCVTSHr [[COPY]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[FADDSrr]], killed [[FCVTSHr2]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTHSr:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept FCVTHSr killed [[FADDSrr1]], implicit $fpcr
+ ; CHECK-CVT-NEXT: $h0 = COPY [[FCVTHSr]]
+ ; CHECK-CVT-NEXT: RET_ReallyLR implicit $h0
+ ;
+ ; CHECK-FP16-LABEL: name: nnan_ninf_contract_fadd_sequence
+ ; CHECK-FP16: bb.0.entry:
+ ; CHECK-FP16-NEXT: liveins: $h0, $h1, $h2
+ ; CHECK-FP16-NEXT: {{ $}}
+ ; CHECK-FP16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
+ ; CHECK-FP16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
+ ; CHECK-FP16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
+ ; CHECK-FP16-NEXT: [[FADDHrr:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept FADDHrr [[COPY2]], [[COPY1]], implicit $fpcr
+ ; CHECK-FP16-NEXT: [[FADDHrr1:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept FADDHrr killed [[FADDHrr]], [[COPY]], implicit $fpcr
+ ; CHECK-FP16-NEXT: $h0 = COPY [[FADDHrr1]]
+ ; CHECK-FP16-NEXT: RET_ReallyLR implicit $h0
+entry:
+ %add1 = fadd nnan ninf contract half %x, %y
+ %add2 = fadd nnan ninf contract half %add1, %z
+ ret half %add2
+}
+
+define half @ninf_fadd_sequence(half %x, half %y, half %z) {
+ ; CHECK-CVT-LABEL: name: ninf_fadd_sequence
+ ; CHECK-CVT: bb.0.entry:
+ ; CHECK-CVT-NEXT: liveins: $h0, $h1, $h2
+ ; CHECK-CVT-NEXT: {{ $}}
+ ; CHECK-CVT-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
+ ; CHECK-CVT-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
+ ; CHECK-CVT-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
+ ; CHECK-CVT-NEXT: [[FCVTSHr:%[0-9]+]]:fpr32 = ninf nofpexcept FCVTSHr [[COPY1]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr1:%[0-9]+]]:fpr32 = ninf nofpexcept FCVTSHr [[COPY2]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[FCVTSHr1]], killed [[FCVTSHr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTHSr:%[0-9]+]]:fpr16 = ninf nofpexcept FCVTHSr killed [[FADDSrr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr2:%[0-9]+]]:fpr32 = ninf nofpexcept FCVTSHr killed [[FCVTHSr]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTSHr3:%[0-9]+]]:fpr32 = ninf nofpexcept FCVTSHr [[COPY]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[FCVTSHr2]], killed [[FCVTSHr3]], implicit $fpcr
+ ; CHECK-CVT-NEXT: [[FCVTHSr1:%[0-9]+]]:fpr16 = ninf nofpexcept FCVTHSr killed [[FADDSrr1]], implicit $fpcr
+ ; CHECK-CVT-NEXT: $h0 = COPY [[FCVTHSr1]]
+ ; CHECK-CVT-NEXT: RET_ReallyLR implicit $h0
+ ;
+ ; CHECK-FP16-LABEL: name: ninf_fadd_sequence
+ ; CHECK-FP16: bb.0.entry:
+ ; CHECK-FP16-NEXT: liveins: $h0, $h1, $h2
+ ; CHECK-FP16-NEXT: {{ $}}
+ ; CHECK-FP16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2
+ ; CHECK-FP16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1
+ ; CHECK-FP16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0
+ ; CHECK-FP16-NEXT: [[FADDHrr:%[0-9]+]]:fpr16 = ninf nofpexcept FADDHrr [[COPY2]], [[COPY1]], implicit $fpcr
+ ; CHECK-FP16-NEXT: [[FADDHrr1:%[0-9]+]]:fpr16 = ninf nofpexcept FADDHrr killed [[FADDHrr]], [[COPY]], implicit $fpcr
+ ; CHECK-FP16-NEXT: $h0 = COPY [[FADDHrr1]]
+ ; CHECK-FP16-NEXT: RET_ReallyLR implicit $h0
+entry:
+ %add1 = fadd ninf half %x, %y
+ %add2 = fadd ninf half %add1, %z
+ ret half %add2
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index 4eaaee7ce5055..95ca0a68a7212 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -443,23 +443,17 @@ define half @faddv_v4f16(half %start, <4 x half> %a) {
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
; NONEON-NOSVE-NEXT: str d1, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #10]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #12]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #14]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
+; NONEON-NOSVE-NEXT: ldr h3, [sp, #12]
+; NONEON-NOSVE-NEXT: ldr h4, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt s3, h3
+; NONEON-NOSVE-NEXT: fcvt s2, h2
+; NONEON-NOSVE-NEXT: fcvt s4, h4
+; NONEON-NOSVE-NEXT: fadd s2, s3, s2
+; NONEON-NOSVE-NEXT: fadd s1, s4, s1
; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fadd s0, s0, s1
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: add sp, sp, #16
@@ -481,44 +475,30 @@ define half @faddv_v8f16(half %start, <8 x half> %a) {
; NONEON-NOSVE: // %bb.0:
; NONEON-NOSVE-NEXT: str q1, [sp, #-16]!
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16
-; NONEON-NOSVE-NEXT: ldr h1, [sp]
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
+; NONEON-NOSVE-NEXT: ldr h1, [sp, #2]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #8]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #10]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
+; NONEON-NOSVE-NEXT: fcvt s0, h0
+; NONEON-NOSVE-NEXT: ldr h3, [sp, #4]
+; NONEON-NOSVE-NEXT: ldr h4, [sp]
+; NONEON-NOSVE-NEXT: ldr h5, [sp, #10]
+; NONEON-NOSVE-NEXT: ldr h6, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #12]
; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fcvt s4, h4
+; NONEON-NOSVE-NEXT: fcvt s3, h3
+; NONEON-NOSVE-NEXT: fcvt s5, h5
+; NONEON-NOSVE-NEXT: fcvt s6, h6
+; NONEON-NOSVE-NEXT: ldr h7, [sp, #12]
+; NONEON-NOSVE-NEXT: fadd s1, s4, s1
+; NONEON-NOSVE-NEXT: fadd s2, s3, s2
+; NONEON-NOSVE-NEXT: fcvt s3, h7
+; NONEON-NOSVE-NEXT: fadd s4, s6, s5
+; NONEON-NOSVE-NEXT: ldr h5, [sp, #14]
; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: ldr h2, [sp, #14]
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fadd s2, s4, s3
+; NONEON-NOSVE-NEXT: fcvt s3, h5
; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fadd s0, s0, s3
; NONEON-NOSVE-NEXT: fadd s0, s0, s1
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: add sp, sp, #16
@@ -547,79 +527,49 @@ define half @faddv_v16f16(half %start, ptr %a) {
; NONEON-NOSVE-NEXT: fcvt s0, h0
; NONEON-NOSVE-NEXT: ldr h3, [sp, #16]
; NONEON-NOSVE-NEXT: ldr h4, [sp]
+; NONEON-NOSVE-NEXT: ldr h5, [sp, #20]
+; NONEON-NOSVE-NEXT: ldr h6, [sp, #4]
; NONEON-NOSVE-NEXT: fcvt s1, h1
; NONEON-NOSVE-NEXT: fcvt s2, h2
+; NONEON-NOSVE-NEXT: ldr h7, [sp, #22]
+; NONEON-NOSVE-NEXT: ldr h16, [sp, #6]
; NONEON-NOSVE-NEXT: fcvt s3, h3
+; NONEON-NOSVE-NEXT: ldr h17, [sp, #24]
+; NONEON-NOSVE-NEXT: ldr h18, [sp, #8]
; NONEON-NOSVE-NEXT: fcvt s4, h4
+; NONEON-NOSVE-NEXT: ldr h19, [sp, #26]
+; NONEON-NOSVE-NEXT: ldr h20, [sp, #10]
+; NONEON-NOSVE-NEXT: fcvt s5, h5
+; NONEON-NOSVE-NEXT: fcvt s6, h6
+; NONEON-NOSVE-NEXT: fcvt s7, h7
+; NONEON-NOSVE-NEXT: fcvt s16, h16
+; NONEON-NOSVE-NEXT: fcvt s17, h17
+; NONEON-NOSVE-NEXT: fcvt s18, h18
+; NONEON-NOSVE-NEXT: fcvt s19, h19
+; NONEON-NOSVE-NEXT: fcvt s20, h20
+; NONEON-NOSVE-NEXT: ldr h21, [sp, #28]
+; NONEON-NOSVE-NEXT: ldr h22, [sp, #12]
; NONEON-NOSVE-NEXT: fadd s1, s2, s1
; NONEON-NOSVE-NEXT: fadd s2, s4, s3
-; NONEON-NOSVE-NEXT: ldr h3, [sp, #20]
-; NONEON-NOSVE-NEXT: ldr h4, [sp, #4]
-; NONEON-NOSVE-NEXT: fcvt s3, h3
-; NONEON-NOSVE-NEXT: fcvt s4, h4
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt h2, s2
-; NONEON-NOSVE-NEXT: fadd s3, s4, s3
-; NONEON-NOSVE-NEXT: ldr h4, [sp, #6]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt s4, h4
+; NONEON-NOSVE-NEXT: fadd s3, s6, s5
+; NONEON-NOSVE-NEXT: fadd s4, s16, s7
+; NONEON-NOSVE-NEXT: fcvt s5, h21
+; NONEON-NOSVE-NEXT: fcvt s6, h22
+; NONEON-NOSVE-NEXT: fadd s7, s18, s17
+; NONEON-NOSVE-NEXT: ldr h17, [sp, #30]
+; NONEON-NOSVE-NEXT: fadd s16, s20, s19
+; NONEON-NOSVE-NEXT: ldr h18, [sp, #14]
; NONEON-NOSVE-NEXT: fadd s1, s2, s1
-; NONEON-NOSVE-NEXT: fcvt h2, s3
-; NONEON-NOSVE-NEXT: ldr h3, [sp, #22]
-; NONEON-NOSVE-NEXT: fcvt s3, h3
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fadd s3, s4, s3
-; NONEON-NOSVE-NEXT: ldr h4, [sp, #8]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s4, h4
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fcvt h2, s3
-; NONEON-NOSVE-NEXT: ldr h3, [sp, #24]
-; NONEON-NOSVE-NEXT: fcvt s3, h3
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fadd s3, s4, s3
-; NONEON-NOSVE-NEXT: ldr h4, [sp, #10]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s4, h4
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fcvt h2, s3
-; NONEON-NOSVE-NEXT: ldr h3, [sp, #26]
-; NONEON-NOSVE-NEXT: fcvt s3, h3
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fadd s3, s4, s3
-; NONEON-NOSVE-NEXT: ldr h4, [sp, #12]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s4, h4
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fcvt h2, s3
-; NONEON-NOSVE-NEXT: ldr h3, [sp, #28]
-; NONEON-NOSVE-NEXT: fcvt s3, h3
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fadd s2, s4, s3
-; NONEON-NOSVE-NEXT: ldr h3, [sp, #30]
-; NONEON-NOSVE-NEXT: ldr h4, [sp, #14]
-; NONEON-NOSVE-NEXT: fcvt s3, h3
-; NONEON-NOSVE-NEXT: fcvt s4, h4
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt h2, s2
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
+; NONEON-NOSVE-NEXT: fadd s2, s3, s4
+; NONEON-NOSVE-NEXT: fcvt s4, h17
+; NONEON-NOSVE-NEXT: fadd s5, s6, s5
+; NONEON-NOSVE-NEXT: fcvt s6, h18
+; NONEON-NOSVE-NEXT: fadd s3, s7, s16
; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fadd s2, s4, s3
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt h2, s2
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
+; NONEON-NOSVE-NEXT: fadd s2, s3, s5
+; NONEON-NOSVE-NEXT: fadd s3, s6, s4
; NONEON-NOSVE-NEXT: fadd s1, s1, s2
-; NONEON-NOSVE-NEXT: fcvt h1, s1
-; NONEON-NOSVE-NEXT: fcvt s1, h1
+; NONEON-NOSVE-NEXT: fadd s0, s0, s3
; NONEON-NOSVE-NEXT: fadd s0, s0, s1
; NONEON-NOSVE-NEXT: fcvt h0, s0
; NONEON-NOSVE-NEXT: add sp, sp, #32
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 03db1d0d433d3..11ce20f109623 100644
--- a/llvm/te...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/131345
More information about the llvm-commits mailing list