[llvm] [AArch64][GlobalISel] Better vecreduce.fadd lowering. (PR #73294)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 24 00:26:22 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
@llvm/pr-subscribers-llvm-globalisel
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
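This changes the vecreduce.fadd (`G_VECREDUCE_FADD`) legalization to handle fp16 types, and treats more types as legal so that the backend can produce the correct patterns. There is currently a missing identity fold for `fadd x, -0.0 -> x`, so the GlobalISel output still contains a redundant `fadd` with the `-0.0` start value.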
---
Patch is 54.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73294.diff
3 Files Affected:
- (modified) llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (+1)
- (modified) llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (+11-3)
- (modified) llvm/test/CodeGen/AArch64/vecreduce-fadd.ll (+800-436)
``````````diff
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 11d01429485dcbc..f6834eb3d4502b0 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2819,6 +2819,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_VECREDUCE_FADD:
case TargetOpcode::G_VECREDUCE_FMIN:
case TargetOpcode::G_VECREDUCE_FMAX:
case TargetOpcode::G_VECREDUCE_FMINIMUM:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 316a9eaa63d4bb4..e665bf42a98de8a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -970,11 +970,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.legalFor(PackedVectorAllTypeList)
.lowerIf(isScalar(0));
+ // For fadd reductions we have pairwise operations available. We treat the
+ // usual legal types as legal and handle the lowering to pairwise instructions
+ // later.
getActionDefinitionsBuilder(G_VECREDUCE_FADD)
- // We only have FADDP to do reduction-like operations. Lower the rest.
- .legalFor({{s32, v2s32}, {s64, v2s64}})
+ .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}})
+ .legalIf([=](const LegalityQuery &Query) {
+ const auto &Ty = Query.Types[1];
+ return (Ty == v4s16 || Ty == v8s16) && HasFP16;
+ })
+ .minScalarOrElt(0, MinFPScalar)
.clampMaxNumElements(1, s64, 2)
- .clampMaxNumElements(1, s32, 2)
+ .clampMaxNumElements(1, s32, 4)
+ .clampMaxNumElements(1, s16, 8)
.lower();
getActionDefinitionsBuilder(G_VECREDUCE_ADD)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index e770def93aa4e6c..43e44b6832f8c14 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -1,223 +1,346 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic -mattr=+fullfp16 < %s | FileCheck --check-prefixes=CHECK,FULLFP16 %s
-; RUN: llc --mtriple=aarch64-eabi -aarch64-neon-syntax=generic < %s | FileCheck %s --check-prefixes=CHECK,CHECKNOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16
+; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
define float @add_HalfS(<2 x float> %bin.rdx) {
-; CHECK-LABEL: add_HalfS:
-; CHECK: // %bb.0:
-; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: add_HalfS:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: faddp s0, v0.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: add_HalfS:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24
+; CHECK-GI-NEXT: faddp s0, v0.2s
+; CHECK-GI-NEXT: fadd s0, s0, s1
+; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx)
ret float %r
}
define half @add_HalfH(<4 x half> %bin.rdx) {
-; FULLFP16-LABEL: add_HalfH:
-; FULLFP16: // %bb.0:
-; FULLFP16-NEXT: faddp v0.4h, v0.4h, v0.4h
-; FULLFP16-NEXT: faddp h0, v0.2h
-; FULLFP16-NEXT: ret
+; CHECK-SD-NOFP16-LABEL: add_HalfH:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
;
-; CHECKNOFP16-LABEL: add_HalfH:
-; CHECKNOFP16: // %bb.0:
-; CHECKNOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECKNOFP16-NEXT: mov h1, v0.h[1]
-; CHECKNOFP16-NEXT: fcvt s2, h0
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s2, s1
-; CHECKNOFP16-NEXT: mov h2, v0.h[2]
-; CHECKNOFP16-NEXT: mov h0, v0.h[3]
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s0, h0
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s0, s1, s0
-; CHECKNOFP16-NEXT: fcvt h0, s0
-; CHECKNOFP16-NEXT: ret
+; CHECK-SD-FP16-LABEL: add_HalfH:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
+; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: add_HalfH:
+; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: add_HalfH:
+; CHECK-GI-FP16: // %bb.0:
+; CHECK-GI-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h
+; CHECK-GI-FP16-NEXT: adrp x8, .LCPI1_0
+; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
+; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
+; CHECK-GI-FP16-NEXT: fadd h0, h0, h1
+; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
ret half %r
}
define half @add_H(<8 x half> %bin.rdx) {
-; FULLFP16-LABEL: add_H:
-; FULLFP16: // %bb.0:
-; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h
-; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h
-; FULLFP16-NEXT: faddp h0, v0.2h
-; FULLFP16-NEXT: ret
+; CHECK-SD-NOFP16-LABEL: add_H:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
+;
+; CHECK-SD-FP16-LABEL: add_H:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
+; CHECK-SD-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: add_H:
+; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ret
;
-; CHECKNOFP16-LABEL: add_H:
-; CHECKNOFP16: // %bb.0:
-; CHECKNOFP16-NEXT: mov h1, v0.h[1]
-; CHECKNOFP16-NEXT: fcvt s2, h0
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s2, s1
-; CHECKNOFP16-NEXT: mov h2, v0.h[2]
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: mov h2, v0.h[3]
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: mov h2, v0.h[4]
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: mov h2, v0.h[5]
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: mov h2, v0.h[6]
-; CHECKNOFP16-NEXT: mov h0, v0.h[7]
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s0, h0
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s1, s1, s2
-; CHECKNOFP16-NEXT: fcvt h1, s1
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s0, s1, s0
-; CHECKNOFP16-NEXT: fcvt h0, s0
-; CHECKNOFP16-NEXT: ret
+; CHECK-GI-FP16-LABEL: add_H:
+; CHECK-GI-FP16: // %bb.0:
+; CHECK-GI-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: adrp x8, .LCPI2_0
+; CHECK-GI-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
+; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
+; CHECK-GI-FP16-NEXT: fadd h0, h0, h1
+; CHECK-GI-FP16-NEXT: ret
%r = call fast half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx)
ret half %r
}
define float @add_S(<4 x float> %bin.rdx) {
-; CHECK-LABEL: add_S:
-; CHECK: // %bb.0:
-; CHECK-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-NEXT: faddp s0, v0.2s
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: add_S:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-SD-NEXT: faddp s0, v0.2s
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: add_S:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NEXT: movi v1.2s, #128, lsl #24
+; CHECK-GI-NEXT: faddp s0, v0.2s
+; CHECK-GI-NEXT: fadd s0, s0, s1
+; CHECK-GI-NEXT: ret
%r = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx)
ret float %r
}
define double @add_D(<2 x double> %bin.rdx) {
-; CHECK-LABEL: add_D:
-; CHECK: // %bb.0:
-; CHECK-NEXT: faddp d0, v0.2d
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: add_D:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: faddp d0, v0.2d
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: add_D:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: faddp d0, v0.2d
+; CHECK-GI-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
+; CHECK-GI-NEXT: fmov d1, x8
+; CHECK-GI-NEXT: fadd d0, d0, d1
+; CHECK-GI-NEXT: ret
%r = call fast double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx)
ret double %r
}
define half @add_2H(<16 x half> %bin.rdx) {
-; FULLFP16-LABEL: add_2H:
-; FULLFP16: // %bb.0:
-; FULLFP16-NEXT: fadd v0.8h, v0.8h, v1.8h
-; FULLFP16-NEXT: faddp v1.8h, v0.8h, v0.8h
-; FULLFP16-NEXT: faddp v0.8h, v1.8h, v0.8h
-; FULLFP16-NEXT: faddp h0, v0.2h
-; FULLFP16-NEXT: ret
+; CHECK-SD-NOFP16-LABEL: add_2H:
+; CHECK-SD-NOFP16: // %bb.0:
+; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1]
+; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[2]
+; CHECK-SD-NOFP16-NEXT: fadd s2, s3, s2
+; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[2]
+; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s3
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[3]
+; CHECK-SD-NOFP16-NEXT: fadd s2, s4, s2
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[3]
+; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[4]
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: mov h3, v1.h[4]
+; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s3
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[5]
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s4
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[5]
+; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s4, s5, s4
+; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[6]
+; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
+; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[6]
+; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: fadd s3, s5, s4
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h2, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
+; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-SD-NOFP16-NEXT: fcvt h1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
+; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: ret
;
-; CHECKNOFP16-LABEL: add_2H:
-; CHECKNOFP16: // %bb.0:
-; CHECKNOFP16-NEXT: mov h2, v1.h[1]
-; CHECKNOFP16-NEXT: mov h3, v0.h[1]
-; CHECKNOFP16-NEXT: fcvt s4, h1
-; CHECKNOFP16-NEXT: fcvt s5, h0
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fadd s4, s5, s4
-; CHECKNOFP16-NEXT: mov h5, v0.h[2]
-; CHECKNOFP16-NEXT: fadd s2, s3, s2
-; CHECKNOFP16-NEXT: mov h3, v1.h[2]
-; CHECKNOFP16-NEXT: fcvt h4, s4
-; CHECKNOFP16-NEXT: fcvt s5, h5
-; CHECKNOFP16-NEXT: fcvt h2, s2
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fcvt s4, h4
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fadd s3, s5, s3
-; CHECKNOFP16-NEXT: mov h5, v0.h[3]
-; CHECKNOFP16-NEXT: fadd s2, s4, s2
-; CHECKNOFP16-NEXT: mov h4, v1.h[3]
-; CHECKNOFP16-NEXT: fcvt h3, s3
-; CHECKNOFP16-NEXT: fcvt s5, h5
-; CHECKNOFP16-NEXT: fcvt h2, s2
-; CHECKNOFP16-NEXT: fcvt s4, h4
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fadd s4, s5, s4
-; CHECKNOFP16-NEXT: mov h5, v0.h[4]
-; CHECKNOFP16-NEXT: fadd s2, s2, s3
-; CHECKNOFP16-NEXT: mov h3, v1.h[4]
-; CHECKNOFP16-NEXT: fcvt h4, s4
-; CHECKNOFP16-NEXT: fcvt s5, h5
-; CHECKNOFP16-NEXT: fcvt h2, s2
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fcvt s4, h4
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fadd s3, s5, s3
-; CHECKNOFP16-NEXT: mov h5, v0.h[5]
-; CHECKNOFP16-NEXT: fadd s2, s2, s4
-; CHECKNOFP16-NEXT: mov h4, v1.h[5]
-; CHECKNOFP16-NEXT: fcvt h3, s3
-; CHECKNOFP16-NEXT: fcvt s5, h5
-; CHECKNOFP16-NEXT: fcvt h2, s2
-; CHECKNOFP16-NEXT: fcvt s4, h4
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fadd s4, s5, s4
-; CHECKNOFP16-NEXT: mov h5, v0.h[6]
-; CHECKNOFP16-NEXT: mov h0, v0.h[7]
-; CHECKNOFP16-NEXT: fadd s2, s2, s3
-; CHECKNOFP16-NEXT: fcvt h3, s4
-; CHECKNOFP16-NEXT: mov h4, v1.h[6]
-; CHECKNOFP16-NEXT: fcvt s5, h5
-; CHECKNOFP16-NEXT: mov h1, v1.h[7]
-; CHECKNOFP16-NEXT: fcvt s0, h0
-; CHECKNOFP16-NEXT: fcvt h2, s2
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fcvt s4, h4
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fadd s0, s0, s1
-; CHECKNOFP16-NEXT: fadd s2, s2, s3
-; CHECKNOFP16-NEXT: fadd s3, s5, s4
-; CHECKNOFP16-NEXT: fcvt h0, s0
-; CHECKNOFP16-NEXT: fcvt h2, s2
-; CHECKNOFP16-NEXT: fcvt h3, s3
-; CHECKNOFP16-NEXT: fcvt s0, h0
-; CHECKNOFP16-NEXT: fcvt s2, h2
-; CHECKNOFP16-NEXT: fcvt s3, h3
-; CHECKNOFP16-NEXT: fadd s2, s2, s3
-; CHECKNOFP16-NEXT: fcvt h1, s2
-; CHECKNOFP16-NEXT: fcvt s1, h1
-; CHECKNOFP16-NEXT: fadd s0, s1, s0
-; CHECKNOFP16-NEXT: fcvt h0, s0
-; CHECKNOFP16-NEXT: ret
+; CHECK-SD-FP16-LABEL: add_2H:
+; CHECK-SD-FP16: // %bb.0:
+; CHECK-SD-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-SD-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
+; CHECK-SD-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
+; CHECK-SD-FP16-NEXT: faddp h0, v0.2h
+; CHECK-SD-FP16-NEXT: ret
+;
+; CHECK-GI-NOFP16-LABEL: add_2H:
+; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v3.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: ret
+;
+; CHECK-GI-FP16-LABEL: add_2H:
+; CHECK-GI-FP16: // %bb.0:
+; CHECK-GI-FP16-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-GI-FP16-NEXT: adrp x8, .LCPI5_0
+; CHECK-GI-FP16-NEXT: faddp v1.8h, v0.8h, v0.8h
+; CHECK-GI-FP16-NEXT: faddp v0.8h, v1.8h, v0.8h
+; CHECK-GI-FP16-NEXT: ldr h1, [x8, :lo12:.LCPI5_0]
+; CHECK-GI-FP16-NEXT: faddp h0, v0.2h
+; CHECK-GI-FP16-NEXT: fadd h0, h0, h1
+; CHE...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/73294
More information about the llvm-commits mailing list