[llvm] fix `llvm.fma.f16` double rounding issue when there is no native support (PR #171904)
Folkert de Vries via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 16 01:40:09 PST 2025
https://github.com/folkertdev updated https://github.com/llvm/llvm-project/pull/171904
From 6581c75d9cdf8627fbc627d90d37e5f50d34371d Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Thu, 11 Dec 2025 20:45:57 +0100
Subject: [PATCH 1/2] promote f16 fma to f64 if there is no instruction support
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 23 +-
llvm/test/CodeGen/ARM/fp16-promote.ll | 80 ++--
llvm/test/CodeGen/Generic/half-op.ll | 32 +-
llvm/test/CodeGen/RISCV/half-arith.ll | 366 ++++++++++++------
llvm/test/CodeGen/RISCV/half-intrinsics.ll | 44 ++-
5 files changed, 369 insertions(+), 176 deletions(-)
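
Not part of the patch itself, but a quick illustration of the issue being fixed: when an f16 fma is promoted only to f32, the exact result is rounded twice (once to f32, once back to f16), which can differ from a correctly rounded f16 fma. A standalone probe that compares the two promotion strategies on random inputs, assuming a compiler with `_Float16` support (e.g. a recent clang):

// fma-f16-probe.cpp -- counts f16 inputs where promoting fma to f32
// disagrees with promoting it to f64. Build: clang++ -O2 fma-f16-probe.cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <random>

static _Float16 BitsToHalf(uint16_t Bits) {
  _Float16 H;
  std::memcpy(&H, &Bits, sizeof(H));
  return H;
}

int main() {
  std::mt19937 Rng(0);
  std::uniform_int_distribution<uint32_t> Dist(0, 0xffff);
  long Mismatches = 0;
  for (long I = 0; I < 10000000; ++I) {
    _Float16 A = BitsToHalf(Dist(Rng));
    _Float16 B = BitsToHalf(Dist(Rng));
    _Float16 C = BitsToHalf(Dist(Rng));
    // Skip NaNs/infinities so payload differences don't count as mismatches.
    if (!std::isfinite((float)A) || !std::isfinite((float)B) ||
        !std::isfinite((float)C))
      continue;
    // Promotion to f32: fmaf rounds the exact result to f32, and the cast
    // rounds again to f16 -- two roundings of the same value.
    _Float16 ViaF32 = (_Float16)std::fmaf((float)A, (float)B, (float)C);
    // Promotion to f64 (what this patch emits): the f16 product is exact in
    // f64, so only the add and the final truncation to f16 round.
    _Float16 ViaF64 = (_Float16)((double)A * (double)B + (double)C);
    if (std::memcmp(&ViaF32, &ViaF64, sizeof(_Float16)) != 0)
      ++Mismatches;
  }
  std::printf("inputs where the two promotions disagree: %ld\n", Mismatches);
  return 0;
}

Any nonzero count is an input where the f32 path double-rounds; per this patch, the f64 path is the one that matches a correctly rounded `llvm.fma.f16`.
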
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 383a025a4d916..2406094ef0378 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -3507,6 +3507,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDNodeFlags Flags = N->getFlags();
SDLoc dl(N);
// Promote to the larger FP type.
@@ -3515,9 +3516,27 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);
- SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+ SDValue Res;
+ if (OVT == MVT::f16) {
+ // An f16 fma must go via f64 to prevent double rounding issues.
+ SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags);
+ SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags);
+ SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags);
+
+ // Prefer a wide FMA node if available; otherwise expand to mul+add.
+ SDValue WideRes;
+ if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f64)) {
+ WideRes = DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, Flags);
+ } else {
+ SDValue Mul = DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, Flags);
+ WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, Flags);
+ }
- // Convert back to FP16 as an integer.
+ return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16,
+ WideRes);
+ }
+
+ Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, Flags);
return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 8230e47259dd8..27a0bf2eb9037 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 {
; CHECK-FP16-NEXT: push {r4, lr}
; CHECK-FP16-NEXT: mov r4, r0
; CHECK-FP16-NEXT: ldrh r0, [r1]
-; CHECK-FP16-NEXT: ldrh r1, [r4]
-; CHECK-FP16-NEXT: ldrh r2, [r2]
-; CHECK-FP16-NEXT: vmov s2, r0
+; CHECK-FP16-NEXT: ldrh r1, [r2]
+; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: ldrh r0, [r4]
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d16, s0
+; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d17, s0
; CHECK-FP16-NEXT: vmov s0, r1
-; CHECK-FP16-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-FP16-NEXT: vmov s2, r2
; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FP16-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-FP16-NEXT: bl fmaf
-; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FP16-NEXT: vmov r0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d18, s0
+; CHECK-FP16-NEXT: vmla.f64 d18, d17, d16
+; CHECK-FP16-NEXT: vmov r0, r1, d18
+; CHECK-FP16-NEXT: bl __aeabi_d2h
; CHECK-FP16-NEXT: strh r0, [r4]
; CHECK-FP16-NEXT: pop {r4, pc}
;
; CHECK-LIBCALL-VFP-LABEL: test_fma:
; CHECK-LIBCALL-VFP: .save {r4, r5, r6, lr}
; CHECK-LIBCALL-VFP-NEXT: push {r4, r5, r6, lr}
+; CHECK-LIBCALL-VFP-NEXT: .vsave {d8, d9}
+; CHECK-LIBCALL-VFP-NEXT: vpush {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: mov r4, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r2]
-; CHECK-LIBCALL-VFP-NEXT: mov r5, r1
+; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r0]
+; CHECK-LIBCALL-VFP-NEXT: mov r5, r2
+; CHECK-LIBCALL-VFP-NEXT: mov r6, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT: mov r6, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r5]
+; CHECK-LIBCALL-VFP-NEXT: ldrh r1, [r6]
+; CHECK-LIBCALL-VFP-NEXT: vmov s16, r0
+; CHECK-LIBCALL-VFP-NEXT: ldrh r5, [r5]
+; CHECK-LIBCALL-VFP-NEXT: mov r0, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT: mov r5, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT: vmov s18, r0
+; CHECK-LIBCALL-VFP-NEXT: mov r0, r5
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
; CHECK-LIBCALL-VFP-NEXT: vmov s0, r0
-; CHECK-LIBCALL-VFP-NEXT: vmov s1, r5
-; CHECK-LIBCALL-VFP-NEXT: vmov s2, r6
-; CHECK-LIBCALL-VFP-NEXT: bl fmaf
-; CHECK-LIBCALL-VFP-NEXT: vmov r0, s0
-; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_f2h
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d16, s18
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d17, s16
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d18, s0
+; CHECK-LIBCALL-VFP-NEXT: vmla.f64 d18, d17, d16
+; CHECK-LIBCALL-VFP-NEXT: vmov r0, r1, d18
+; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_d2h
; CHECK-LIBCALL-VFP-NEXT: strh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT: vpop {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: pop {r4, r5, r6, pc}
;
; CHECK-NOVFP-LABEL: test_fma:
-; CHECK-NOVFP: .save {r4, r5, r6, lr}
-; CHECK-NOVFP-NEXT: push {r4, r5, r6, lr}
+; CHECK-NOVFP: .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NOVFP-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-NOVFP-NEXT: mov r4, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r1]
; CHECK-NOVFP-NEXT: mov r5, r2
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
; CHECK-NOVFP-NEXT: mov r6, r0
-; CHECK-NOVFP-NEXT: ldrh r0, [r5]
-; CHECK-NOVFP-NEXT: bl __aeabi_h2f
-; CHECK-NOVFP-NEXT: mov r5, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r4]
+; CHECK-NOVFP-NEXT: mov r7, r1
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
-; CHECK-NOVFP-NEXT: mov r1, r6
-; CHECK-NOVFP-NEXT: mov r2, r5
-; CHECK-NOVFP-NEXT: bl fmaf
-; CHECK-NOVFP-NEXT: bl __aeabi_f2h
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
+; CHECK-NOVFP-NEXT: mov r2, r6
+; CHECK-NOVFP-NEXT: mov r3, r7
+; CHECK-NOVFP-NEXT: bl __aeabi_dmul
+; CHECK-NOVFP-NEXT: mov r6, r0
+; CHECK-NOVFP-NEXT: ldrh r0, [r5]
+; CHECK-NOVFP-NEXT: mov r7, r1
+; CHECK-NOVFP-NEXT: bl __aeabi_h2f
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
+; CHECK-NOVFP-NEXT: mov r2, r0
+; CHECK-NOVFP-NEXT: mov r3, r1
+; CHECK-NOVFP-NEXT: mov r0, r6
+; CHECK-NOVFP-NEXT: mov r1, r7
+; CHECK-NOVFP-NEXT: bl __aeabi_dadd
+; CHECK-NOVFP-NEXT: bl __aeabi_d2h
; CHECK-NOVFP-NEXT: strh r0, [r4]
-; CHECK-NOVFP-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NOVFP-NEXT: pop {r4, r5, r6, r7, r11, pc}
%a = load half, ptr %p, align 2
%b = load half, ptr %q, align 2
%c = load half, ptr %r, align 2
diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll
index 1037d8e20cc10..30509efcba8bb 100644
--- a/llvm/test/CodeGen/Generic/half-op.ll
+++ b/llvm/test/CodeGen/Generic/half-op.ll
@@ -8,37 +8,37 @@
; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN %}
+; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; FIXME: BPF has a compiler error
; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; FIXME: directx has a compiler error
-; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %}
; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %}
; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index e1eb860d26591..311905be2ce25 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -1093,28 +1093,41 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1132,17 +1145,22 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1194,35 +1212,48 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s2, a0, -1
-; RV32I-NEXT: and a0, a2, s2
+; RV32I-NEXT: addi s3, a0, -1
+; RV32I-NEXT: and a0, a2, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
-; RV32I-NEXT: xor s3, a0, a1
-; RV32I-NEXT: and a0, s1, s2
+; RV32I-NEXT: xor s4, a0, a1
+; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s3, s2
+; RV32I-NEXT: mv s1, a1
+; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1247,17 +1278,22 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s1
-; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s0
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1329,8 +1365,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
+; RV32I-NEXT: lui s3, 16
+; RV32I-NEXT: addi s3, s3, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
@@ -1347,17 +1383,26 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s2, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, s0
+; RV32I-NEXT: mv a3, s1
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1378,8 +1423,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a2
; RV64I-NEXT: mv s1, a1
-; RV64I-NEXT: lui a1, 16
-; RV64I-NEXT: addi s3, a1, -1
+; RV64I-NEXT: lui s3, 16
+; RV64I-NEXT: addi s3, s3, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: li a1, 0
@@ -1396,17 +1441,21 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s4, a0, a1
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s2, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s4, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s1
-; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s0
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1491,8 +1540,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
+; RV32I-NEXT: lui s3, 16
+; RV32I-NEXT: addi s3, s3, -1
; RV32I-NEXT: and a0, a1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
@@ -1509,17 +1558,28 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s2, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1540,8 +1600,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a2
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: lui a0, 16
-; RV64I-NEXT: addi s3, a0, -1
+; RV64I-NEXT: lui s3, 16
+; RV64I-NEXT: addi s3, s3, -1
; RV64I-NEXT: and a0, a1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: li a1, 0
@@ -1558,17 +1618,22 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s4, a0, a1
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s2, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s0
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s4, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s0
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1659,23 +1724,35 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lui a1, 1048568
; RV32I-NEXT: xor a0, a0, a1
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@@ -1683,6 +1760,7 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1700,17 +1778,22 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: lui a1, 1048568
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -1779,23 +1862,35 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lui a1, 1048568
; RV32I-NEXT: xor a0, a0, a1
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@@ -1803,6 +1898,7 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1820,17 +1916,22 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: lui a1, 1048568
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -1892,34 +1993,46 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s2, a1, -1
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s3, a1, -1
+; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
-; RV32I-NEXT: xor s3, a0, a1
-; RV32I-NEXT: and a0, s1, s2
+; RV32I-NEXT: xor s4, a0, a1
+; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s3, s2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, s1
+; RV32I-NEXT: mv a3, s2
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: mv a2, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1944,16 +2057,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s2
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: mv a2, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: and a0, s0, s2
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -2020,35 +2138,48 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s2, a0, -1
-; RV32I-NEXT: and a0, a1, s2
+; RV32I-NEXT: addi s3, a0, -1
+; RV32I-NEXT: and a0, a1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
-; RV32I-NEXT: xor s3, a0, a1
-; RV32I-NEXT: and a0, s1, s2
+; RV32I-NEXT: xor s4, a0, a1
+; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s3, s2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a2, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -2073,17 +2204,22 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s2
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
-; RV64I-NEXT: mv a2, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: and a0, s0, s2
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index e712bd919b0b1..5ae127c1d00a3 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -1690,28 +1690,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1729,17 +1742,22 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
From 438f8baea81dd58a913d83451e1d7012f51dc9f9 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Mon, 15 Dec 2025 00:44:51 +0100
Subject: [PATCH 2/2] add `AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64);`
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 3 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 +
llvm/test/CodeGen/AArch64/f16-instructions.ll | 10 +-
llvm/test/CodeGen/AArch64/fmla.ll | 370 +++++++++---------
.../CodeGen/AArch64/fp-intrinsics-fp16.ll | 12 +-
...ve-streaming-mode-fixed-length-fp-arith.ll | 320 +++++++--------
llvm/test/CodeGen/Generic/half-op.ll | 6 +-
llvm/test/CodeGen/NVPTX/f16-instructions.ll | 27 +-
llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 54 +--
llvm/test/CodeGen/SystemZ/fp-mul-06.ll | 12 +-
llvm/test/CodeGen/SystemZ/fp-mul-08.ll | 12 +-
llvm/test/CodeGen/SystemZ/fp-mul-10.ll | 20 +-
llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll | 14 +-
.../test/CodeGen/X86/fp-strict-scalar-fp16.ll | 56 +--
llvm/test/CodeGen/X86/fp16-libcalls.ll | 36 +-
15 files changed, 494 insertions(+), 463 deletions(-)
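
For targets where f16 is a legal register type but there is no f16 fma instruction, the new default only records the destination type of the promotion; the target still has to mark the operation as Promote for it to take effect. A rough sketch of how an out-of-tree target might rely on the new default (all target and register-class names below are hypothetical):

// Hypothetical target lowering constructor (illustrative names only).
MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const MySubtarget &STI)
    : TargetLowering(TM) {
  addRegisterClass(MVT::f16, &MyTarget::FPR16RegClass);
  addRegisterClass(MVT::f32, &MyTarget::FPR32RegClass);
  addRegisterClass(MVT::f64, &MyTarget::FPR64RegClass);

  // No native f16 fma: promote it. With this patch, initActions() has
  // already recorded f64 (not f32) as the promoted type via
  // AddPromotedToType, so the widened operation avoids double rounding.
  // A target could still override that with
  // AddPromotedToType(ISD::FMA, MVT::f16, MVT::f32) if it accepts the
  // accuracy loss.
  setOperationAction(ISD::FMA, MVT::f16, Promote);

  computeRegisterProperties(STI.getRegisterInfo());
}
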
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2406094ef0378..72814a6890075 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -3518,7 +3518,8 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
SDValue Res;
if (OVT == MVT::f16) {
- // An f16 fma must go via f64 to prevent double rounding issues.
+ // If f16 fma is not natively supported, the value must be promoted to an
+ // f64 (and not to f32!) to prevent double rounding issues.
SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags);
SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags);
SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 1d674b283db15..d62bf8a4ad74d 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -842,6 +842,11 @@ void TargetLoweringBase::initActions() {
}
}
+ // If f16 fma is not natively supported, the value must be promoted to an f64
+ // (and not to f32!) to prevent double rounding issues.
+ AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64);
+ AddPromotedToType(ISD::STRICT_FMA, MVT::f16, MVT::f64);
+
// Set default actions for various operations.
for (MVT VT : MVT::all_valuetypes()) {
// Default all indexed load / store to expand.
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 085170c7ba381..f6d701b518699 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1378,11 +1378,11 @@ define half @test_log2(half %a) #0 {
define half @test_fma(half %a, half %b, half %c) #0 {
; CHECK-CVT-SD-LABEL: test_fma:
; CHECK-CVT-SD: // %bb.0:
-; CHECK-CVT-SD-NEXT: fcvt s2, h2
-; CHECK-CVT-SD-NEXT: fcvt s1, h1
-; CHECK-CVT-SD-NEXT: fcvt s0, h0
-; CHECK-CVT-SD-NEXT: fmadd s0, s0, s1, s2
-; CHECK-CVT-SD-NEXT: fcvt h0, s0
+; CHECK-CVT-SD-NEXT: fcvt d2, h2
+; CHECK-CVT-SD-NEXT: fcvt d1, h1
+; CHECK-CVT-SD-NEXT: fcvt d0, h0
+; CHECK-CVT-SD-NEXT: fmadd d0, d0, d1, d2
+; CHECK-CVT-SD-NEXT: fcvt h0, d0
; CHECK-CVT-SD-NEXT: ret
;
; CHECK-FP16-LABEL: test_fma:
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 12b6562b5cf0c..16c835b59bd3f 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -27,11 +27,11 @@ entry:
define half @fma_f16(half %a, half %b, half %c) {
; CHECK-SD-NOFP16-LABEL: fma_f16:
; CHECK-SD-NOFP16: // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2
+; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fma_f16:
@@ -178,69 +178,69 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h1
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h0
+; CHECK-SD-NOFP16-NEXT: fcvt d6, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h1
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h0
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h17
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d17, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[3]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: fmadd s4, s5, s4, s3
+; CHECK-SD-NOFP16-NEXT: fmadd d4, d5, d4, d3
; CHECK-SD-NOFP16-NEXT: mov h5, v2.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT: fmadd s6, s17, s16, s7
+; CHECK-SD-NOFP16-NEXT: fcvt h3, d6
+; CHECK-SD-NOFP16-NEXT: fmadd d6, d17, d16, d7
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt h6, d6
+; CHECK-SD-NOFP16-NEXT: fcvt d17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[5]
-; CHECK-SD-NOFP16-NEXT: fmadd s5, s16, s7, s5
+; CHECK-SD-NOFP16-NEXT: fmadd d5, d16, d7, d5
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d19, h19
; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0]
; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h7
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt h5, d5
+; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: fmadd s4, s16, s7, s4
+; CHECK-SD-NOFP16-NEXT: fmadd d4, d16, d7, d4
; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h19
-; CHECK-SD-NOFP16-NEXT: fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: fmadd s5, s7, s6, s5
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h6
+; CHECK-SD-NOFP16-NEXT: fcvt d6, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h19
+; CHECK-SD-NOFP16-NEXT: fcvt h16, d17
+; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT: fmadd d5, d7, d6, d5
; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
+; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2
; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d5
+; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b
@@ -301,34 +301,34 @@ define <4 x half> @fma_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h1
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h0
+; CHECK-SD-NOFP16-NEXT: fcvt d6, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h1
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h0
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[3]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h19
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fmadd s3, s5, s4, s3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h17
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h18
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s6
-; CHECK-SD-NOFP16-NEXT: fmadd s4, s7, s5, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h16
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT: fmadd d3, d5, d4, d3
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h17
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h18
+; CHECK-SD-NOFP16-NEXT: fcvt h0, d6
+; CHECK-SD-NOFP16-NEXT: fmadd d4, d7, d5, d4
+; CHECK-SD-NOFP16-NEXT: fcvt h3, d3
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h16
; CHECK-SD-NOFP16-NEXT: mov v0.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT: fmadd s1, s5, s1, s2
+; CHECK-SD-NOFP16-NEXT: fcvt h3, d4
+; CHECK-SD-NOFP16-NEXT: fmadd d1, d5, d1, d2
; CHECK-SD-NOFP16-NEXT: mov v0.h[2], v3.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt h1, d1
; CHECK-SD-NOFP16-NEXT: mov v0.h[3], v1.h[0]
; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NOFP16-NEXT: ret
@@ -364,69 +364,69 @@ define <8 x half> @fma_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h3, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h4, v1.h[1]
; CHECK-SD-NOFP16-NEXT: mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h1
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h0
+; CHECK-SD-NOFP16-NEXT: fcvt d6, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h1
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h0
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[2]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT: fmadd s6, s16, s7, s6
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT: fmadd d6, d16, d7, d6
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h17
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d17, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[3]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: fmadd s4, s5, s4, s3
+; CHECK-SD-NOFP16-NEXT: fmadd d4, d5, d4, d3
; CHECK-SD-NOFP16-NEXT: mov h5, v2.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT: fmadd s6, s17, s16, s7
+; CHECK-SD-NOFP16-NEXT: fcvt h3, d6
+; CHECK-SD-NOFP16-NEXT: fmadd d6, d17, d16, d7
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h19
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt h6, d6
+; CHECK-SD-NOFP16-NEXT: fcvt d17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: mov v3.h[1], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov h4, v2.h[5]
-; CHECK-SD-NOFP16-NEXT: fmadd s5, s16, s7, s5
+; CHECK-SD-NOFP16-NEXT: fmadd d5, d16, d7, d5
; CHECK-SD-NOFP16-NEXT: mov h7, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT: fcvt d19, h19
; CHECK-SD-NOFP16-NEXT: mov v3.h[2], v6.h[0]
; CHECK-SD-NOFP16-NEXT: mov h6, v2.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT: fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h7
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt h5, d5
+; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17
; CHECK-SD-NOFP16-NEXT: mov h18, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h19, v0.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: fmadd s4, s16, s7, s4
+; CHECK-SD-NOFP16-NEXT: fmadd d4, d16, d7, d4
; CHECK-SD-NOFP16-NEXT: mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h19
-; CHECK-SD-NOFP16-NEXT: fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT: fmadd s5, s7, s6, s5
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h6
+; CHECK-SD-NOFP16-NEXT: fcvt d6, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h19
+; CHECK-SD-NOFP16-NEXT: fcvt h16, d17
+; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT: fmadd d5, d7, d6, d5
; CHECK-SD-NOFP16-NEXT: mov v3.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s1, s2
+; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d1, d2
; CHECK-SD-NOFP16-NEXT: mov v3.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d5
+; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
; CHECK-SD-NOFP16-NEXT: mov v3.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v3.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v0.16b, v3.16b
@@ -468,136 +468,136 @@ define <16 x half> @fma_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c) {
; CHECK-SD-NOFP16-NEXT: mov h6, v4.h[1]
; CHECK-SD-NOFP16-NEXT: mov h7, v2.h[1]
; CHECK-SD-NOFP16-NEXT: mov h16, v0.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h4
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h0
+; CHECK-SD-NOFP16-NEXT: fcvt d17, h4
+; CHECK-SD-NOFP16-NEXT: fcvt d18, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d19, h0
; CHECK-SD-NOFP16-NEXT: mov h20, v4.h[2]
; CHECK-SD-NOFP16-NEXT: mov h21, v2.h[2]
; CHECK-SD-NOFP16-NEXT: mov h22, v0.h[2]
; CHECK-SD-NOFP16-NEXT: mov h23, v4.h[3]
; CHECK-SD-NOFP16-NEXT: mov h24, v2.h[3]
; CHECK-SD-NOFP16-NEXT: mov h25, v0.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT: fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT: fmadd s17, s19, s18, s17
+; CHECK-SD-NOFP16-NEXT: fcvt d6, h6
+; CHECK-SD-NOFP16-NEXT: fcvt d7, h7
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT: fmadd d17, d19, d18, d17
; CHECK-SD-NOFP16-NEXT: mov h26, v1.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt s27, h5
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT: fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT: fcvt s21, h23
-; CHECK-SD-NOFP16-NEXT: fcvt s22, h24
-; CHECK-SD-NOFP16-NEXT: fcvt s23, h25
-; CHECK-SD-NOFP16-NEXT: fmadd s7, s16, s7, s6
+; CHECK-SD-NOFP16-NEXT: fcvt d27, h5
+; CHECK-SD-NOFP16-NEXT: fcvt d18, h20
+; CHECK-SD-NOFP16-NEXT: fcvt d19, h21
+; CHECK-SD-NOFP16-NEXT: fcvt d20, h22
+; CHECK-SD-NOFP16-NEXT: fcvt d21, h23
+; CHECK-SD-NOFP16-NEXT: fcvt d22, h24
+; CHECK-SD-NOFP16-NEXT: fcvt d23, h25
+; CHECK-SD-NOFP16-NEXT: fmadd d7, d16, d7, d6
; CHECK-SD-NOFP16-NEXT: mov h24, v5.h[1]
; CHECK-SD-NOFP16-NEXT: mov h25, v3.h[1]
-; CHECK-SD-NOFP16-NEXT: fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT: fcvt s28, h3
-; CHECK-SD-NOFP16-NEXT: fcvt s29, h1
-; CHECK-SD-NOFP16-NEXT: fmadd s19, s20, s19, s18
-; CHECK-SD-NOFP16-NEXT: fcvt s26, h26
+; CHECK-SD-NOFP16-NEXT: fcvt h6, d17
+; CHECK-SD-NOFP16-NEXT: fcvt d28, h3
+; CHECK-SD-NOFP16-NEXT: fcvt d29, h1
+; CHECK-SD-NOFP16-NEXT: fmadd d19, d20, d19, d18
+; CHECK-SD-NOFP16-NEXT: fcvt d26, h26
; CHECK-SD-NOFP16-NEXT: mov h16, v4.h[4]
-; CHECK-SD-NOFP16-NEXT: fmadd s21, s23, s22, s21
+; CHECK-SD-NOFP16-NEXT: fmadd d21, d23, d22, d21
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[2]
; CHECK-SD-NOFP16-NEXT: mov h23, v1.h[2]
-; CHECK-SD-NOFP16-NEXT: fcvt h20, s7
-; CHECK-SD-NOFP16-NEXT: fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT: fcvt s25, h25
+; CHECK-SD-NOFP16-NEXT: fcvt h20, d7
+; CHECK-SD-NOFP16-NEXT: fcvt d24, h24
+; CHECK-SD-NOFP16-NEXT: fcvt d25, h25
; CHECK-SD-NOFP16-NEXT: mov h17, v2.h[4]
; CHECK-SD-NOFP16-NEXT: mov h18, v0.h[4]
; CHECK-SD-NOFP16-NEXT: mov h7, v4.h[5]
-; CHECK-SD-NOFP16-NEXT: fcvt h19, s19
+; CHECK-SD-NOFP16-NEXT: fcvt h19, d19
; CHECK-SD-NOFP16-NEXT: mov h30, v2.h[5]
-; CHECK-SD-NOFP16-NEXT: fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT: fcvt h21, s21
+; CHECK-SD-NOFP16-NEXT: fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT: fcvt h21, d21
; CHECK-SD-NOFP16-NEXT: mov h31, v1.h[4]
-; CHECK-SD-NOFP16-NEXT: fmadd s24, s26, s25, s24
-; CHECK-SD-NOFP16-NEXT: fmadd s25, s29, s28, s27
+; CHECK-SD-NOFP16-NEXT: fmadd d24, d26, d25, d24
+; CHECK-SD-NOFP16-NEXT: fmadd d25, d29, d28, d27
; CHECK-SD-NOFP16-NEXT: mov v6.h[1], v20.h[0]
; CHECK-SD-NOFP16-NEXT: mov h20, v5.h[2]
; CHECK-SD-NOFP16-NEXT: mov h26, v5.h[3]
; CHECK-SD-NOFP16-NEXT: mov h27, v3.h[3]
; CHECK-SD-NOFP16-NEXT: mov h28, v1.h[3]
-; CHECK-SD-NOFP16-NEXT: fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT: fcvt s29, h7
-; CHECK-SD-NOFP16-NEXT: fcvt s30, h30
+; CHECK-SD-NOFP16-NEXT: fcvt d17, h17
+; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt d29, h7
+; CHECK-SD-NOFP16-NEXT: fcvt d30, h30
; CHECK-SD-NOFP16-NEXT: mov v6.h[2], v19.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h24, s24
-; CHECK-SD-NOFP16-NEXT: fcvt h7, s25
-; CHECK-SD-NOFP16-NEXT: fcvt s19, h20
-; CHECK-SD-NOFP16-NEXT: fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT: fcvt s22, h23
-; CHECK-SD-NOFP16-NEXT: fmadd s16, s18, s17, s16
+; CHECK-SD-NOFP16-NEXT: fcvt h24, d24
+; CHECK-SD-NOFP16-NEXT: fcvt h7, d25
+; CHECK-SD-NOFP16-NEXT: fcvt d19, h20
+; CHECK-SD-NOFP16-NEXT: fcvt d20, h22
+; CHECK-SD-NOFP16-NEXT: fcvt d22, h23
+; CHECK-SD-NOFP16-NEXT: fmadd d16, d18, d17, d16
; CHECK-SD-NOFP16-NEXT: mov h23, v0.h[5]
-; CHECK-SD-NOFP16-NEXT: fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT: fcvt s26, h27
-; CHECK-SD-NOFP16-NEXT: fcvt s27, h28
+; CHECK-SD-NOFP16-NEXT: fcvt d25, h26
+; CHECK-SD-NOFP16-NEXT: fcvt d26, h27
+; CHECK-SD-NOFP16-NEXT: fcvt d27, h28
; CHECK-SD-NOFP16-NEXT: mov h18, v4.h[6]
; CHECK-SD-NOFP16-NEXT: mov v6.h[3], v21.h[0]
; CHECK-SD-NOFP16-NEXT: mov v7.h[1], v24.h[0]
; CHECK-SD-NOFP16-NEXT: mov h24, v5.h[5]
-; CHECK-SD-NOFP16-NEXT: fmadd s19, s22, s20, s19
+; CHECK-SD-NOFP16-NEXT: fmadd d19, d22, d20, d19
; CHECK-SD-NOFP16-NEXT: mov h20, v5.h[4]
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[4]
-; CHECK-SD-NOFP16-NEXT: fcvt s23, h23
+; CHECK-SD-NOFP16-NEXT: fcvt d23, h23
; CHECK-SD-NOFP16-NEXT: mov h28, v0.h[6]
-; CHECK-SD-NOFP16-NEXT: fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT: fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT: fcvt h16, d16
+; CHECK-SD-NOFP16-NEXT: fcvt d18, h18
; CHECK-SD-NOFP16-NEXT: mov h4, v4.h[7]
; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT: fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT: fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT: fcvt s22, h31
-; CHECK-SD-NOFP16-NEXT: fmadd s17, s23, s30, s29
-; CHECK-SD-NOFP16-NEXT: fmadd s23, s27, s26, s25
-; CHECK-SD-NOFP16-NEXT: fcvt h19, s19
+; CHECK-SD-NOFP16-NEXT: fcvt d20, h20
+; CHECK-SD-NOFP16-NEXT: fcvt d21, h22
+; CHECK-SD-NOFP16-NEXT: fcvt d22, h31
+; CHECK-SD-NOFP16-NEXT: fmadd d17, d23, d30, d29
+; CHECK-SD-NOFP16-NEXT: fmadd d23, d27, d26, d25
+; CHECK-SD-NOFP16-NEXT: fcvt h19, d19
; CHECK-SD-NOFP16-NEXT: mov h25, v3.h[5]
; CHECK-SD-NOFP16-NEXT: mov h26, v1.h[5]
; CHECK-SD-NOFP16-NEXT: mov h27, v2.h[6]
; CHECK-SD-NOFP16-NEXT: mov h29, v1.h[6]
; CHECK-SD-NOFP16-NEXT: mov h2, v2.h[7]
; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT: fmadd s20, s22, s21, s20
+; CHECK-SD-NOFP16-NEXT: fmadd d20, d22, d21, d20
; CHECK-SD-NOFP16-NEXT: mov h21, v5.h[6]
; CHECK-SD-NOFP16-NEXT: mov h22, v3.h[6]
; CHECK-SD-NOFP16-NEXT: mov v7.h[2], v19.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h19, s23
-; CHECK-SD-NOFP16-NEXT: fcvt s23, h24
-; CHECK-SD-NOFP16-NEXT: fcvt s24, h25
-; CHECK-SD-NOFP16-NEXT: fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT: fcvt s26, h27
-; CHECK-SD-NOFP16-NEXT: fcvt s27, h28
-; CHECK-SD-NOFP16-NEXT: fcvt s28, h29
+; CHECK-SD-NOFP16-NEXT: fcvt h19, d23
+; CHECK-SD-NOFP16-NEXT: fcvt d23, h24
+; CHECK-SD-NOFP16-NEXT: fcvt d24, h25
+; CHECK-SD-NOFP16-NEXT: fcvt d25, h26
+; CHECK-SD-NOFP16-NEXT: fcvt d26, h27
+; CHECK-SD-NOFP16-NEXT: fcvt d27, h28
+; CHECK-SD-NOFP16-NEXT: fcvt d28, h29
; CHECK-SD-NOFP16-NEXT: mov h5, v5.h[7]
-; CHECK-SD-NOFP16-NEXT: fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT: fcvt s22, h22
+; CHECK-SD-NOFP16-NEXT: fcvt d21, h21
+; CHECK-SD-NOFP16-NEXT: fcvt d22, h22
; CHECK-SD-NOFP16-NEXT: mov h3, v3.h[7]
; CHECK-SD-NOFP16-NEXT: mov v7.h[3], v19.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h19, s20
+; CHECK-SD-NOFP16-NEXT: fcvt h19, d20
; CHECK-SD-NOFP16-NEXT: mov v6.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT: fmadd s20, s25, s24, s23
-; CHECK-SD-NOFP16-NEXT: fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT: fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT: fmadd s18, s27, s26, s18
-; CHECK-SD-NOFP16-NEXT: fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT: fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT: fmadd s21, s28, s22, s21
-; CHECK-SD-NOFP16-NEXT: fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT: fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT: fmadd d20, d25, d24, d23
+; CHECK-SD-NOFP16-NEXT: fcvt h16, d17
+; CHECK-SD-NOFP16-NEXT: fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT: fmadd d18, d27, d26, d18
+; CHECK-SD-NOFP16-NEXT: fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT: fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT: fmadd d21, d28, d22, d21
+; CHECK-SD-NOFP16-NEXT: fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT: fcvt d3, h3
; CHECK-SD-NOFP16-NEXT: mov v7.h[4], v19.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT: fcvt h17, s20
+; CHECK-SD-NOFP16-NEXT: fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT: fcvt h17, d20
; CHECK-SD-NOFP16-NEXT: mov v6.h[5], v16.h[0]
-; CHECK-SD-NOFP16-NEXT: fmadd s0, s0, s2, s4
-; CHECK-SD-NOFP16-NEXT: fcvt h2, s18
-; CHECK-SD-NOFP16-NEXT: fcvt h4, s21
-; CHECK-SD-NOFP16-NEXT: fmadd s1, s1, s3, s5
+; CHECK-SD-NOFP16-NEXT: fmadd d0, d0, d2, d4
+; CHECK-SD-NOFP16-NEXT: fcvt h2, d18
+; CHECK-SD-NOFP16-NEXT: fcvt h4, d21
+; CHECK-SD-NOFP16-NEXT: fmadd d1, d1, d3, d5
; CHECK-SD-NOFP16-NEXT: mov v7.h[5], v17.h[0]
; CHECK-SD-NOFP16-NEXT: mov v6.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT: fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT: fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT: fcvt h0, d0
+; CHECK-SD-NOFP16-NEXT: fcvt h1, d1
; CHECK-SD-NOFP16-NEXT: mov v7.h[6], v4.h[0]
; CHECK-SD-NOFP16-NEXT: mov v6.h[7], v0.h[0]
; CHECK-SD-NOFP16-NEXT: mov v7.h[7], v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
index 86029a7169abb..368fa0a0cfae9 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
@@ -170,11 +170,11 @@ define half @frem_f16(half %x, half %y) #0 {
define half @fma_f16(half %x, half %y, half %z) #0 {
; CHECK-NOFP16-LABEL: fma_f16:
; CHECK-NOFP16: // %bb.0:
-; CHECK-NOFP16-NEXT: fcvt s2, h2
-; CHECK-NOFP16-NEXT: fcvt s1, h1
-; CHECK-NOFP16-NEXT: fcvt s0, h0
-; CHECK-NOFP16-NEXT: fmadd s0, s0, s1, s2
-; CHECK-NOFP16-NEXT: fcvt h0, s0
+; CHECK-NOFP16-NEXT: fcvt d2, h2
+; CHECK-NOFP16-NEXT: fcvt d1, h1
+; CHECK-NOFP16-NEXT: fcvt d0, h0
+; CHECK-NOFP16-NEXT: fmadd d0, d0, d1, d2
+; CHECK-NOFP16-NEXT: fcvt h0, d0
; CHECK-NOFP16-NEXT: ret
;
; CHECK-FP16-LABEL: fma_f16:
@@ -1382,3 +1382,5 @@ declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadat
declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index 20c06f0a1aff5..2f708cbda1f2b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1043,38 +1043,38 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #24]
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
@@ -1103,38 +1103,38 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
; NONEON-NOSVE-NEXT: ldr h0, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #14]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #12]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #10]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #8]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #24]
; NONEON-NOSVE-NEXT: ldr d0, [sp, #24]
; NONEON-NOSVE-NEXT: add sp, sp, #32
@@ -1163,74 +1163,74 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
; NONEON-NOSVE-NEXT: ldr h0, [sp, #46]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #14]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #12]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #62]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #44]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #10]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #60]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #42]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #24]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #8]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #58]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #40]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #56]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #38]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #20]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #54]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #36]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #18]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #52]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #34]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #16]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #50]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #32]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #48]
; NONEON-NOSVE-NEXT: ldr q0, [sp, #48]
; NONEON-NOSVE-NEXT: add sp, sp, #64
@@ -1264,146 +1264,146 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32]
; NONEON-NOSVE-NEXT: ldr h1, [sp, #78]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #62]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #76]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #60]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #126]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #92]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #74]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #58]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #124]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #90]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #72]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #56]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #122]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #88]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #70]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #54]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #120]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #86]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #68]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #52]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #118]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #84]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #66]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #50]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #116]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #82]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #64]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #48]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #114]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #80]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #30]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #14]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #112]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #46]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #28]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #12]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #110]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #44]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #26]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #10]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #108]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #42]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #24]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #8]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #106]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #40]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #22]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #104]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #38]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #20]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #102]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #36]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #18]
; NONEON-NOSVE-NEXT: ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #100]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #34]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
; NONEON-NOSVE-NEXT: ldr h1, [sp, #16]
; NONEON-NOSVE-NEXT: ldr h2, [sp]
-; NONEON-NOSVE-NEXT: fcvt s1, h1
-; NONEON-NOSVE-NEXT: fcvt s2, h2
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d1, h1
+; NONEON-NOSVE-NEXT: fcvt d2, h2
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #98]
; NONEON-NOSVE-NEXT: ldr h0, [sp, #32]
-; NONEON-NOSVE-NEXT: fcvt s0, h0
-; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT: fcvt h0, s0
+; NONEON-NOSVE-NEXT: fcvt d0, h0
+; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT: fcvt h0, d0
; NONEON-NOSVE-NEXT: str h0, [sp, #96]
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96]
; NONEON-NOSVE-NEXT: stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll
index 30509efcba8bb..f8ad39f9456aa 100644
--- a/llvm/test/CodeGen/Generic/half-op.ll
+++ b/llvm/test/CodeGen/Generic/half-op.ll
@@ -35,12 +35,12 @@
; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %}
-; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
-; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %}
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 4e2f7ea9e5208..53288b35d55a4 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; ## Full FP16 support enabled by default.
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
@@ -953,11 +954,11 @@ define half @test_cos(half %a) #0 {
; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_fma_param_2];
; CHECK-F16-NOFTZ: fma.rn.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
; CHECK-F16-FTZ: fma.rn.ftz.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%r[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%r[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%r[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
+; CHECK-NOF16-DAG: cvt.f64.f16 [[A64:%rd[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f64.f16 [[B64:%rd[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f64.f16 [[C64:%rd[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f64 [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]]
; CHECK: st.param.b16 [func_retval0], [[R]];
; CHECK: ret
define half @test_fma(half %a, half %b, half %c) #0 {
@@ -1151,11 +1152,11 @@ define half @test_round(half %a) #0 {
; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_fmuladd_param_2];
; CHECK-F16-NOFTZ: fma.rn.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
; CHECK-F16-FTZ: fma.rn.ftz.f16 [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%r[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%r[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%r[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
+; CHECK-NOF16-DAG: cvt.f64.f16 [[A64:%rd[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f64.f16 [[B64:%rd[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f64.f16 [[C64:%rd[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f64 [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]]
; CHECK: st.param.b16 [func_retval0], [[R]];
; CHECK: ret;
define half @test_fmuladd(half %a, half %b, half %c) #0 {
@@ -1183,3 +1184,9 @@ define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 {
}
attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; CHECK-F16-FTZ: {{.*}}
+; CHECK-F16-NOFTZ: {{.*}}
+; CHECK-NOF16: {{.*}}
+; CHECK-NOFTZ: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index e9143d540b047..3ebaf68d4a15f 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1766,27 +1766,28 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
; CHECK-NOF16-LABEL: test_fma(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT: .reg .b64 %rd<9>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2];
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0];
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd1, %rs2;
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd2, %rs4;
; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6;
-; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5;
-; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11;
-; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd3, %rs6;
+; CHECK-NOF16-NEXT: fma.rn.f64 %rd4, %rd3, %rd2, %rd1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs7, %rd4;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd5, %rs1;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd6, %rs3;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd7, %rs5;
+; CHECK-NOF16-NEXT: fma.rn.f64 %rd8, %rd7, %rd6, %rd5;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs8, %rd8;
+; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NOF16-NEXT: ret;
%r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
ret <2 x half> %r
@@ -2203,27 +2204,28 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
; CHECK-NOF16-LABEL: test_fmuladd(
; CHECK-NOF16: {
; CHECK-NOF16-NEXT: .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT: .reg .b32 %r<13>;
+; CHECK-NOF16-NEXT: .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT: .reg .b64 %rd<9>;
; CHECK-NOF16-EMPTY:
; CHECK-NOF16-NEXT: // %bb.0:
; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2];
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1];
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0];
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd1, %rs2;
; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd2, %rs4;
; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6;
-; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3;
-; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5;
-; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8;
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11;
-; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7};
-; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r12;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd3, %rs6;
+; CHECK-NOF16-NEXT: fma.rn.f64 %rd4, %rd3, %rd2, %rd1;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs7, %rd4;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd5, %rs1;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd6, %rs3;
+; CHECK-NOF16-NEXT: cvt.f64.f16 %rd7, %rs5;
+; CHECK-NOF16-NEXT: fma.rn.f64 %rd8, %rd7, %rd6, %rd5;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 %rs8, %rd8;
+; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NOF16-NEXT: ret;
%r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
ret <2 x half> %r
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
index 6b285a49057dc..3b0c63749d378 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
@@ -8,12 +8,12 @@ declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
define half @f0(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f0:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f9, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f9, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc)
ret half %res
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
index e739bddd4f18f..542cae41d4745 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
@@ -10,12 +10,12 @@ define half @f0(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f0:
; CHECK-NOT: brasl
; CHECK: lcdfr %f{{[0-9]+}}, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f8, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f8, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%negacc = fneg half %acc
%res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc)
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
index 8f2cd23112cd0..0badf2993cca7 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
@@ -25,11 +25,11 @@ define double @f2(double %f1, double %f2, double %acc) {
define half @f3_half(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f3_half:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f0
; CHECK-NEXT: lmg
@@ -52,11 +52,11 @@ define half @f4_half(half %f1, half %f2, half %acc) {
; CHECK-LABEL: f4_half:
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK-NOT: brasl
; CHECK: lcdfr %f0, %f0
; CHECK-NEXT: lmg
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll
index c951c79aeb7c6..05ce53c98db13 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll
@@ -8,13 +8,13 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metada
define half @f0(half %f1, half %f2, half %acc) #0 {
; CHECK-LABEL: f0:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f10, %f0, %f8
-; CHECK-SCALAR: ler %f0, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f10, %f0, %f8
+; CHECK-SCALAR: ldr %f0, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
; CHECK: br %r14
%res = call half @llvm.experimental.constrained.fma.f16 (
half %f1, half %f2, half %acc,
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index b013ddad19a95..61a0c4eda8c72 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -432,8 +432,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2: # %bb.0:
; SSE2-NEXT: subq $24, %rsp
; SSE2-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: callq __extendhfsf2@PLT
; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -443,12 +442,17 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq __extendhfsf2@PLT
-; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: callq fmaf@PLT
-; SSE2-NEXT: callq __truncsfhf2@PLT
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: cvtss2sd %xmm0, %xmm2
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtss2sd %xmm0, %xmm1
+; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: cvtss2sd %xmm0, %xmm0
+; SSE2-NEXT: callq fma@PLT
+; SSE2-NEXT: callq __truncdfhf2@PLT
; SSE2-NEXT: addq $24, %rsp
; SSE2-NEXT: retq
;
@@ -460,38 +464,42 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
; F16C-NEXT: vpextrw $0, %xmm2, %edx
; F16C-NEXT: movzwl %dx, %edx
; F16C-NEXT: vmovd %edx, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm2
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm2
; F16C-NEXT: movzwl %cx, %ecx
; F16C-NEXT: vmovd %ecx, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm1
+; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm1
; F16C-NEXT: movzwl %ax, %eax
; F16C-NEXT: vmovd %eax, %xmm0
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: callq fmaf@PLT
-; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; F16C-NEXT: callq fma@PLT
+; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: popq %rax
; F16C-NEXT: retq
;
; AVX512-LABEL: fma_f16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpextrw $0, %xmm1, %eax
-; AVX512-NEXT: vpextrw $0, %xmm0, %ecx
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512-NEXT: vpextrw $0, %xmm1, %ecx
; AVX512-NEXT: vpextrw $0, %xmm2, %edx
; AVX512-NEXT: movzwl %dx, %edx
; AVX512-NEXT: vmovd %edx, %xmm0
; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm1
; AVX512-NEXT: movzwl %cx, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm2
; AVX512-NEXT: movzwl %ax, %eax
-; AVX512-NEXT: vmovd %eax, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
+; AVX512-NEXT: callq __truncdfhf2@PLT
+; AVX512-NEXT: popq %rax
; AVX512-NEXT: retq
;
; X86-LABEL: fma_f16:
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 9b5c45f44acd0..6abdf9a5ba652 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -421,10 +421,13 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; F16C-NEXT: pushq %rbx
; F16C-NEXT: movq %rdi, %rbx
; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; F16C-NEXT: callq fmaf@PLT
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; F16C-NEXT: callq fma@PLT
+; F16C-NEXT: callq __truncdfhf2@PLT
; F16C-NEXT: vpextrw $0, %xmm0, (%rbx)
; F16C-NEXT: popq %rbx
; F16C-NEXT: retq
@@ -440,24 +443,27 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: subq $16, %rsp
; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; X64-NEXT: # xmm0 = mem[0],zero,zero,zero
; X64-NEXT: callq __extendhfsf2@PLT
+; X64-NEXT: cvtss2sd %xmm0, %xmm0
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; X64-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: cvtss2sd %xmm1, %xmm1
; X64-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
; X64-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: callq fmaf@PLT
-; X64-NEXT: callq __truncsfhf2@PLT
+; X64-NEXT: cvtss2sd %xmm2, %xmm2
+; X64-NEXT: callq fma@PLT
+; X64-NEXT: callq __truncdfhf2@PLT
; X64-NEXT: pextrw $0, %xmm0, %eax
; X64-NEXT: movw %ax, (%rbx)
; X64-NEXT: addq $16, %rsp
@@ -467,7 +473,7 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; X86-LABEL: test_half_fma:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $72, %esp
+; X86-NEXT: subl $88, %esp
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
@@ -487,17 +493,17 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esp)
; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-NEXT: fstps (%esp)
-; X86-NEXT: calll fmaf
-; X86-NEXT: fstps (%esp)
-; X86-NEXT: calll __truncsfhf2
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll fma
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll __truncdfhf2
; X86-NEXT: pextrw $0, %xmm0, %eax
; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $72, %esp
+; X86-NEXT: addl $88, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
%res = call half @llvm.fma.half(half %a0, half %a1, half %a2)