[llvm] fix `llvm.fma.f16` double rounding issue when there is no native support (PR #171904)

Folkert de Vries via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 16 01:40:09 PST 2025


https://github.com/folkertdev updated https://github.com/llvm/llvm-project/pull/171904
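
The underlying problem: lowering `llvm.fma.f16` by promoting to f32 rounds twice (once in the f32 mul+add or fma, once in the f32->f16 truncation), which can disagree with a single correctly rounded f16 fma. Promoting to f64 instead avoids this, because the product of two 11-bit f16 significands is exact in f64, as the patch's comments note. The snippet below is a minimal sketch for comparing the two lowerings on the host; it is not part of the patch, it assumes a compiler with the `_Float16` extension, the `from_bits` helper and the fixed `b`/`c` values are arbitrary choices for illustration, and it should be built with -ffp-contract=off so the f32 mul+add is not fused:

#include <cstdint>
#include <cstdio>
#include <cstring>

static _Float16 from_bits(uint16_t bits) {
  _Float16 h;
  std::memcpy(&h, &bits, sizeof(h));
  return h;
}

int main() {
  // b and c are arbitrary fixed values; sweeping all three operands would be
  // 2^48 combinations, so this only illustrates the comparison.
  const _Float16 b = from_bits(0x3C01); // 1 + 2^-10
  const _Float16 c = from_bits(0x8400); // -2^-14
  long mismatches = 0;
  for (uint32_t i = 0; i < 0x10000; ++i) {
    _Float16 a = from_bits(static_cast<uint16_t>(i));
    // f32 lowering: the mul+add rounds once in f32, and the truncation back
    // to f16 rounds again -- this is where double rounding can show up.
    _Float16 via_f32 =
        static_cast<_Float16>((float)a * (float)b + (float)c);
    // f64 lowering: the product of two 11-bit significands is exact in f64,
    // so only the add and the final truncation round.
    _Float16 via_f64 =
        static_cast<_Float16>((double)a * (double)b + (double)c);
    if (std::memcmp(&via_f32, &via_f64, sizeof(_Float16)) != 0)
      ++mismatches;
  }
  std::printf("mismatching lowerings: %ld\n", mismatches);
  return 0;
}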

From 6581c75d9cdf8627fbc627d90d37e5f50d34371d Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Thu, 11 Dec 2025 20:45:57 +0100
Subject: [PATCH 1/2] promote f16 fma to f64 if there is no instruction support

---
 .../SelectionDAG/LegalizeFloatTypes.cpp       |  23 +-
 llvm/test/CodeGen/ARM/fp16-promote.ll         |  80 ++--
 llvm/test/CodeGen/Generic/half-op.ll          |  32 +-
 llvm/test/CodeGen/RISCV/half-arith.ll         | 366 ++++++++++++------
 llvm/test/CodeGen/RISCV/half-intrinsics.ll    |  44 ++-
 5 files changed, 369 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 383a025a4d916..2406094ef0378 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -3507,6 +3507,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
   SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
   SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
   SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+  SDNodeFlags Flags = N->getFlags();
   SDLoc dl(N);
 
   // Promote to the larger FP type.
@@ -3515,9 +3516,27 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
   Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
   Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);
 
-  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+  SDValue Res;
+  if (OVT == MVT::f16) {
+    // An f16 fma must go via f64 to prevent double rounding issues.
+    SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags);
+    SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags);
+    SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags);
+
+    // Prefer a wide FMA node if available; otherwise expand to mul+add.
+    SDValue WideRes;
+    if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f64)) {
+      WideRes = DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, Flags);
+    } else {
+      SDValue Mul = DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, Flags);
+      WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, Flags);
+    }
 
-  // Convert back to FP16 as an integer.
+    return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16,
+                       WideRes);
+  }
+
+  Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, Flags);
   return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
 }
 
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 8230e47259dd8..27a0bf2eb9037 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 {
 ; CHECK-FP16-NEXT:    push {r4, lr}
 ; CHECK-FP16-NEXT:    mov r4, r0
 ; CHECK-FP16-NEXT:    ldrh r0, [r1]
-; CHECK-FP16-NEXT:    ldrh r1, [r4]
-; CHECK-FP16-NEXT:    ldrh r2, [r2]
-; CHECK-FP16-NEXT:    vmov s2, r0
+; CHECK-FP16-NEXT:    ldrh r1, [r2]
+; CHECK-FP16-NEXT:    vmov s0, r0
+; CHECK-FP16-NEXT:    ldrh r0, [r4]
+; CHECK-FP16-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT:    vcvt.f64.f32 d16, s0
+; CHECK-FP16-NEXT:    vmov s0, r0
+; CHECK-FP16-NEXT:    vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT:    vcvt.f64.f32 d17, s0
 ; CHECK-FP16-NEXT:    vmov s0, r1
-; CHECK-FP16-NEXT:    vcvtb.f32.f16 s1, s2
-; CHECK-FP16-NEXT:    vmov s2, r2
 ; CHECK-FP16-NEXT:    vcvtb.f32.f16 s0, s0
-; CHECK-FP16-NEXT:    vcvtb.f32.f16 s2, s2
-; CHECK-FP16-NEXT:    bl fmaf
-; CHECK-FP16-NEXT:    vcvtb.f16.f32 s0, s0
-; CHECK-FP16-NEXT:    vmov r0, s0
+; CHECK-FP16-NEXT:    vcvt.f64.f32 d18, s0
+; CHECK-FP16-NEXT:    vmla.f64 d18, d17, d16
+; CHECK-FP16-NEXT:    vmov r0, r1, d18
+; CHECK-FP16-NEXT:    bl __aeabi_d2h
 ; CHECK-FP16-NEXT:    strh r0, [r4]
 ; CHECK-FP16-NEXT:    pop {r4, pc}
 ;
 ; CHECK-LIBCALL-VFP-LABEL: test_fma:
 ; CHECK-LIBCALL-VFP:         .save {r4, r5, r6, lr}
 ; CHECK-LIBCALL-VFP-NEXT:    push {r4, r5, r6, lr}
+; CHECK-LIBCALL-VFP-NEXT:    .vsave {d8, d9}
+; CHECK-LIBCALL-VFP-NEXT:    vpush {d8, d9}
 ; CHECK-LIBCALL-VFP-NEXT:    mov r4, r0
-; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r2]
-; CHECK-LIBCALL-VFP-NEXT:    mov r5, r1
+; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r0]
+; CHECK-LIBCALL-VFP-NEXT:    mov r5, r2
+; CHECK-LIBCALL-VFP-NEXT:    mov r6, r1
 ; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT:    mov r6, r0
-; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r5]
+; CHECK-LIBCALL-VFP-NEXT:    ldrh r1, [r6]
+; CHECK-LIBCALL-VFP-NEXT:    vmov s16, r0
+; CHECK-LIBCALL-VFP-NEXT:    ldrh r5, [r5]
+; CHECK-LIBCALL-VFP-NEXT:    mov r0, r1
 ; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT:    mov r5, r0
-; CHECK-LIBCALL-VFP-NEXT:    ldrh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT:    vmov s18, r0
+; CHECK-LIBCALL-VFP-NEXT:    mov r0, r5
 ; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_h2f
 ; CHECK-LIBCALL-VFP-NEXT:    vmov s0, r0
-; CHECK-LIBCALL-VFP-NEXT:    vmov s1, r5
-; CHECK-LIBCALL-VFP-NEXT:    vmov s2, r6
-; CHECK-LIBCALL-VFP-NEXT:    bl fmaf
-; CHECK-LIBCALL-VFP-NEXT:    vmov r0, s0
-; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_f2h
+; CHECK-LIBCALL-VFP-NEXT:    vcvt.f64.f32 d16, s18
+; CHECK-LIBCALL-VFP-NEXT:    vcvt.f64.f32 d17, s16
+; CHECK-LIBCALL-VFP-NEXT:    vcvt.f64.f32 d18, s0
+; CHECK-LIBCALL-VFP-NEXT:    vmla.f64 d18, d17, d16
+; CHECK-LIBCALL-VFP-NEXT:    vmov r0, r1, d18
+; CHECK-LIBCALL-VFP-NEXT:    bl __aeabi_d2h
 ; CHECK-LIBCALL-VFP-NEXT:    strh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT:    vpop {d8, d9}
 ; CHECK-LIBCALL-VFP-NEXT:    pop {r4, r5, r6, pc}
 ;
 ; CHECK-NOVFP-LABEL: test_fma:
-; CHECK-NOVFP:         .save {r4, r5, r6, lr}
-; CHECK-NOVFP-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NOVFP:         .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NOVFP-NEXT:    push {r4, r5, r6, r7, r11, lr}
 ; CHECK-NOVFP-NEXT:    mov r4, r0
 ; CHECK-NOVFP-NEXT:    ldrh r0, [r1]
 ; CHECK-NOVFP-NEXT:    mov r5, r2
 ; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
+; CHECK-NOVFP-NEXT:    bl __aeabi_f2d
 ; CHECK-NOVFP-NEXT:    mov r6, r0
-; CHECK-NOVFP-NEXT:    ldrh r0, [r5]
-; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
-; CHECK-NOVFP-NEXT:    mov r5, r0
 ; CHECK-NOVFP-NEXT:    ldrh r0, [r4]
+; CHECK-NOVFP-NEXT:    mov r7, r1
 ; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
-; CHECK-NOVFP-NEXT:    mov r1, r6
-; CHECK-NOVFP-NEXT:    mov r2, r5
-; CHECK-NOVFP-NEXT:    bl fmaf
-; CHECK-NOVFP-NEXT:    bl __aeabi_f2h
+; CHECK-NOVFP-NEXT:    bl __aeabi_f2d
+; CHECK-NOVFP-NEXT:    mov r2, r6
+; CHECK-NOVFP-NEXT:    mov r3, r7
+; CHECK-NOVFP-NEXT:    bl __aeabi_dmul
+; CHECK-NOVFP-NEXT:    mov r6, r0
+; CHECK-NOVFP-NEXT:    ldrh r0, [r5]
+; CHECK-NOVFP-NEXT:    mov r7, r1
+; CHECK-NOVFP-NEXT:    bl __aeabi_h2f
+; CHECK-NOVFP-NEXT:    bl __aeabi_f2d
+; CHECK-NOVFP-NEXT:    mov r2, r0
+; CHECK-NOVFP-NEXT:    mov r3, r1
+; CHECK-NOVFP-NEXT:    mov r0, r6
+; CHECK-NOVFP-NEXT:    mov r1, r7
+; CHECK-NOVFP-NEXT:    bl __aeabi_dadd
+; CHECK-NOVFP-NEXT:    bl __aeabi_d2h
 ; CHECK-NOVFP-NEXT:    strh r0, [r4]
-; CHECK-NOVFP-NEXT:    pop {r4, r5, r6, pc}
+; CHECK-NOVFP-NEXT:    pop {r4, r5, r6, r7, r11, pc}
   %a = load half, ptr %p, align 2
   %b = load half, ptr %q, align 2
   %c = load half, ptr %r, align 2
diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll
index 1037d8e20cc10..30509efcba8bb 100644
--- a/llvm/test/CodeGen/Generic/half-op.ll
+++ b/llvm/test/CodeGen/Generic/half-op.ll
@@ -8,37 +8,37 @@
 ; RUN: %if aarch64-registered-target     %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if amdgpu-registered-target      %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa               | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if arc-registered-target         %{ llc %s -o - -mtriple=arc-elf                         | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if arm-registered-target         %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if arm-registered-target         %{ llc %s -o - -mtriple=thumbv7em-none-eabi             | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if avr-registered-target         %{ llc %s -o - -mtriple=avr-none                        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if arm-registered-target         %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN %}
+; RUN: %if arm-registered-target         %{ llc %s -o - -mtriple=thumbv7em-none-eabi             | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if avr-registered-target         %{ llc %s -o - -mtriple=avr-none                        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; FIXME: BPF has a compiler error
 ; RUN: %if csky-registered-target        %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2     | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if csky-registered-target        %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; FIXME: directx has a compiler error
-; RUN: %if hexagon-registered-target     %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl      | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if hexagon-registered-target     %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl      | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if lanai-registered-target       %{ llc %s -o - -mtriple=lanai-unknown-unknown           | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if loongarch-registered-target   %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu   | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if loongarch-registered-target   %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu   | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if loongarch-registered-target   %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if loongarch-registered-target   %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu   | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if loongarch-registered-target   %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu   | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if loongarch-registered-target   %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if m68k-registered-target        %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu          | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mips-unknown-linux-gnu          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64   | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mips-unknown-linux-gnu          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64   | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if mips-registered-target        %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if msp430-registered-target      %{ llc %s -o - -mtriple=msp430-none-elf                 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if nvptx-registered-target       %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda             | FileCheck %s --check-prefixes=NOCRASH %}
 ; RUN: %if powerpc-registered-target     %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if powerpc-registered-target     %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu     | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if powerpc-registered-target     %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu   | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if riscv-registered-target       %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if riscv-registered-target       %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if sparc-registered-target       %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if sparc-registered-target       %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if riscv-registered-target       %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if riscv-registered-target       %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if sparc-registered-target       %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if sparc-registered-target       %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if spirv-registered-target       %{ llc %s -o - -mtriple=spirv-unknown-unknown           | FileCheck %s --check-prefixes=NOCRASH %}
 ; RUN: %if systemz-registered-target     %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
 ; RUN: %if ve-registered-target          %{ llc %s -o - -mtriple=ve-unknown-unknown              | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown          | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=i686-unknown-linux-gnu          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=i686-unknown-linux-gnu          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
 ; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
 ; RUN: %if xcore-registered-target       %{ llc %s -o - -mtriple=xcore-unknown-unknown           | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index e1eb860d26591..311905be2ce25 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -1093,28 +1093,41 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    lui a1, 16
-; RV32I-NEXT:    addi s3, a1, -1
-; RV32I-NEXT:    and a0, a0, s3
+; RV32I-NEXT:    addi s4, a1, -1
+; RV32I-NEXT:    and a0, a0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    and a0, s1, s3
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    and a0, s1, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    call __muldf3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s3
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -1132,17 +1145,22 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    addi s3, a1, -1
 ; RV64I-NEXT:    and a0, a0, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    and a0, s1, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    call __muldf3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    and a0, s0, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1194,35 +1212,48 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a1
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lui a0, 16
-; RV32I-NEXT:    addi s2, a0, -1
-; RV32I-NEXT:    and a0, a2, s2
+; RV32I-NEXT:    addi s3, a0, -1
+; RV32I-NEXT:    and a0, a2, s3
 ; RV32I-NEXT:    call __extendhfsf2
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    call __truncsfhf2
 ; RV32I-NEXT:    lui a1, 8
-; RV32I-NEXT:    xor s3, a0, a1
-; RV32I-NEXT:    and a0, s1, s2
+; RV32I-NEXT:    xor s4, a0, a1
+; RV32I-NEXT:    and a0, s1, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __muldf3
 ; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    and a0, s3, s2
+; RV32I-NEXT:    mv s1, a1
+; RV32I-NEXT:    and a0, s4, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -1247,17 +1278,22 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    xor s3, a0, a1
 ; RV64I-NEXT:    and a0, s1, s2
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    and a0, s0, s2
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __muldf3
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s3, s2
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
-; RV64I-NEXT:    mv a0, s1
-; RV64I-NEXT:    mv a1, s0
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1329,8 +1365,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a1
-; RV32I-NEXT:    lui a1, 16
-; RV32I-NEXT:    addi s3, a1, -1
+; RV32I-NEXT:    lui s3, 16
+; RV32I-NEXT:    addi s3, s3, -1
 ; RV32I-NEXT:    and a0, a0, s3
 ; RV32I-NEXT:    call __extendhfsf2
 ; RV32I-NEXT:    li a1, 0
@@ -1347,17 +1383,26 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    xor s4, a0, a1
 ; RV32I-NEXT:    and a0, s1, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    and a0, s2, s3
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, s0
+; RV32I-NEXT:    mv a3, s1
+; RV32I-NEXT:    call __muldf3
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    and a0, s4, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a1, s0
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1378,8 +1423,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a2
 ; RV64I-NEXT:    mv s1, a1
-; RV64I-NEXT:    lui a1, 16
-; RV64I-NEXT:    addi s3, a1, -1
+; RV64I-NEXT:    lui s3, 16
+; RV64I-NEXT:    addi s3, s3, -1
 ; RV64I-NEXT:    and a0, a0, s3
 ; RV64I-NEXT:    call __extendhfsf2
 ; RV64I-NEXT:    li a1, 0
@@ -1396,17 +1441,21 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    xor s4, a0, a1
 ; RV64I-NEXT:    and a0, s1, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s2, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, s0
+; RV64I-NEXT:    call __muldf3
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s4, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
-; RV64I-NEXT:    mv a0, s1
-; RV64I-NEXT:    mv a1, s0
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1491,8 +1540,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    lui a0, 16
-; RV32I-NEXT:    addi s3, a0, -1
+; RV32I-NEXT:    lui s3, 16
+; RV32I-NEXT:    addi s3, s3, -1
 ; RV32I-NEXT:    and a0, a1, s3
 ; RV32I-NEXT:    call __extendhfsf2
 ; RV32I-NEXT:    li a1, 0
@@ -1509,17 +1558,28 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    xor s4, a0, a1
 ; RV32I-NEXT:    and a0, s1, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    and a0, s2, s3
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s0
+; RV32I-NEXT:    mv a1, s1
+; RV32I-NEXT:    call __muldf3
+; RV32I-NEXT:    mv s0, a0
+; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    and a0, s4, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:    mv a0, s0
 ; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
@@ -1540,8 +1600,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    sd s4, 0(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    mv s0, a2
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    lui a0, 16
-; RV64I-NEXT:    addi s3, a0, -1
+; RV64I-NEXT:    lui s3, 16
+; RV64I-NEXT:    addi s3, s3, -1
 ; RV64I-NEXT:    and a0, a1, s3
 ; RV64I-NEXT:    call __extendhfsf2
 ; RV64I-NEXT:    li a1, 0
@@ -1558,17 +1618,22 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    xor s4, a0, a1
 ; RV64I-NEXT:    and a0, s1, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s2, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s0
+; RV64I-NEXT:    call __muldf3
+; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s4, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:    mv a0, s0
-; RV64I-NEXT:    mv a1, s1
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -1659,23 +1724,35 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    lui a1, 16
-; RV32I-NEXT:    addi s3, a1, -1
-; RV32I-NEXT:    and a0, a0, s3
+; RV32I-NEXT:    addi s4, a1, -1
+; RV32I-NEXT:    and a0, a0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    and a0, s1, s3
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    and a0, s1, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    call __muldf3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s3
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lui a1, 1048568
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -1683,6 +1760,7 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -1700,17 +1778,22 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    addi s3, a1, -1
 ; RV64I-NEXT:    and a0, a0, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    and a0, s1, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    call __muldf3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    and a0, s0, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    lui a1, 1048568
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -1779,23 +1862,35 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    lui a1, 16
-; RV32I-NEXT:    addi s3, a1, -1
-; RV32I-NEXT:    and a0, a0, s3
+; RV32I-NEXT:    addi s4, a1, -1
+; RV32I-NEXT:    and a0, a0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    and a0, s1, s3
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    and a0, s1, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    call __muldf3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s3
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lui a1, 1048568
 ; RV32I-NEXT:    xor a0, a0, a1
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
@@ -1803,6 +1898,7 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -1820,17 +1916,22 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    addi s3, a1, -1
 ; RV64I-NEXT:    and a0, a0, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    and a0, s1, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    call __muldf3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    and a0, s0, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    lui a1, 1048568
 ; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
@@ -1892,34 +1993,46 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    lui a1, 16
-; RV32I-NEXT:    addi s2, a1, -1
-; RV32I-NEXT:    and a0, a0, s2
+; RV32I-NEXT:    addi s3, a1, -1
+; RV32I-NEXT:    and a0, a0, s3
 ; RV32I-NEXT:    call __extendhfsf2
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    call __truncsfhf2
 ; RV32I-NEXT:    lui a1, 8
-; RV32I-NEXT:    xor s3, a0, a1
-; RV32I-NEXT:    and a0, s1, s2
+; RV32I-NEXT:    xor s4, a0, a1
+; RV32I-NEXT:    and a0, s1, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s4, s3
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    and a0, s3, s2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, s1
+; RV32I-NEXT:    mv a3, s2
+; RV32I-NEXT:    call __muldf3
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s3
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    mv a2, s0
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -1944,16 +2057,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    xor s3, a0, a1
 ; RV64I-NEXT:    and a0, s1, s2
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    and a0, s0, s2
-; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s3, s2
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv a1, s1
-; RV64I-NEXT:    mv a2, s0
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __muldf3
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    and a0, s0, s2
+; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
@@ -2020,35 +2138,48 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a0
 ; RV32I-NEXT:    lui a0, 16
-; RV32I-NEXT:    addi s2, a0, -1
-; RV32I-NEXT:    and a0, a1, s2
+; RV32I-NEXT:    addi s3, a0, -1
+; RV32I-NEXT:    and a0, a1, s3
 ; RV32I-NEXT:    call __extendhfsf2
 ; RV32I-NEXT:    li a1, 0
 ; RV32I-NEXT:    call __addsf3
 ; RV32I-NEXT:    call __truncsfhf2
 ; RV32I-NEXT:    lui a1, 8
-; RV32I-NEXT:    xor s3, a0, a1
-; RV32I-NEXT:    and a0, s1, s2
+; RV32I-NEXT:    xor s4, a0, a1
+; RV32I-NEXT:    and a0, s1, s3
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s2
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s4, s3
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv s0, a0
-; RV32I-NEXT:    and a0, s3, s2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __muldf3
+; RV32I-NEXT:    mv s1, a0
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s3
 ; RV32I-NEXT:    call __extendhfsf2
-; RV32I-NEXT:    mv a1, a0
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
 ; RV32I-NEXT:    mv a0, s1
-; RV32I-NEXT:    mv a2, s0
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -2073,17 +2204,22 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    xor s3, a0, a1
 ; RV64I-NEXT:    and a0, s1, s2
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s1, a0
-; RV64I-NEXT:    and a0, s0, s2
-; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv s0, a0
 ; RV64I-NEXT:    and a0, s3, s2
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv a1, a0
 ; RV64I-NEXT:    mv a0, s1
-; RV64I-NEXT:    mv a2, s0
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __muldf3
+; RV64I-NEXT:    mv s1, a0
+; RV64I-NEXT:    and a0, s0, s2
+; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index e712bd919b0b1..5ae127c1d00a3 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -1690,28 +1690,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT:    sw s1, 20(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s4, 8(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    mv s0, a2
 ; RV32I-NEXT:    mv s1, a1
 ; RV32I-NEXT:    lui a1, 16
-; RV32I-NEXT:    addi s3, a1, -1
-; RV32I-NEXT:    and a0, a0, s3
+; RV32I-NEXT:    addi s4, a1, -1
+; RV32I-NEXT:    and a0, a0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv s2, a0
-; RV32I-NEXT:    and a0, s1, s3
+; RV32I-NEXT:    mv s3, a1
+; RV32I-NEXT:    and a0, s1, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
+; RV32I-NEXT:    mv a2, a0
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s2
+; RV32I-NEXT:    mv a1, s3
+; RV32I-NEXT:    call __muldf3
 ; RV32I-NEXT:    mv s1, a0
-; RV32I-NEXT:    and a0, s0, s3
+; RV32I-NEXT:    mv s2, a1
+; RV32I-NEXT:    and a0, s0, s4
 ; RV32I-NEXT:    call __extendhfsf2
+; RV32I-NEXT:    call __extendsfdf2
 ; RV32I-NEXT:    mv a2, a0
-; RV32I-NEXT:    mv a0, s2
-; RV32I-NEXT:    mv a1, s1
-; RV32I-NEXT:    call fmaf
-; RV32I-NEXT:    call __truncsfhf2
+; RV32I-NEXT:    mv a3, a1
+; RV32I-NEXT:    mv a0, s1
+; RV32I-NEXT:    mv a1, s2
+; RV32I-NEXT:    call __adddf3
+; RV32I-NEXT:    call __truncdfhf2
 ; RV32I-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s0, 24(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s1, 20(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s2, 16(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s4, 8(sp) # 4-byte Folded Reload
 ; RV32I-NEXT:    addi sp, sp, 32
 ; RV32I-NEXT:    ret
 ;
@@ -1729,17 +1742,22 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT:    addi s3, a1, -1
 ; RV64I-NEXT:    and a0, a0, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
 ; RV64I-NEXT:    mv s2, a0
 ; RV64I-NEXT:    and a0, s1, s3
 ; RV64I-NEXT:    call __extendhfsf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s2
+; RV64I-NEXT:    call __muldf3
 ; RV64I-NEXT:    mv s1, a0
 ; RV64I-NEXT:    and a0, s0, s3
 ; RV64I-NEXT:    call __extendhfsf2
-; RV64I-NEXT:    mv a2, a0
-; RV64I-NEXT:    mv a0, s2
-; RV64I-NEXT:    mv a1, s1
-; RV64I-NEXT:    call fmaf
-; RV64I-NEXT:    call __truncsfhf2
+; RV64I-NEXT:    call __extendsfdf2
+; RV64I-NEXT:    mv a1, a0
+; RV64I-NEXT:    mv a0, s1
+; RV64I-NEXT:    call __adddf3
+; RV64I-NEXT:    call __truncdfhf2
 ; RV64I-NEXT:    ld ra, 40(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s0, 32(sp) # 8-byte Folded Reload
 ; RV64I-NEXT:    ld s1, 24(sp) # 8-byte Folded Reload
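
The second patch below makes the same f64 promotion the default in TargetLoweringBase::initActions() via AddPromotedToType. For comparison, a backend that does provide native f16 fma keeps the node legal and is unaffected by that default; the fragment below is a hypothetical illustration only (MyTargetLowering is not a real target and this is not something the patch adds):

// Hypothetical backend sketch: a target with native fp16 fma marks the node
// Legal in its TargetLowering constructor, so the f16 -> f64 promotion
// default set up in initActions() does not apply to it.
MyTargetLowering::MyTargetLowering(const TargetMachine &TM)
    : TargetLowering(TM) {
  setOperationAction(ISD::FMA, MVT::f16, Legal);
  setOperationAction(ISD::STRICT_FMA, MVT::f16, Legal);
}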

From 438f8baea81dd58a913d83451e1d7012f51dc9f9 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Mon, 15 Dec 2025 00:44:51 +0100
Subject: [PATCH 2/2] add `AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64);`

---
 .../SelectionDAG/LegalizeFloatTypes.cpp       |   3 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |   5 +
 llvm/test/CodeGen/AArch64/f16-instructions.ll |  10 +-
 llvm/test/CodeGen/AArch64/fmla.ll             | 370 +++++++++---------
 .../CodeGen/AArch64/fp-intrinsics-fp16.ll     |  12 +-
 ...ve-streaming-mode-fixed-length-fp-arith.ll | 320 +++++++--------
 llvm/test/CodeGen/Generic/half-op.ll          |   6 +-
 llvm/test/CodeGen/NVPTX/f16-instructions.ll   |  27 +-
 llvm/test/CodeGen/NVPTX/f16x2-instructions.ll |  54 +--
 llvm/test/CodeGen/SystemZ/fp-mul-06.ll        |  12 +-
 llvm/test/CodeGen/SystemZ/fp-mul-08.ll        |  12 +-
 llvm/test/CodeGen/SystemZ/fp-mul-10.ll        |  20 +-
 llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll |  14 +-
 .../test/CodeGen/X86/fp-strict-scalar-fp16.ll |  56 +--
 llvm/test/CodeGen/X86/fp16-libcalls.ll        |  36 +-
 15 files changed, 494 insertions(+), 463 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2406094ef0378..72814a6890075 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -3518,7 +3518,8 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
 
   SDValue Res;
   if (OVT == MVT::f16) {
-    // An f16 fma must go via f64 to prevent double rounding issues.
+    // If f16 fma is not natively supported, the value must be promoted to an
+    // f64 (and not to f32!) to prevent double rounding issues.
     SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags);
     SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags);
     SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags);
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 1d674b283db15..d62bf8a4ad74d 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -842,6 +842,11 @@ void TargetLoweringBase::initActions() {
     }
   }
 
+  // If f16 fma is not natively supported, the value must be promoted to an f64
+  // (and not to f32!) to prevent double rounding issues.
+  AddPromotedToType(ISD::FMA, MVT::f16, MVT::f64);
+  AddPromotedToType(ISD::STRICT_FMA, MVT::f16, MVT::f64);
+
   // Set default actions for various operations.
   for (MVT VT : MVT::all_valuetypes()) {
     // Default all indexed load / store to expand.
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 085170c7ba381..f6d701b518699 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1378,11 +1378,11 @@ define half @test_log2(half %a) #0 {
 define half @test_fma(half %a, half %b, half %c) #0 {
 ; CHECK-CVT-SD-LABEL: test_fma:
 ; CHECK-CVT-SD:       // %bb.0:
-; CHECK-CVT-SD-NEXT:    fcvt s2, h2
-; CHECK-CVT-SD-NEXT:    fcvt s1, h1
-; CHECK-CVT-SD-NEXT:    fcvt s0, h0
-; CHECK-CVT-SD-NEXT:    fmadd s0, s0, s1, s2
-; CHECK-CVT-SD-NEXT:    fcvt h0, s0
+; CHECK-CVT-SD-NEXT:    fcvt d2, h2
+; CHECK-CVT-SD-NEXT:    fcvt d1, h1
+; CHECK-CVT-SD-NEXT:    fcvt d0, h0
+; CHECK-CVT-SD-NEXT:    fmadd d0, d0, d1, d2
+; CHECK-CVT-SD-NEXT:    fcvt h0, d0
 ; CHECK-CVT-SD-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: test_fma:
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index 12b6562b5cf0c..16c835b59bd3f 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -27,11 +27,11 @@ entry:
 define half @fma_f16(half %a, half %b, half %c) {
 ; CHECK-SD-NOFP16-LABEL: fma_f16:
 ; CHECK-SD-NOFP16:       // %bb.0: // %entry
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fmadd s0, s0, s1, s2
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT:    fmadd d0, d0, d1, d2
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, d0
 ; CHECK-SD-NOFP16-NEXT:    ret
 ;
 ; CHECK-SD-FP16-LABEL: fma_f16:
@@ -178,69 +178,69 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v2.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt d6, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h0
 ; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fmadd s6, s16, s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT:    fmadd d6, d16, d7, d6
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h17
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d17, h19
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fmadd s4, s5, s4, s3
+; CHECK-SD-NOFP16-NEXT:    fmadd d4, d5, d4, d3
 ; CHECK-SD-NOFP16-NEXT:    mov h5, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fmadd s6, s17, s16, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, d6
+; CHECK-SD-NOFP16-NEXT:    fmadd d6, d17, d16, d7
 ; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h19
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h5
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, d6
+; CHECK-SD-NOFP16-NEXT:    fcvt d17, h17
+; CHECK-SD-NOFP16-NEXT:    fcvt d18, h18
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmadd s5, s16, s7, s5
+; CHECK-SD-NOFP16-NEXT:    fmadd d5, d16, d7, d5
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d19, h19
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fmadd s17, s19, s18, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, d5
+; CHECK-SD-NOFP16-NEXT:    fmadd d17, d19, d18, d17
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fmadd s4, s16, s7, s4
+; CHECK-SD-NOFP16-NEXT:    fmadd d4, d16, d7, d4
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fmadd s5, s7, s6, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h6
+; CHECK-SD-NOFP16-NEXT:    fcvt d6, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, d17
+; CHECK-SD-NOFP16-NEXT:    fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT:    fmadd d5, d7, d6, d5
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-SD-NOFP16-NEXT:    fmadd d0, d0, d1, d2
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d5
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, d0
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v3.16b
@@ -301,34 +301,34 @@ define <4 x half> @fma_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v2.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt d6, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h0
 ; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fmadd s6, s16, s7, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt d3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT:    fmadd d6, d16, d7, d6
 ; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fmadd s3, s5, s4, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s6
-; CHECK-SD-NOFP16-NEXT:    fmadd s4, s7, s5, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s3
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT:    fmadd d3, d5, d4, d3
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h17
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, d6
+; CHECK-SD-NOFP16-NEXT:    fmadd d4, d7, d5, d4
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, d3
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h16
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[1], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s4
-; CHECK-SD-NOFP16-NEXT:    fmadd s1, s5, s1, s2
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, d4
+; CHECK-SD-NOFP16-NEXT:    fmadd d1, d5, d1, d2
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[2], v3.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, d1
 ; CHECK-SD-NOFP16-NEXT:    mov v0.h[3], v1.h[0]
 ; CHECK-SD-NOFP16-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NOFP16-NEXT:    ret
@@ -364,69 +364,69 @@ define <8 x half> @fma_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v2.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v1.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h5, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt d6, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h0
 ; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fmadd s6, s16, s7, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d3, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT:    fmadd d6, d16, d7, d6
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h17
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d17, h19
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fmadd s4, s5, s4, s3
+; CHECK-SD-NOFP16-NEXT:    fmadd d4, d5, d4, d3
 ; CHECK-SD-NOFP16-NEXT:    mov h5, v2.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt h3, s6
-; CHECK-SD-NOFP16-NEXT:    fmadd s6, s17, s16, s7
+; CHECK-SD-NOFP16-NEXT:    fcvt h3, d6
+; CHECK-SD-NOFP16-NEXT:    fmadd d6, d17, d16, d7
 ; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h19
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h5
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s6
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, d6
+; CHECK-SD-NOFP16-NEXT:    fcvt d17, h17
+; CHECK-SD-NOFP16-NEXT:    fcvt d18, h18
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[1], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmadd s5, s16, s7, s5
+; CHECK-SD-NOFP16-NEXT:    fmadd d5, d16, d7, d5
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v1.h[5]
 ; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt d19, h19
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[2], v6.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h6, v2.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h5, s5
-; CHECK-SD-NOFP16-NEXT:    fmadd s17, s19, s18, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt h5, d5
+; CHECK-SD-NOFP16-NEXT:    fmadd d17, d19, d18, d17
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h19, v0.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fmadd s4, s16, s7, s4
+; CHECK-SD-NOFP16-NEXT:    fmadd d4, d16, d7, d4
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[3], v5.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h19
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s4
-; CHECK-SD-NOFP16-NEXT:    fmadd s5, s7, s6, s5
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h6
+; CHECK-SD-NOFP16-NEXT:    fcvt d6, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h19
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, d17
+; CHECK-SD-NOFP16-NEXT:    fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d4
+; CHECK-SD-NOFP16-NEXT:    fmadd d5, d7, d6, d5
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmadd s0, s0, s1, s2
+; CHECK-SD-NOFP16-NEXT:    fmadd d0, d0, d1, d2
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[5], v4.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s5
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d5
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, d0
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[6], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v3.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v0.16b, v3.16b
@@ -468,136 +468,136 @@ define <16 x half> @fma_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c) {
 ; CHECK-SD-NOFP16-NEXT:    mov h6, v4.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v2.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h16, v0.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h4
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h0
+; CHECK-SD-NOFP16-NEXT:    fcvt d17, h4
+; CHECK-SD-NOFP16-NEXT:    fcvt d18, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d19, h0
 ; CHECK-SD-NOFP16-NEXT:    mov h20, v4.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h21, v2.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h22, v0.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h23, v4.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h24, v2.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h25, v0.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s6, h6
-; CHECK-SD-NOFP16-NEXT:    fcvt s7, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fmadd s17, s19, s18, s17
+; CHECK-SD-NOFP16-NEXT:    fcvt d6, h6
+; CHECK-SD-NOFP16-NEXT:    fcvt d7, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT:    fmadd d17, d19, d18, d17
 ; CHECK-SD-NOFP16-NEXT:    mov h26, v1.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h23
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h25
-; CHECK-SD-NOFP16-NEXT:    fmadd s7, s16, s7, s6
+; CHECK-SD-NOFP16-NEXT:    fcvt d27, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt d18, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt d19, h21
+; CHECK-SD-NOFP16-NEXT:    fcvt d20, h22
+; CHECK-SD-NOFP16-NEXT:    fcvt d21, h23
+; CHECK-SD-NOFP16-NEXT:    fcvt d22, h24
+; CHECK-SD-NOFP16-NEXT:    fcvt d23, h25
+; CHECK-SD-NOFP16-NEXT:    fmadd d7, d16, d7, d6
 ; CHECK-SD-NOFP16-NEXT:    mov h24, v5.h[1]
 ; CHECK-SD-NOFP16-NEXT:    mov h25, v3.h[1]
-; CHECK-SD-NOFP16-NEXT:    fcvt h6, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h3
-; CHECK-SD-NOFP16-NEXT:    fcvt s29, h1
-; CHECK-SD-NOFP16-NEXT:    fmadd s19, s20, s19, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h26
+; CHECK-SD-NOFP16-NEXT:    fcvt h6, d17
+; CHECK-SD-NOFP16-NEXT:    fcvt d28, h3
+; CHECK-SD-NOFP16-NEXT:    fcvt d29, h1
+; CHECK-SD-NOFP16-NEXT:    fmadd d19, d20, d19, d18
+; CHECK-SD-NOFP16-NEXT:    fcvt d26, h26
 ; CHECK-SD-NOFP16-NEXT:    mov h16, v4.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmadd s21, s23, s22, s21
+; CHECK-SD-NOFP16-NEXT:    fmadd d21, d23, d22, d21
 ; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h23, v1.h[2]
-; CHECK-SD-NOFP16-NEXT:    fcvt h20, s7
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h25
+; CHECK-SD-NOFP16-NEXT:    fcvt h20, d7
+; CHECK-SD-NOFP16-NEXT:    fcvt d24, h24
+; CHECK-SD-NOFP16-NEXT:    fcvt d25, h25
 ; CHECK-SD-NOFP16-NEXT:    mov h17, v2.h[4]
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v0.h[4]
 ; CHECK-SD-NOFP16-NEXT:    mov h7, v4.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt h19, d19
 ; CHECK-SD-NOFP16-NEXT:    mov h30, v2.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s16, h16
-; CHECK-SD-NOFP16-NEXT:    fcvt h21, s21
+; CHECK-SD-NOFP16-NEXT:    fcvt d16, h16
+; CHECK-SD-NOFP16-NEXT:    fcvt h21, d21
 ; CHECK-SD-NOFP16-NEXT:    mov h31, v1.h[4]
-; CHECK-SD-NOFP16-NEXT:    fmadd s24, s26, s25, s24
-; CHECK-SD-NOFP16-NEXT:    fmadd s25, s29, s28, s27
+; CHECK-SD-NOFP16-NEXT:    fmadd d24, d26, d25, d24
+; CHECK-SD-NOFP16-NEXT:    fmadd d25, d29, d28, d27
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[1], v20.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h20, v5.h[2]
 ; CHECK-SD-NOFP16-NEXT:    mov h26, v5.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h27, v3.h[3]
 ; CHECK-SD-NOFP16-NEXT:    mov h28, v1.h[3]
-; CHECK-SD-NOFP16-NEXT:    fcvt s17, h17
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
-; CHECK-SD-NOFP16-NEXT:    fcvt s29, h7
-; CHECK-SD-NOFP16-NEXT:    fcvt s30, h30
+; CHECK-SD-NOFP16-NEXT:    fcvt d17, h17
+; CHECK-SD-NOFP16-NEXT:    fcvt d18, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt d29, h7
+; CHECK-SD-NOFP16-NEXT:    fcvt d30, h30
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[2], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h24, s24
-; CHECK-SD-NOFP16-NEXT:    fcvt h7, s25
-; CHECK-SD-NOFP16-NEXT:    fcvt s19, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h23
-; CHECK-SD-NOFP16-NEXT:    fmadd s16, s18, s17, s16
+; CHECK-SD-NOFP16-NEXT:    fcvt h24, d24
+; CHECK-SD-NOFP16-NEXT:    fcvt h7, d25
+; CHECK-SD-NOFP16-NEXT:    fcvt d19, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt d20, h22
+; CHECK-SD-NOFP16-NEXT:    fcvt d22, h23
+; CHECK-SD-NOFP16-NEXT:    fmadd d16, d18, d17, d16
 ; CHECK-SD-NOFP16-NEXT:    mov h23, v0.h[5]
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h27
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h28
+; CHECK-SD-NOFP16-NEXT:    fcvt d25, h26
+; CHECK-SD-NOFP16-NEXT:    fcvt d26, h27
+; CHECK-SD-NOFP16-NEXT:    fcvt d27, h28
 ; CHECK-SD-NOFP16-NEXT:    mov h18, v4.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[3], v21.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[1], v24.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov h24, v5.h[5]
-; CHECK-SD-NOFP16-NEXT:    fmadd s19, s22, s20, s19
+; CHECK-SD-NOFP16-NEXT:    fmadd d19, d22, d20, d19
 ; CHECK-SD-NOFP16-NEXT:    mov h20, v5.h[4]
 ; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[4]
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h23
+; CHECK-SD-NOFP16-NEXT:    fcvt d23, h23
 ; CHECK-SD-NOFP16-NEXT:    mov h28, v0.h[6]
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s16
-; CHECK-SD-NOFP16-NEXT:    fcvt s18, h18
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, d16
+; CHECK-SD-NOFP16-NEXT:    fcvt d18, h18
 ; CHECK-SD-NOFP16-NEXT:    mov h4, v4.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov h0, v0.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s20, h20
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h22
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h31
-; CHECK-SD-NOFP16-NEXT:    fmadd s17, s23, s30, s29
-; CHECK-SD-NOFP16-NEXT:    fmadd s23, s27, s26, s25
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s19
+; CHECK-SD-NOFP16-NEXT:    fcvt d20, h20
+; CHECK-SD-NOFP16-NEXT:    fcvt d21, h22
+; CHECK-SD-NOFP16-NEXT:    fcvt d22, h31
+; CHECK-SD-NOFP16-NEXT:    fmadd d17, d23, d30, d29
+; CHECK-SD-NOFP16-NEXT:    fmadd d23, d27, d26, d25
+; CHECK-SD-NOFP16-NEXT:    fcvt h19, d19
 ; CHECK-SD-NOFP16-NEXT:    mov h25, v3.h[5]
 ; CHECK-SD-NOFP16-NEXT:    mov h26, v1.h[5]
 ; CHECK-SD-NOFP16-NEXT:    mov h27, v2.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h29, v1.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h2, v2.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov h1, v1.h[7]
-; CHECK-SD-NOFP16-NEXT:    fmadd s20, s22, s21, s20
+; CHECK-SD-NOFP16-NEXT:    fmadd d20, d22, d21, d20
 ; CHECK-SD-NOFP16-NEXT:    mov h21, v5.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov h22, v3.h[6]
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[2], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s23
-; CHECK-SD-NOFP16-NEXT:    fcvt s23, h24
-; CHECK-SD-NOFP16-NEXT:    fcvt s24, h25
-; CHECK-SD-NOFP16-NEXT:    fcvt s25, h26
-; CHECK-SD-NOFP16-NEXT:    fcvt s26, h27
-; CHECK-SD-NOFP16-NEXT:    fcvt s27, h28
-; CHECK-SD-NOFP16-NEXT:    fcvt s28, h29
+; CHECK-SD-NOFP16-NEXT:    fcvt h19, d23
+; CHECK-SD-NOFP16-NEXT:    fcvt d23, h24
+; CHECK-SD-NOFP16-NEXT:    fcvt d24, h25
+; CHECK-SD-NOFP16-NEXT:    fcvt d25, h26
+; CHECK-SD-NOFP16-NEXT:    fcvt d26, h27
+; CHECK-SD-NOFP16-NEXT:    fcvt d27, h28
+; CHECK-SD-NOFP16-NEXT:    fcvt d28, h29
 ; CHECK-SD-NOFP16-NEXT:    mov h5, v5.h[7]
-; CHECK-SD-NOFP16-NEXT:    fcvt s21, h21
-; CHECK-SD-NOFP16-NEXT:    fcvt s22, h22
+; CHECK-SD-NOFP16-NEXT:    fcvt d21, h21
+; CHECK-SD-NOFP16-NEXT:    fcvt d22, h22
 ; CHECK-SD-NOFP16-NEXT:    mov h3, v3.h[7]
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[3], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h19, s20
+; CHECK-SD-NOFP16-NEXT:    fcvt h19, d20
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[4], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmadd s20, s25, s24, s23
-; CHECK-SD-NOFP16-NEXT:    fcvt h16, s17
-; CHECK-SD-NOFP16-NEXT:    fcvt s4, h4
-; CHECK-SD-NOFP16-NEXT:    fmadd s18, s27, s26, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-SD-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-SD-NOFP16-NEXT:    fmadd s21, s28, s22, s21
-; CHECK-SD-NOFP16-NEXT:    fcvt s5, h5
-; CHECK-SD-NOFP16-NEXT:    fcvt s3, h3
+; CHECK-SD-NOFP16-NEXT:    fmadd d20, d25, d24, d23
+; CHECK-SD-NOFP16-NEXT:    fcvt h16, d17
+; CHECK-SD-NOFP16-NEXT:    fcvt d4, h4
+; CHECK-SD-NOFP16-NEXT:    fmadd d18, d27, d26, d18
+; CHECK-SD-NOFP16-NEXT:    fcvt d2, h2
+; CHECK-SD-NOFP16-NEXT:    fcvt d0, h0
+; CHECK-SD-NOFP16-NEXT:    fmadd d21, d28, d22, d21
+; CHECK-SD-NOFP16-NEXT:    fcvt d5, h5
+; CHECK-SD-NOFP16-NEXT:    fcvt d3, h3
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[4], v19.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-SD-NOFP16-NEXT:    fcvt h17, s20
+; CHECK-SD-NOFP16-NEXT:    fcvt d1, h1
+; CHECK-SD-NOFP16-NEXT:    fcvt h17, d20
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[5], v16.h[0]
-; CHECK-SD-NOFP16-NEXT:    fmadd s0, s0, s2, s4
-; CHECK-SD-NOFP16-NEXT:    fcvt h2, s18
-; CHECK-SD-NOFP16-NEXT:    fcvt h4, s21
-; CHECK-SD-NOFP16-NEXT:    fmadd s1, s1, s3, s5
+; CHECK-SD-NOFP16-NEXT:    fmadd d0, d0, d2, d4
+; CHECK-SD-NOFP16-NEXT:    fcvt h2, d18
+; CHECK-SD-NOFP16-NEXT:    fcvt h4, d21
+; CHECK-SD-NOFP16-NEXT:    fmadd d1, d1, d3, d5
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[5], v17.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[6], v2.h[0]
-; CHECK-SD-NOFP16-NEXT:    fcvt h0, s0
-; CHECK-SD-NOFP16-NEXT:    fcvt h1, s1
+; CHECK-SD-NOFP16-NEXT:    fcvt h0, d0
+; CHECK-SD-NOFP16-NEXT:    fcvt h1, d1
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[6], v4.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v6.h[7], v0.h[0]
 ; CHECK-SD-NOFP16-NEXT:    mov v7.h[7], v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
index 86029a7169abb..368fa0a0cfae9 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
@@ -170,11 +170,11 @@ define half @frem_f16(half %x, half %y) #0 {
 define half @fma_f16(half %x, half %y, half %z) #0 {
 ; CHECK-NOFP16-LABEL: fma_f16:
 ; CHECK-NOFP16:       // %bb.0:
-; CHECK-NOFP16-NEXT:    fcvt s2, h2
-; CHECK-NOFP16-NEXT:    fcvt s1, h1
-; CHECK-NOFP16-NEXT:    fcvt s0, h0
-; CHECK-NOFP16-NEXT:    fmadd s0, s0, s1, s2
-; CHECK-NOFP16-NEXT:    fcvt h0, s0
+; CHECK-NOFP16-NEXT:    fcvt d2, h2
+; CHECK-NOFP16-NEXT:    fcvt d1, h1
+; CHECK-NOFP16-NEXT:    fcvt d0, h0
+; CHECK-NOFP16-NEXT:    fmadd d0, d0, d1, d2
+; CHECK-NOFP16-NEXT:    fcvt h0, d0
 ; CHECK-NOFP16-NEXT:    ret
 ;
 ; CHECK-FP16-LABEL: fma_f16:
@@ -1382,3 +1382,5 @@ declare i1 @llvm.experimental.constrained.fcmp.f16(half, half, metadata, metadat
 
 declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index 20c06f0a1aff5..2f708cbda1f2b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1043,38 +1043,38 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3)
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
@@ -1103,38 +1103,38 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #22]
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #14]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #12]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #30]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #20]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #10]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #28]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #18]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #8]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #26]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #16]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    ldr d0, [sp, #24]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #32
@@ -1163,74 +1163,74 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #62]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #60]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #58]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #56]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #54]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #52]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #50]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    ldr q0, [sp, #48]
 ; NONEON-NOSVE-NEXT:    add sp, sp, #64
@@ -1264,146 +1264,146 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) {
 ; NONEON-NOSVE-NEXT:    stp q1, q5, [sp, #32]
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #78]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #62]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #76]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #60]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #126]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #92]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #74]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #58]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #124]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #90]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #72]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #56]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #122]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #88]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #70]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #54]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #120]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #86]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #68]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #52]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #118]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #84]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #66]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #50]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #116]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #82]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #64]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #48]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #114]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #80]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #30]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #14]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #112]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #46]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #28]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #12]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #110]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #44]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #26]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #10]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #108]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #42]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #24]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #8]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #106]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #40]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #22]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #6]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #104]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #38]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #20]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #4]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #102]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #36]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #18]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp, #2]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #100]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #34]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
 ; NONEON-NOSVE-NEXT:    ldr h1, [sp, #16]
 ; NONEON-NOSVE-NEXT:    ldr h2, [sp]
-; NONEON-NOSVE-NEXT:    fcvt s1, h1
-; NONEON-NOSVE-NEXT:    fcvt s2, h2
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d1, h1
+; NONEON-NOSVE-NEXT:    fcvt d2, h2
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #98]
 ; NONEON-NOSVE-NEXT:    ldr h0, [sp, #32]
-; NONEON-NOSVE-NEXT:    fcvt s0, h0
-; NONEON-NOSVE-NEXT:    fmadd s0, s2, s1, s0
-; NONEON-NOSVE-NEXT:    fcvt h0, s0
+; NONEON-NOSVE-NEXT:    fcvt d0, h0
+; NONEON-NOSVE-NEXT:    fmadd d0, d2, d1, d0
+; NONEON-NOSVE-NEXT:    fcvt h0, d0
 ; NONEON-NOSVE-NEXT:    str h0, [sp, #96]
 ; NONEON-NOSVE-NEXT:    ldp q0, q1, [sp, #96]
 ; NONEON-NOSVE-NEXT:    stp q0, q1, [x0]
diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll
index 30509efcba8bb..f8ad39f9456aa 100644
--- a/llvm/test/CodeGen/Generic/half-op.ll
+++ b/llvm/test/CodeGen/Generic/half-op.ll
@@ -35,12 +35,12 @@
 ; RUN: %if sparc-registered-target       %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if sparc-registered-target       %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu       | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if spirv-registered-target       %{ llc %s -o - -mtriple=spirv-unknown-unknown           | FileCheck %s --check-prefixes=NOCRASH %}
-; RUN: %if systemz-registered-target     %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if systemz-registered-target     %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu         | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if ve-registered-target          %{ llc %s -o - -mtriple=ve-unknown-unknown              | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown          | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=i686-unknown-linux-gnu          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
-; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
-; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc          | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
+; RUN: %if x86-registered-target         %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu        | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
 ; RUN: %if xcore-registered-target       %{ llc %s -o - -mtriple=xcore-unknown-unknown           | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
 ; RUN: %if xtensa-registered-target      %{ llc %s -o - -mtriple=xtensa-none-elf                 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %}
 
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 4e2f7ea9e5208..53288b35d55a4 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
 ; ## Full FP16 support enabled by default.
 ; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
 ; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
@@ -953,11 +954,11 @@ define half @test_cos(half %a) #0 {
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_fma_param_2];
 ; CHECK-F16-NOFTZ:      fma.rn.f16      [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
 ; CHECK-F16-FTZ:      fma.rn.ftz.f16      [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%r[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
+; CHECK-NOF16-DAG:  cvt.f64.f16    [[A64:%rd[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f64.f16    [[B64:%rd[0-9]+]], [[B]]
+; CHECK-NOF16-DAG:  cvt.f64.f16    [[C64:%rd[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f64     [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]]
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret
 define half @test_fma(half %a, half %b, half %c) #0 {
@@ -1151,11 +1152,11 @@ define half @test_round(half %a) #0 {
 ; CHECK-DAG:  ld.param.b16    [[C:%rs[0-9]+]], [test_fmuladd_param_2];
 ; CHECK-F16-NOFTZ:        fma.rn.f16     [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
 ; CHECK-F16-FTZ:        fma.rn.ftz.f16     [[R:%rs[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[A32:%r[0-9]+]], [[A]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[B32:%r[0-9]+]], [[B]]
-; CHECK-NOF16-DAG:  cvt.f32.f16    [[C32:%r[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32     [[R32:%r[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[R32]]
+; CHECK-NOF16-DAG:  cvt.f64.f16    [[A64:%rd[0-9]+]], [[A]]
+; CHECK-NOF16-DAG:  cvt.f64.f16    [[B64:%rd[0-9]+]], [[B]]
+; CHECK-NOF16-DAG:  cvt.f64.f16    [[C64:%rd[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f64     [[R64:%rd[0-9]+]], [[A64]], [[B64]], [[C64]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[R64]]
 ; CHECK:      st.param.b16    [func_retval0], [[R]];
 ; CHECK:      ret;
 define half @test_fmuladd(half %a, half %b, half %c) #0 {
@@ -1183,3 +1184,9 @@ define <2 x half> @test_neg_f16x2(<2 x half> noundef %arg) #0 {
 }
 
 attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; CHECK-F16-FTZ: {{.*}}
+; CHECK-F16-NOFTZ: {{.*}}
+; CHECK-NOF16: {{.*}}
+; CHECK-NOFTZ: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index e9143d540b047..3ebaf68d4a15f 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1766,27 +1766,28 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 ; CHECK-NOF16-LABEL: test_fma(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fma_param_2];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fma_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fma_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd1, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd2, %rs4;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs6;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %r7, %r6, %r5, %r4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %r7;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs5;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %r11, %r10, %r9, %r8;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %r11;
-; CHECK-NOF16-NEXT:    mov.b32 %r12, {%rs8, %rs7};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r12;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd3, %rs6;
+; CHECK-NOF16-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, %rd1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f64 %rs7, %rd4;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd6, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd7, %rs5;
+; CHECK-NOF16-NEXT:    fma.rn.f64 %rd8, %rd7, %rd6, %rd5;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f64 %rs8, %rd8;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   ret <2 x half> %r
@@ -2203,27 +2204,28 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
 ; CHECK-NOF16-LABEL: test_fmuladd(
 ; CHECK-NOF16:       {
 ; CHECK-NOF16-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
+; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
+; CHECK-NOF16-NEXT:    .reg .b64 %rd<9>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_2];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
 ; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_0];
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd1, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd2, %rs4;
 ; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs6;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %r7, %r6, %r5, %r4;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %r7;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs5;
-; CHECK-NOF16-NEXT:    fma.rn.f32 %r11, %r10, %r9, %r8;
-; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %r11;
-; CHECK-NOF16-NEXT:    mov.b32 %r12, {%rs8, %rs7};
-; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r12;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd3, %rs6;
+; CHECK-NOF16-NEXT:    fma.rn.f64 %rd4, %rd3, %rd2, %rd1;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f64 %rs7, %rd4;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd6, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f64.f16 %rd7, %rs5;
+; CHECK-NOF16-NEXT:    fma.rn.f64 %rd8, %rd7, %rd6, %rd5;
+; CHECK-NOF16-NEXT:    cvt.rn.f16.f64 %rs8, %rd8;
+; CHECK-NOF16-NEXT:    mov.b32 %r4, {%rs8, %rs7};
+; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r4;
 ; CHECK-NOF16-NEXT:    ret;
   %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
   ret <2 x half> %r
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
index 6b285a49057dc..3b0c63749d378 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-06.ll
@@ -8,12 +8,12 @@ declare float @llvm.fma.f32(float %f1, float %f2, float %f3)
 
 define half @f0(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f0:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f9, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f9, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK: br %r14
   %res = call half @llvm.fma.f16 (half %f1, half %f2, half %acc)
   ret half %res
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
index e739bddd4f18f..542cae41d4745 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-08.ll
@@ -10,12 +10,12 @@ define half @f0(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f0:
 ; CHECK-NOT: brasl
 ; CHECK: lcdfr %f{{[0-9]+}}, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f0, %f8, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f0, %f8, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK: br %r14
   %negacc = fneg half %acc
   %res = call half @llvm.fma.f16 (half %f1, half %f2, half %negacc)
diff --git a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
index 8f2cd23112cd0..0badf2993cca7 100644
--- a/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-mul-10.ll
@@ -25,11 +25,11 @@ define double @f2(double %f1, double %f2, double %acc) {
 
 define half @f3_half(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f3_half:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK-NOT: brasl
 ; CHECK:      lcdfr %f0, %f0
 ; CHECK-NEXT: lmg
@@ -52,11 +52,11 @@ define half @f4_half(half %f1, half %f2, half %acc) {
 ; CHECK-LABEL: f4_half:
 ; CHECK-NOT: brasl
 ; CHECK: lcdfr %f0, %f4
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK-NOT: brasl
 ; CHECK:      lcdfr %f0, %f0
 ; CHECK-NEXT: lmg
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll
index c951c79aeb7c6..05ce53c98db13 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-mul-06.ll
@@ -8,13 +8,13 @@ declare float @llvm.experimental.constrained.fma.f32(float, float, float, metada
 
 define half @f0(half %f1, half %f2, half %acc) #0 {
 ; CHECK-LABEL: f0:
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK: brasl %r14, __extendhfsf2@PLT
-; CHECK-SCALAR: maebr %f10, %f0, %f8
-; CHECK-SCALAR: ler %f0, %f10
-; CHECK-VECTOR: wfmasb %f0, %f0, %f8, %f10
-; CHECK: brasl %r14, __truncsfhf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK: brasl %r14, __extendhfdf2@PLT
+; CHECK-SCALAR: madbr %f10, %f0, %f8
+; CHECK-SCALAR: ldr %f0, %f10
+; CHECK-VECTOR: wfmadb %f0, %f0, %f8, %f10
+; CHECK: brasl %r14, __truncdfhf2@PLT
 ; CHECK: br %r14
   %res = call half @llvm.experimental.constrained.fma.f16 (
                         half %f1, half %f2, half %acc,
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
index b013ddad19a95..61a0c4eda8c72 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll
@@ -432,8 +432,7 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    subq $24, %rsp
 ; SSE2-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
 ; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
@@ -443,12 +442,17 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    callq __extendhfsf2@PLT
-; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; SSE2-NEXT:    # xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
-; SSE2-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    callq fmaf@PLT
-; SSE2-NEXT:    callq __truncsfhf2@PLT
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm2
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm1
+; SSE2-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; SSE2-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
+; SSE2-NEXT:    callq fma@PLT
+; SSE2-NEXT:    callq __truncdfhf2@PLT
 ; SSE2-NEXT:    addq $24, %rsp
 ; SSE2-NEXT:    retq
 ;
@@ -460,38 +464,42 @@ define half @fma_f16(half %a, half %b, half %c) nounwind strictfp {
 ; F16C-NEXT:    vpextrw $0, %xmm2, %edx
 ; F16C-NEXT:    movzwl %dx, %edx
 ; F16C-NEXT:    vmovd %edx, %xmm0
-; F16C-NEXT:    vcvtph2ps %xmm0, %xmm2
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm2
 ; F16C-NEXT:    movzwl %cx, %ecx
 ; F16C-NEXT:    vmovd %ecx, %xmm0
-; F16C-NEXT:    vcvtph2ps %xmm0, %xmm1
+; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm1
 ; F16C-NEXT:    movzwl %ax, %eax
 ; F16C-NEXT:    vmovd %eax, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT:    callq fmaf@PLT
-; F16C-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; F16C-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; F16C-NEXT:    callq fma@PLT
+; F16C-NEXT:    callq __truncdfhf2@PLT
 ; F16C-NEXT:    popq %rax
 ; F16C-NEXT:    retq
 ;
 ; AVX512-LABEL: fma_f16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpextrw $0, %xmm1, %eax
-; AVX512-NEXT:    vpextrw $0, %xmm0, %ecx
+; AVX512-NEXT:    pushq %rax
+; AVX512-NEXT:    vpextrw $0, %xmm0, %eax
+; AVX512-NEXT:    vpextrw $0, %xmm1, %ecx
 ; AVX512-NEXT:    vpextrw $0, %xmm2, %edx
 ; AVX512-NEXT:    movzwl %dx, %edx
 ; AVX512-NEXT:    vmovd %edx, %xmm0
 ; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm1
 ; AVX512-NEXT:    movzwl %cx, %ecx
-; AVX512-NEXT:    vmovd %ecx, %xmm1
-; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT:    vmovd %ecx, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm2
 ; AVX512-NEXT:    movzwl %ax, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm2
-; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %eax, %xmm0
+; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
+; AVX512-NEXT:    callq __truncdfhf2@PLT
+; AVX512-NEXT:    popq %rax
 ; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: fma_f16:
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 9b5c45f44acd0..6abdf9a5ba652 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -421,10 +421,13 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ; F16C-NEXT:    pushq %rbx
 ; F16C-NEXT:    movq %rdi, %rbx
 ; F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; F16C-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; F16C-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; F16C-NEXT:    vcvtph2ps %xmm2, %xmm2
-; F16C-NEXT:    callq fmaf@PLT
-; F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; F16C-NEXT:    vcvtss2sd %xmm2, %xmm2, %xmm2
+; F16C-NEXT:    callq fma@PLT
+; F16C-NEXT:    callq __truncdfhf2@PLT
 ; F16C-NEXT:    vpextrw $0, %xmm0, (%rbx)
 ; F16C-NEXT:    popq %rbx
 ; F16C-NEXT:    retq
@@ -440,24 +443,27 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    subq $16, %rsp
 ; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; X64-NEXT:    movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    movaps %xmm2, %xmm0
 ; X64-NEXT:    callq __extendhfsf2@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    callq __extendhfsf2@PLT
 ; X64-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; X64-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; X64-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    callq __extendhfsf2@PLT
+; X64-NEXT:    cvtss2sd %xmm0, %xmm0
 ; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
 ; X64-NEXT:    # xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    cvtss2sd %xmm1, %xmm1
 ; X64-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload
 ; X64-NEXT:    # xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:    callq fmaf@PLT
-; X64-NEXT:    callq __truncsfhf2@PLT
+; X64-NEXT:    cvtss2sd %xmm2, %xmm2
+; X64-NEXT:    callq fma@PLT
+; X64-NEXT:    callq __truncdfhf2@PLT
 ; X64-NEXT:    pextrw $0, %xmm0, %eax
 ; X64-NEXT:    movw %ax, (%rbx)
 ; X64-NEXT:    addq $16, %rsp
@@ -467,7 +473,7 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ; X86-LABEL: test_half_fma:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $72, %esp
+; X86-NEXT:    subl $88, %esp
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; X86-NEXT:    pinsrw $0, {{[0-9]+}}(%esp), %xmm0
@@ -487,17 +493,17 @@ define void @test_half_fma(half %a0, half %a1, half %a2, ptr %p0) nounwind {
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esp)
 ; X86-NEXT:    calll __extendhfsf2
-; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-NEXT:    fstps {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpl {{[0-9]+}}(%esp)
 ; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
-; X86-NEXT:    fstps (%esp)
-; X86-NEXT:    calll fmaf
-; X86-NEXT:    fstps (%esp)
-; X86-NEXT:    calll __truncsfhf2
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll fma
+; X86-NEXT:    fstpl (%esp)
+; X86-NEXT:    calll __truncdfhf2
 ; X86-NEXT:    pextrw $0, %xmm0, %eax
 ; X86-NEXT:    movw %ax, (%esi)
-; X86-NEXT:    addl $72, %esp
+; X86-NEXT:    addl $88, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
   %res = call half @llvm.fma.half(half %a0, half %a1, half %a2)
