[llvm] fix `llvm.fma.f16` double rounding issue when there is no native support (PR #171904)
Folkert de Vries via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 06:58:20 PST 2025
https://github.com/folkertdev updated https://github.com/llvm/llvm-project/pull/171904
>From 88b91597075324da2065edfcc80f747581b6849e Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Thu, 11 Dec 2025 20:45:57 +0100
Subject: [PATCH 1/5] promote f16 fma to f64 if there is no instruction support
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 26 +++++-
llvm/test/CodeGen/ARM/fp16-promote.ll | 80 ++++++++++++-------
2 files changed, 73 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index bf1abfe50327e..51bc335550a16 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -3495,10 +3495,30 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
Op1 = DAG.getNode(PromotionOpcode, dl, NVT, Op1);
Op2 = DAG.getNode(PromotionOpcode, dl, NVT, Op2);
- SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+ SDValue Res;
+ if (NVT == MVT::f32) {
+ // An f16 fma must go via f64 to prevent double rounding issues.
+ SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0);
+ SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1);
+ SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2);
+
+ // Prefer a wide FMA node if available; otherwise expand to mul+add.
+ SDValue WideRes;
+ if (TLI.isOperationLegalOrCustom(ISD::FMA, MVT::f64)) {
+ WideRes =
+ DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, N->getFlags());
+ } else {
+ SDValue Mul =
+ DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, N->getFlags());
+ WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, N->getFlags());
+ }
- // Convert back to FP16 as an integer.
- return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+ return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16,
+ WideRes);
+ } else {
+ Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, N->getFlags());
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
+ }
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ExpOp(SDNode *N) {
diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll
index 8230e47259dd8..27a0bf2eb9037 100644
--- a/llvm/test/CodeGen/ARM/fp16-promote.ll
+++ b/llvm/test/CodeGen/ARM/fp16-promote.ll
@@ -1508,61 +1508,81 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 {
; CHECK-FP16-NEXT: push {r4, lr}
; CHECK-FP16-NEXT: mov r4, r0
; CHECK-FP16-NEXT: ldrh r0, [r1]
-; CHECK-FP16-NEXT: ldrh r1, [r4]
-; CHECK-FP16-NEXT: ldrh r2, [r2]
-; CHECK-FP16-NEXT: vmov s2, r0
+; CHECK-FP16-NEXT: ldrh r1, [r2]
+; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: ldrh r0, [r4]
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d16, s0
+; CHECK-FP16-NEXT: vmov s0, r0
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d17, s0
; CHECK-FP16-NEXT: vmov s0, r1
-; CHECK-FP16-NEXT: vcvtb.f32.f16 s1, s2
-; CHECK-FP16-NEXT: vmov s2, r2
; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
-; CHECK-FP16-NEXT: vcvtb.f32.f16 s2, s2
-; CHECK-FP16-NEXT: bl fmaf
-; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
-; CHECK-FP16-NEXT: vmov r0, s0
+; CHECK-FP16-NEXT: vcvt.f64.f32 d18, s0
+; CHECK-FP16-NEXT: vmla.f64 d18, d17, d16
+; CHECK-FP16-NEXT: vmov r0, r1, d18
+; CHECK-FP16-NEXT: bl __aeabi_d2h
; CHECK-FP16-NEXT: strh r0, [r4]
; CHECK-FP16-NEXT: pop {r4, pc}
;
; CHECK-LIBCALL-VFP-LABEL: test_fma:
; CHECK-LIBCALL-VFP: .save {r4, r5, r6, lr}
; CHECK-LIBCALL-VFP-NEXT: push {r4, r5, r6, lr}
+; CHECK-LIBCALL-VFP-NEXT: .vsave {d8, d9}
+; CHECK-LIBCALL-VFP-NEXT: vpush {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: mov r4, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r2]
-; CHECK-LIBCALL-VFP-NEXT: mov r5, r1
+; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r0]
+; CHECK-LIBCALL-VFP-NEXT: mov r5, r2
+; CHECK-LIBCALL-VFP-NEXT: mov r6, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT: mov r6, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r5]
+; CHECK-LIBCALL-VFP-NEXT: ldrh r1, [r6]
+; CHECK-LIBCALL-VFP-NEXT: vmov s16, r0
+; CHECK-LIBCALL-VFP-NEXT: ldrh r5, [r5]
+; CHECK-LIBCALL-VFP-NEXT: mov r0, r1
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
-; CHECK-LIBCALL-VFP-NEXT: mov r5, r0
-; CHECK-LIBCALL-VFP-NEXT: ldrh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT: vmov s18, r0
+; CHECK-LIBCALL-VFP-NEXT: mov r0, r5
; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_h2f
; CHECK-LIBCALL-VFP-NEXT: vmov s0, r0
-; CHECK-LIBCALL-VFP-NEXT: vmov s1, r5
-; CHECK-LIBCALL-VFP-NEXT: vmov s2, r6
-; CHECK-LIBCALL-VFP-NEXT: bl fmaf
-; CHECK-LIBCALL-VFP-NEXT: vmov r0, s0
-; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_f2h
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d16, s18
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d17, s16
+; CHECK-LIBCALL-VFP-NEXT: vcvt.f64.f32 d18, s0
+; CHECK-LIBCALL-VFP-NEXT: vmla.f64 d18, d17, d16
+; CHECK-LIBCALL-VFP-NEXT: vmov r0, r1, d18
+; CHECK-LIBCALL-VFP-NEXT: bl __aeabi_d2h
; CHECK-LIBCALL-VFP-NEXT: strh r0, [r4]
+; CHECK-LIBCALL-VFP-NEXT: vpop {d8, d9}
; CHECK-LIBCALL-VFP-NEXT: pop {r4, r5, r6, pc}
;
; CHECK-NOVFP-LABEL: test_fma:
-; CHECK-NOVFP: .save {r4, r5, r6, lr}
-; CHECK-NOVFP-NEXT: push {r4, r5, r6, lr}
+; CHECK-NOVFP: .save {r4, r5, r6, r7, r11, lr}
+; CHECK-NOVFP-NEXT: push {r4, r5, r6, r7, r11, lr}
; CHECK-NOVFP-NEXT: mov r4, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r1]
; CHECK-NOVFP-NEXT: mov r5, r2
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
; CHECK-NOVFP-NEXT: mov r6, r0
-; CHECK-NOVFP-NEXT: ldrh r0, [r5]
-; CHECK-NOVFP-NEXT: bl __aeabi_h2f
-; CHECK-NOVFP-NEXT: mov r5, r0
; CHECK-NOVFP-NEXT: ldrh r0, [r4]
+; CHECK-NOVFP-NEXT: mov r7, r1
; CHECK-NOVFP-NEXT: bl __aeabi_h2f
-; CHECK-NOVFP-NEXT: mov r1, r6
-; CHECK-NOVFP-NEXT: mov r2, r5
-; CHECK-NOVFP-NEXT: bl fmaf
-; CHECK-NOVFP-NEXT: bl __aeabi_f2h
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
+; CHECK-NOVFP-NEXT: mov r2, r6
+; CHECK-NOVFP-NEXT: mov r3, r7
+; CHECK-NOVFP-NEXT: bl __aeabi_dmul
+; CHECK-NOVFP-NEXT: mov r6, r0
+; CHECK-NOVFP-NEXT: ldrh r0, [r5]
+; CHECK-NOVFP-NEXT: mov r7, r1
+; CHECK-NOVFP-NEXT: bl __aeabi_h2f
+; CHECK-NOVFP-NEXT: bl __aeabi_f2d
+; CHECK-NOVFP-NEXT: mov r2, r0
+; CHECK-NOVFP-NEXT: mov r3, r1
+; CHECK-NOVFP-NEXT: mov r0, r6
+; CHECK-NOVFP-NEXT: mov r1, r7
+; CHECK-NOVFP-NEXT: bl __aeabi_dadd
+; CHECK-NOVFP-NEXT: bl __aeabi_d2h
; CHECK-NOVFP-NEXT: strh r0, [r4]
-; CHECK-NOVFP-NEXT: pop {r4, r5, r6, pc}
+; CHECK-NOVFP-NEXT: pop {r4, r5, r6, r7, r11, pc}
%a = load half, ptr %p, align 2
%b = load half, ptr %q, align 2
%c = load half, ptr %r, align 2
>From ac049af9f877dbaa5275f6e3c7847e0bd47b6714 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 12 Dec 2025 14:15:00 +0100
Subject: [PATCH 2/5] respect flags in the fp_extend
---
.../SelectionDAG/LegalizeFloatTypes.cpp | 23 +++++++++----------
1 file changed, 11 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 51bc335550a16..3d9222ee88104 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -3487,6 +3487,7 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDNodeFlags Flags = N->getFlags();
SDLoc dl(N);
// Promote to the larger FP type.
@@ -3498,27 +3499,25 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
SDValue Res;
if (NVT == MVT::f32) {
// An f16 fma must go via f64 to prevent double rounding issues.
- SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0);
- SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1);
- SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2);
+ SDValue A64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op0, Flags);
+ SDValue B64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op1, Flags);
+ SDValue C64 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Op2, Flags);
// Prefer a wide FMA node if available; otherwise expand to mul+add.
SDValue WideRes;
- if (TLI.isOperationLegalOrCustom(ISD::FMA, MVT::f64)) {
- WideRes =
- DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, N->getFlags());
+ if (TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), MVT::f64)) {
+ WideRes = DAG.getNode(ISD::FMA, dl, MVT::f64, A64, B64, C64, Flags);
} else {
- SDValue Mul =
- DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, N->getFlags());
- WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, N->getFlags());
+ SDValue Mul = DAG.getNode(ISD::FMUL, dl, MVT::f64, A64, B64, Flags);
+ WideRes = DAG.getNode(ISD::FADD, dl, MVT::f64, Mul, C64, Flags);
}
return DAG.getNode(GetPromotionOpcode(MVT::f64, OVT), dl, MVT::i16,
WideRes);
- } else {
- Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, N->getFlags());
- return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
+
+ Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2, Flags);
+ return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res);
}
SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ExpOp(SDNode *N) {
>From 72c5b15e06b7e5e0d571425382fbb4369ac23105 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 12 Dec 2025 15:27:22 +0100
Subject: [PATCH 3/5] update riscv tests
---
llvm/test/CodeGen/RISCV/half-arith.ll | 366 ++++++++++++++-------
llvm/test/CodeGen/RISCV/half-intrinsics.ll | 44 ++-
2 files changed, 282 insertions(+), 128 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll
index d089e3678756c..a0a51f4f994dc 100644
--- a/llvm/test/CodeGen/RISCV/half-arith.ll
+++ b/llvm/test/CodeGen/RISCV/half-arith.ll
@@ -1105,28 +1105,41 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1144,17 +1157,22 @@ define half @fmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1206,35 +1224,48 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a1
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s2, a0, -1
-; RV32I-NEXT: and a0, a2, s2
+; RV32I-NEXT: addi s3, a0, -1
+; RV32I-NEXT: and a0, a2, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
-; RV32I-NEXT: xor s3, a0, a1
-; RV32I-NEXT: and a0, s1, s2
+; RV32I-NEXT: xor s4, a0, a1
+; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s3, s2
+; RV32I-NEXT: mv s1, a1
+; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1259,17 +1290,22 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s1
-; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s0
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1341,8 +1377,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
-; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
+; RV32I-NEXT: lui s3, 16
+; RV32I-NEXT: addi s3, s3, -1
; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
@@ -1359,17 +1395,26 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s2, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, s0
+; RV32I-NEXT: mv a3, s1
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a1, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1390,8 +1435,8 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a2
; RV64I-NEXT: mv s1, a1
-; RV64I-NEXT: lui a1, 16
-; RV64I-NEXT: addi s3, a1, -1
+; RV64I-NEXT: lui s3, 16
+; RV64I-NEXT: addi s3, s3, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: li a1, 0
@@ -1408,17 +1453,21 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s4, a0, a1
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s2, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s4, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s1
-; RV64I-NEXT: mv a1, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s0
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1503,8 +1552,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
+; RV32I-NEXT: lui s3, 16
+; RV32I-NEXT: addi s3, s3, -1
; RV32I-NEXT: and a0, a1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
@@ -1521,17 +1570,28 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: xor s4, a0, a1
; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s2, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: mv a1, s1
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s0
; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
@@ -1552,8 +1612,8 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
; RV64I-NEXT: mv s0, a2
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: lui a0, 16
-; RV64I-NEXT: addi s3, a0, -1
+; RV64I-NEXT: lui s3, 16
+; RV64I-NEXT: addi s3, s3, -1
; RV64I-NEXT: and a0, a1, s3
; RV64I-NEXT: call __extendhfsf2
; RV64I-NEXT: li a1, 0
@@ -1570,17 +1630,22 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s4, a0, a1
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s2, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s0
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s4, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s0
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -1671,23 +1736,35 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lui a1, 1048568
; RV32I-NEXT: xor a0, a0, a1
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@@ -1695,6 +1772,7 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1712,17 +1790,22 @@ define half @fnmadd_h_3(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: lui a1, 1048568
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -1792,23 +1875,35 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lui a1, 1048568
; RV32I-NEXT: xor a0, a0, a1
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@@ -1816,6 +1911,7 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1833,17 +1929,22 @@ define half @fnmadd_nsz(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: lui a1, 1048568
; RV64I-NEXT: xor a0, a0, a1
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -1905,34 +2006,46 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s2, a1, -1
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: addi s3, a1, -1
+; RV32I-NEXT: and a0, a0, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
-; RV32I-NEXT: xor s3, a0, a1
-; RV32I-NEXT: and a0, s1, s2
+; RV32I-NEXT: xor s4, a0, a1
+; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s3, s2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, s1
+; RV32I-NEXT: mv a3, s2
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: mv a2, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1957,16 +2070,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s2
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: mv a2, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: and a0, s0, s2
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
@@ -2033,35 +2151,48 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a0
; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s2, a0, -1
-; RV32I-NEXT: and a0, a1, s2
+; RV32I-NEXT: addi s3, a0, -1
+; RV32I-NEXT: and a0, a1, s3
; RV32I-NEXT: call __extendhfsf2
; RV32I-NEXT: li a1, 0
; RV32I-NEXT: call __addsf3
; RV32I-NEXT: call __truncsfhf2
; RV32I-NEXT: lui a1, 8
-; RV32I-NEXT: xor s3, a0, a1
-; RV32I-NEXT: and a0, s1, s2
+; RV32I-NEXT: xor s4, a0, a1
+; RV32I-NEXT: and a0, s1, s3
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s2
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s4, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s3, s2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __muldf3
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s3
; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv a1, a0
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
; RV32I-NEXT: mv a0, s1
-; RV32I-NEXT: mv a2, s0
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -2086,17 +2217,22 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: xor s3, a0, a1
; RV64I-NEXT: and a0, s1, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s2
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s0, a0
; RV64I-NEXT: and a0, s3, s2
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv a1, a0
; RV64I-NEXT: mv a0, s1
-; RV64I-NEXT: mv a2, s0
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __muldf3
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: and a0, s0, s2
+; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
index 847054d96968a..10bcc444b5d77 100644
--- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll
@@ -1712,28 +1712,41 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
; RV32I-NEXT: mv s0, a2
; RV32I-NEXT: mv s1, a1
; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s4, a1, -1
+; RV32I-NEXT: and a0, a0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s3, a1
+; RV32I-NEXT: and a0, s1, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
+; RV32I-NEXT: mv a2, a0
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a1, s3
+; RV32I-NEXT: call __muldf3
; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: mv s2, a1
+; RV32I-NEXT: and a0, s0, s4
; RV32I-NEXT: call __extendhfsf2
+; RV32I-NEXT: call __extendsfdf2
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
-; RV32I-NEXT: mv a1, s1
-; RV32I-NEXT: call fmaf
-; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: mv a3, a1
+; RV32I-NEXT: mv a0, s1
+; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: call __adddf3
+; RV32I-NEXT: call __truncdfhf2
; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload
; RV32I-NEXT: addi sp, sp, 32
; RV32I-NEXT: ret
;
@@ -1751,17 +1764,22 @@ define half @fma_f16(half %a, half %b, half %c) nounwind {
; RV64I-NEXT: addi s3, a1, -1
; RV64I-NEXT: and a0, a0, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
; RV64I-NEXT: mv s2, a0
; RV64I-NEXT: and a0, s1, s3
; RV64I-NEXT: call __extendhfsf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: call __muldf3
; RV64I-NEXT: mv s1, a0
; RV64I-NEXT: and a0, s0, s3
; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
-; RV64I-NEXT: mv a1, s1
-; RV64I-NEXT: call fmaf
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: call __extendsfdf2
+; RV64I-NEXT: mv a1, a0
+; RV64I-NEXT: mv a0, s1
+; RV64I-NEXT: call __adddf3
+; RV64I-NEXT: call __truncdfhf2
; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s0, 32(sp) # 8-byte Folded Reload
; RV64I-NEXT: ld s1, 24(sp) # 8-byte Folded Reload
>From 5da8ca877e5db2490ae59db5abae24a8834eb231 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 12 Dec 2025 15:28:59 +0100
Subject: [PATCH 4/5] update thumb2 tests
---
llvm/test/CodeGen/Thumb2/bf16-instructions.ll | 111 ++++++++++++++----
1 file changed, 85 insertions(+), 26 deletions(-)
diff --git a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
index 313d237d54b35..3c1cb9152d0c2 100644
--- a/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
+++ b/llvm/test/CodeGen/Thumb2/bf16-instructions.ll
@@ -1985,33 +1985,92 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) {
;
; CHECK-NOFP-LABEL: test_fma:
; CHECK-NOFP: @ %bb.0:
-; CHECK-NOFP-NEXT: .save {r7, lr}
-; CHECK-NOFP-NEXT: push {r7, lr}
-; CHECK-NOFP-NEXT: lsls r0, r0, #16
-; CHECK-NOFP-NEXT: lsls r1, r1, #16
-; CHECK-NOFP-NEXT: lsls r2, r2, #16
-; CHECK-NOFP-NEXT: bl fmaf
-; CHECK-NOFP-NEXT: bl __truncsfbf2
-; CHECK-NOFP-NEXT: pop {r7, pc}
+; CHECK-NOFP-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NOFP-NEXT: push {r4, r5, r6, r7, lr}
+; CHECK-NOFP-NEXT: .pad #4
+; CHECK-NOFP-NEXT: sub sp, #4
+; CHECK-NOFP-NEXT: mov r5, r0
+; CHECK-NOFP-NEXT: lsls r0, r1, #16
+; CHECK-NOFP-NEXT: mov r4, r2
+; CHECK-NOFP-NEXT: bl __aeabi_f2d
+; CHECK-NOFP-NEXT: mov r6, r0
+; CHECK-NOFP-NEXT: lsls r0, r5, #16
+; CHECK-NOFP-NEXT: mov r7, r1
+; CHECK-NOFP-NEXT: bl __aeabi_f2d
+; CHECK-NOFP-NEXT: mov r2, r6
+; CHECK-NOFP-NEXT: mov r3, r7
+; CHECK-NOFP-NEXT: bl __aeabi_dmul
+; CHECK-NOFP-NEXT: mov r5, r0
+; CHECK-NOFP-NEXT: lsls r0, r4, #16
+; CHECK-NOFP-NEXT: mov r6, r1
+; CHECK-NOFP-NEXT: bl __aeabi_f2d
+; CHECK-NOFP-NEXT: mov r2, r0
+; CHECK-NOFP-NEXT: mov r3, r1
+; CHECK-NOFP-NEXT: mov r0, r5
+; CHECK-NOFP-NEXT: mov r1, r6
+; CHECK-NOFP-NEXT: bl __aeabi_dadd
+; CHECK-NOFP-NEXT: bl __truncdfbf2
+; CHECK-NOFP-NEXT: add sp, #4
+; CHECK-NOFP-NEXT: pop {r4, r5, r6, r7, pc}
;
-; CHECK-FP-LABEL: test_fma:
-; CHECK-FP: @ %bb.0:
-; CHECK-FP-NEXT: .save {r7, lr}
-; CHECK-FP-NEXT: push {r7, lr}
-; CHECK-FP-NEXT: vmov r0, s0
-; CHECK-FP-NEXT: vmov r1, s1
-; CHECK-FP-NEXT: vmov r2, s2
-; CHECK-FP-NEXT: lsls r0, r0, #16
-; CHECK-FP-NEXT: lsls r1, r1, #16
-; CHECK-FP-NEXT: vmov s4, r0
-; CHECK-FP-NEXT: lsls r0, r2, #16
-; CHECK-FP-NEXT: vmov s2, r1
-; CHECK-FP-NEXT: vmov s0, r0
-; CHECK-FP-NEXT: vfma.f32 s0, s4, s2
-; CHECK-FP-NEXT: bl __truncsfbf2
-; CHECK-FP-NEXT: vmov.f16 r0, s0
-; CHECK-FP-NEXT: vmov s0, r0
-; CHECK-FP-NEXT: pop {r7, pc}
+; CHECK-FPNO64-LABEL: test_fma:
+; CHECK-FPNO64: @ %bb.0:
+; CHECK-FPNO64-NEXT: .save {r4, r5, r6, lr}
+; CHECK-FPNO64-NEXT: push {r4, r5, r6, lr}
+; CHECK-FPNO64-NEXT: .vsave {d8}
+; CHECK-FPNO64-NEXT: vpush {d8}
+; CHECK-FPNO64-NEXT: vmov r0, s0
+; CHECK-FPNO64-NEXT: vmov.f32 s16, s2
+; CHECK-FPNO64-NEXT: vmov r6, s1
+; CHECK-FPNO64-NEXT: lsls r0, r0, #16
+; CHECK-FPNO64-NEXT: bl __aeabi_f2d
+; CHECK-FPNO64-NEXT: mov r4, r0
+; CHECK-FPNO64-NEXT: lsls r0, r6, #16
+; CHECK-FPNO64-NEXT: mov r5, r1
+; CHECK-FPNO64-NEXT: bl __aeabi_f2d
+; CHECK-FPNO64-NEXT: mov r2, r0
+; CHECK-FPNO64-NEXT: mov r3, r1
+; CHECK-FPNO64-NEXT: mov r0, r4
+; CHECK-FPNO64-NEXT: mov r1, r5
+; CHECK-FPNO64-NEXT: bl __aeabi_dmul
+; CHECK-FPNO64-NEXT: mov r4, r0
+; CHECK-FPNO64-NEXT: vmov r0, s16
+; CHECK-FPNO64-NEXT: mov r5, r1
+; CHECK-FPNO64-NEXT: lsls r0, r0, #16
+; CHECK-FPNO64-NEXT: bl __aeabi_f2d
+; CHECK-FPNO64-NEXT: mov r2, r0
+; CHECK-FPNO64-NEXT: mov r3, r1
+; CHECK-FPNO64-NEXT: mov r0, r4
+; CHECK-FPNO64-NEXT: mov r1, r5
+; CHECK-FPNO64-NEXT: bl __aeabi_dadd
+; CHECK-FPNO64-NEXT: vmov d0, r0, r1
+; CHECK-FPNO64-NEXT: bl __truncdfbf2
+; CHECK-FPNO64-NEXT: vmov.f16 r0, s0
+; CHECK-FPNO64-NEXT: vmov s0, r0
+; CHECK-FPNO64-NEXT: vpop {d8}
+; CHECK-FPNO64-NEXT: pop {r4, r5, r6, pc}
+;
+; CHECK-FP64-LABEL: test_fma:
+; CHECK-FP64: @ %bb.0:
+; CHECK-FP64-NEXT: .save {r7, lr}
+; CHECK-FP64-NEXT: push {r7, lr}
+; CHECK-FP64-NEXT: vmov r0, s1
+; CHECK-FP64-NEXT: vmov r1, s0
+; CHECK-FP64-NEXT: vmov r2, s2
+; CHECK-FP64-NEXT: lsls r0, r0, #16
+; CHECK-FP64-NEXT: lsls r1, r1, #16
+; CHECK-FP64-NEXT: vmov s2, r0
+; CHECK-FP64-NEXT: vmov s0, r1
+; CHECK-FP64-NEXT: lsls r0, r2, #16
+; CHECK-FP64-NEXT: vmov s4, r0
+; CHECK-FP64-NEXT: vcvt.f64.f32 d3, s0
+; CHECK-FP64-NEXT: vcvt.f64.f32 d1, s2
+; CHECK-FP64-NEXT: vcvt.f64.f32 d0, s4
+; CHECK-FP64-NEXT: vfma.f64 d0, d3, d1
+; CHECK-FP64-NEXT: bl __truncdfbf2
+; CHECK-FP64-NEXT: vmov.f16 r0, s0
+; CHECK-FP64-NEXT: vmov s0, r0
+; CHECK-FP64-NEXT: pop {r7, pc}
%r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c)
ret bfloat %r
}
>From e33c2dd41009e8b34ce0961e452eebdb7d97771e Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Fri, 12 Dec 2025 15:57:41 +0100
Subject: [PATCH 5/5] update amdgpu tests
---
llvm/test/CodeGen/AMDGPU/bf16.ll | 2445 ++++++++++++++++----------
llvm/test/CodeGen/Generic/half-op.ll | 2 +-
2 files changed, 1541 insertions(+), 906 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 711d57baac15f..f491a8bf51cb1 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -50333,22 +50333,30 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GCN-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -50479,12 +50487,20 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_fma_f32 v1, v1, v3, v5
-; GCN-NEXT: v_fma_f32 v0, v0, v2, v4
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GCN-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GCN-NEXT: v_fma_f64 v[2:3], v[10:11], v[8:9], v[6:7]
+; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[2:3]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -50492,21 +50508,29 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX7-LABEL: v_fma_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_fma_f32 v0, v0, v2, v3
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT: v_fma_f64 v[4:5], v[8:9], v[6:7], v[4:5]
+; GFX7-NEXT: v_fma_f64 v[0:1], v[10:11], v[2:3], v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[4:5]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -50680,46 +50704,70 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_fma_f32 v2, v2, v5, v8
-; GCN-NEXT: v_fma_f32 v1, v1, v4, v7
-; GCN-NEXT: v_fma_f32 v0, v0, v3, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v8
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v9
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v10
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v11
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GCN-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GCN-NEXT: v_fma_f64 v[2:3], v[10:11], v[8:9], v[6:7]
+; GCN-NEXT: v_fma_f64 v[4:5], v[16:17], v[14:15], v[12:13]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
+; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v0
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_fma_f32 v2, v2, v5, v8
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX7-NEXT: v_fma_f64 v[0:1], v[8:9], v[5:6], v[2:3]
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_fma_f32 v1, v1, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_fma_f32 v0, v0, v3, v4
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
+; GFX7-NEXT: v_cvt_f32_f64_e32 v14, v[0:1]
+; GFX7-NEXT: v_fma_f64 v[0:1], v[10:11], v[8:9], v[6:7]
+; GFX7-NEXT: v_fma_f64 v[2:3], v[12:13], v[4:5], v[2:3]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -50959,59 +51007,91 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_fma_f32 v3, v3, v7, v11
-; GCN-NEXT: v_fma_f32 v2, v2, v6, v10
-; GCN-NEXT: v_fma_f32 v1, v1, v5, v9
-; GCN-NEXT: v_fma_f32 v0, v0, v4, v8
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v11
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v12
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v10
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v13
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v14
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v15
+; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v16
+; GCN-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v18
+; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v19
+; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v20
+; GCN-NEXT: v_fma_f64 v[6:7], v[10:11], v[8:9], v[6:7]
+; GCN-NEXT: v_fma_f64 v[2:3], v[2:3], v[14:15], v[12:13]
+; GCN-NEXT: v_fma_f64 v[4:5], v[18:19], v[16:17], v[4:5]
+; GCN-NEXT: v_cvt_f32_f64_e32 v8, v[0:1]
+; GCN-NEXT: v_cvt_f32_f64_e32 v6, v[6:7]
+; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[4:5]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v5
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_fma_f32 v3, v3, v7, v11
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v10
+; GFX7-NEXT: v_fma_f64 v[0:1], v[7:8], v[3:4], v[0:1]
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_fma_f32 v2, v2, v6, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_fma_f32 v1, v1, v5, v6
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_fma_f32 v0, v0, v4, v5
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
+; GFX7-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3]
+; GFX7-NEXT: v_fma_f64 v[4:5], v[16:17], v[14:15], v[12:13]
+; GFX7-NEXT: v_fma_f64 v[6:7], v[18:19], v[10:11], v[8:9]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[4:5]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[6:7]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -51305,52 +51385,84 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat>
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_fma_f32 v7, v7, v15, v23
-; GCN-NEXT: v_fma_f32 v6, v6, v14, v22
-; GCN-NEXT: v_fma_f32 v5, v5, v13, v21
-; GCN-NEXT: v_fma_f32 v4, v4, v12, v20
-; GCN-NEXT: v_fma_f32 v3, v3, v11, v19
-; GCN-NEXT: v_fma_f32 v2, v2, v10, v18
-; GCN-NEXT: v_fma_f32 v1, v1, v9, v17
-; GCN-NEXT: v_fma_f32 v0, v0, v8, v16
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v23
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v15
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v22
+; GCN-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v14
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v24
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v21
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13
+; GCN-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[6:7]
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v25
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v20
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
+; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v26
+; GCN-NEXT: v_fma_f64 v[4:5], v[4:5], v[10:11], v[8:9]
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v19
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v27
+; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v28
+; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18
+; GCN-NEXT: v_fma_f64 v[6:7], v[14:15], v[12:13], v[6:7]
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v29
+; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v31
+; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v32
+; GCN-NEXT: v_fma_f64 v[8:9], v[16:17], v[10:11], v[8:9]
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v34
+; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v35
+; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v36
+; GCN-NEXT: v_fma_f64 v[12:13], v[14:15], v[12:13], v[18:19]
+; GCN-NEXT: v_fma_f64 v[10:11], v[10:11], v[22:23], v[20:21]
+; GCN-NEXT: v_fma_f64 v[14:15], v[26:27], v[24:25], v[16:17]
+; GCN-NEXT: v_cvt_f32_f64_e32 v16, v[0:1]
+; GCN-NEXT: v_cvt_f32_f64_e32 v17, v[2:3]
+; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[4:5]
+; GCN-NEXT: v_cvt_f32_f64_e32 v4, v[6:7]
+; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[8:9]
+; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[12:13]
+; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[10:11]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[14:15]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v16
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v8bf16:
@@ -51362,64 +51474,96 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat>
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[27:28], v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_fma_f32 v6, v6, v14, v15
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v6
+; GFX7-NEXT: v_fma_f64 v[23:24], v[27:28], v[25:26], v[23:24]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v22
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_fma_f64 v[6:7], v[25:26], v[14:15], v[6:7]
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_fma_f32 v5, v5, v13, v14
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v13
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v5
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v20
+; GFX7-NEXT: v_fma_f64 v[13:14], v[25:26], v[21:22], v[13:14]
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_fma_f32 v4, v4, v12, v13
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v22
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_fma_f32 v3, v3, v11, v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[27:28], v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[29:30], v3
+; GFX7-NEXT: v_fma_f64 v[3:4], v[25:26], v[20:21], v[4:5]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_fma_f32 v2, v2, v10, v11
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v17
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v2
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v16
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_fma_f32 v1, v1, v9, v11
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_fma_f32 v0, v0, v8, v9
+; GFX7-NEXT: v_fma_f64 v[11:12], v[29:30], v[27:28], v[11:12]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[27:28], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[29:30], v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_fma_f64 v[18:19], v[25:26], v[20:21], v[18:19]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2
+; GFX7-NEXT: v_fma_f64 v[15:16], v[29:30], v[27:28], v[15:16]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v10, v[23:24]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v6, v[6:7]
+; GFX7-NEXT: v_fma_f64 v[0:1], v[20:21], v[8:9], v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[13:14]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[3:4]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[15:16]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[18:19]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v7, v[11:12]
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v8bf16:
@@ -51924,18 +52068,24 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_fma_f32 v15, v15, v31, v32
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v31
+; GCN-NEXT: v_cvt_f64_f32_e32 v[31:32], v15
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[31:32], v[31:32], v[35:36], v[33:34]
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_fma_f32 v14, v14, v30, v31
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v14
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
+; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[14:15], v[33:34], v[14:15], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
@@ -51943,17 +52093,23 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_fma_f32 v13, v13, v29, v30
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v13
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[29:30], v[33:34], v[29:30], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_fma_f32 v12, v12, v28, v29
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v12
+; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v28
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[12:13], v[33:34], v[12:13], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
@@ -51961,17 +52117,23 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_fma_f32 v11, v11, v27, v28
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GCN-NEXT: v_cvt_f64_f32_e32 v[27:28], v27
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v11
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[27:28], v[33:34], v[27:28], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_fma_f32 v10, v10, v26, v27
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v10
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
+; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v26
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[10:11], v[33:34], v[10:11], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
@@ -51979,17 +52141,23 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_fma_f32 v9, v9, v25, v26
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v26
+; GCN-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v9
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[25:26], v[33:34], v[25:26], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_fma_f32 v8, v8, v24, v25
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
+; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v24
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[8:9], v[33:34], v[8:9], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
@@ -51997,17 +52165,23 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_fma_f32 v7, v7, v23, v24
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v24
+; GCN-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[23:24], v[33:34], v[23:24], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_fma_f32 v6, v6, v22, v23
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v6
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v22
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[6:7], v[33:34], v[6:7], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
@@ -52015,17 +52189,23 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_fma_f32 v5, v5, v21, v22
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v22
+; GCN-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v5
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[21:22], v[33:34], v[21:22], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_fma_f32 v4, v4, v20, v21
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v20
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[4:5], v[33:34], v[4:5], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
@@ -52033,51 +52213,79 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_fma_f32 v3, v3, v19, v20
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v20
+; GCN-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[19:20], v[33:34], v[19:20], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_fma_f32 v2, v2, v18, v19
-; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v18
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[2:3], v[33:34], v[2:3], v[35:36]
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_fma_f32 v1, v1, v17, v18
-; GCN-NEXT: v_fma_f32 v0, v0, v16, v19
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v18
+; GCN-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[17:18], v[33:34], v[17:18], v[35:36]
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v16
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GCN-NEXT: v_fma_f64 v[0:1], v[33:34], v[0:1], v[35:36]
+; GCN-NEXT: v_cvt_f32_f64_e32 v16, v[31:32]
+; GCN-NEXT: v_cvt_f32_f64_e32 v14, v[14:15]
+; GCN-NEXT: v_cvt_f32_f64_e32 v15, v[29:30]
+; GCN-NEXT: v_cvt_f32_f64_e32 v12, v[12:13]
+; GCN-NEXT: v_cvt_f32_f64_e32 v13, v[27:28]
+; GCN-NEXT: v_cvt_f32_f64_e32 v10, v[10:11]
+; GCN-NEXT: v_cvt_f32_f64_e32 v11, v[25:26]
+; GCN-NEXT: v_cvt_f32_f64_e32 v8, v[8:9]
+; GCN-NEXT: v_cvt_f32_f64_e32 v9, v[23:24]
+; GCN-NEXT: v_cvt_f32_f64_e32 v6, v[6:7]
+; GCN-NEXT: v_cvt_f32_f64_e32 v7, v[21:22]
+; GCN-NEXT: v_cvt_f32_f64_e32 v4, v[4:5]
+; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[19:20]
+; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
+; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[17:18]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v15
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v16bf16:
@@ -52088,161 +52296,225 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_fma_f32 v15, v15, v31, v32
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v31
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v15
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v14
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_fma_f32 v14, v14, v30, v31
+; GFX7-NEXT: v_fma_f64 v[31:32], v[31:32], v[35:36], v[33:34]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v30
; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_fma_f64 v[14:15], v[33:34], v[14:15], v[35:36]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v13
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v28
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v12
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_fma_f32 v13, v13, v29, v30
-; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_fma_f32 v12, v12, v28, v29
+; GFX7-NEXT: v_cvt_f32_f64_e32 v14, v[14:15]
+; GFX7-NEXT: v_fma_f64 v[29:30], v[33:34], v[29:30], v[35:36]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v28
; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f64_e32 v15, v[29:30]
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_fma_f64 v[12:13], v[33:34], v[12:13], v[35:36]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v11
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v26
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:44
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_fma_f32 v11, v11, v27, v28
-; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v28
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[27:28], v27
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_cvt_f32_f64_e32 v12, v[12:13]
+; GFX7-NEXT: v_fma_f64 v[27:28], v[33:34], v[27:28], v[35:36]
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_cvt_f32_f64_e32 v13, v[27:28]
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_fma_f32 v10, v10, v26, v27
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v26
; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40
+; GFX7-NEXT: v_fma_f64 v[10:11], v[35:36], v[33:34], v[10:11]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v9
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v24
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:36
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v25
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f32_f64_e32 v10, v[10:11]
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_fma_f32 v9, v9, v25, v26
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_fma_f32 v8, v8, v24, v25
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_fma_f64 v[25:26], v[35:36], v[33:34], v[25:26]
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v8
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v24
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
+; GFX7-NEXT: v_cvt_f32_f64_e32 v11, v[25:26]
+; GFX7-NEXT: v_fma_f64 v[8:9], v[35:36], v[33:34], v[8:9]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f32_f64_e32 v8, v[8:9]
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_fma_f32 v7, v7, v23, v24
-; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[23:24], v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_fma_f32 v6, v6, v22, v23
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_fma_f64 v[23:24], v[35:36], v[33:34], v[23:24]
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v22
; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_cvt_f32_f64_e32 v9, v[23:24]
+; GFX7-NEXT: v_fma_f64 v[6:7], v[35:36], v[33:34], v[6:7]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v20
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v21
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f32_f64_e32 v6, v[6:7]
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_fma_f32 v5, v5, v21, v22
-; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v22
+; GFX7-NEXT: v_fma_f64 v[21:22], v[35:36], v[33:34], v[21:22]
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_fma_f32 v4, v4, v20, v21
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
+; GFX7-NEXT: v_cvt_f32_f64_e32 v7, v[21:22]
+; GFX7-NEXT: v_fma_f64 v[4:5], v[35:36], v[33:34], v[4:5]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v18
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v19
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f32_f64_e32 v4, v[4:5]
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_fma_f32 v3, v3, v19, v20
-; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
-; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_fma_f32 v2, v2, v18, v19
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v20
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_fma_f64 v[19:20], v[35:36], v[33:34], v[19:20]
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v18
; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_fma_f64 v[2:3], v[35:36], v[33:34], v[2:3]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v17
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[19:20]
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_fma_f32 v1, v1, v17, v18
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v19
-; GFX7-NEXT: v_fma_f32 v0, v0, v16, v17
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v18
+; GFX7-NEXT: v_fma_f64 v[17:18], v[35:36], v[33:34], v[17:18]
+; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v35, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v16
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v33
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v35
+; GFX7-NEXT: v_cvt_f32_f64_e32 v16, v[31:32]
+; GFX7-NEXT: v_fma_f64 v[0:1], v[35:36], v[33:34], v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v16
+; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[17:18]
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -53162,783 +53434,1146 @@ define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bf
; GCN-LABEL: v_fma_v32bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v47, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_fma_f32 v31, v31, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[31:32], v31
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v0
+; GCN-NEXT: v_fma_f64 v[0:1], v[35:36], v[33:34], v[31:32]
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:252
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v33
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v30, v30, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v34
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[37:38], v1
+; GCN-NEXT: v_fma_f64 v[0:1], v[33:34], v[37:38], v[35:36]
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v29, v29, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[29:30], v29
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[37:38], v0
+; GCN-NEXT: v_fma_f64 v[0:1], v[29:30], v[37:38], v[35:36]
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v28, v28, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[35:36], v28
+; GCN-NEXT: v_cvt_f64_f32_e32 v[37:38], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[48:49], v0
+; GCN-NEXT: v_fma_f64 v[35:36], v[35:36], v[48:49], v[37:38]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v27, v27, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[27:28], v27
+; GCN-NEXT: v_cvt_f64_f32_e32 v[37:38], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[48:49], v0
+; GCN-NEXT: v_fma_f64 v[27:28], v[27:28], v[48:49], v[37:38]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v26, v26, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[37:38], v26
+; GCN-NEXT: v_cvt_f64_f32_e32 v[48:49], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[50:51], v0
+; GCN-NEXT: v_fma_f64 v[37:38], v[37:38], v[50:51], v[48:49]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v25, v25, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
+; GCN-NEXT: v_cvt_f64_f32_e32 v[48:49], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[50:51], v0
+; GCN-NEXT: v_fma_f64 v[25:26], v[25:26], v[50:51], v[48:49]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v24, v24, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[48:49], v24
+; GCN-NEXT: v_cvt_f64_f32_e32 v[50:51], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[52:53], v0
+; GCN-NEXT: v_fma_f64 v[48:49], v[48:49], v[52:53], v[50:51]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v23, v23, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
+; GCN-NEXT: v_cvt_f64_f32_e32 v[50:51], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[52:53], v0
+; GCN-NEXT: v_fma_f64 v[23:24], v[23:24], v[52:53], v[50:51]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v22, v22, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[50:51], v22
+; GCN-NEXT: v_cvt_f64_f32_e32 v[52:53], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[54:55], v0
+; GCN-NEXT: v_fma_f64 v[50:51], v[50:51], v[54:55], v[52:53]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v21, v21, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GCN-NEXT: v_cvt_f64_f32_e32 v[52:53], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[54:55], v0
+; GCN-NEXT: v_fma_f64 v[21:22], v[21:22], v[54:55], v[52:53]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v20, v20, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[52:53], v20
+; GCN-NEXT: v_cvt_f64_f32_e32 v[54:55], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[39:40], v0
+; GCN-NEXT: v_fma_f64 v[52:53], v[52:53], v[39:40], v[54:55]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v19, v19, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GCN-NEXT: v_cvt_f64_f32_e32 v[54:55], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[39:40], v0
+; GCN-NEXT: v_fma_f64 v[19:20], v[19:20], v[39:40], v[54:55]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v18, v18, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[54:55], v18
+; GCN-NEXT: v_cvt_f64_f32_e32 v[39:40], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[41:42], v0
+; GCN-NEXT: v_fma_f64 v[54:55], v[54:55], v[41:42], v[39:40]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v17, v17, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GCN-NEXT: v_cvt_f64_f32_e32 v[39:40], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[41:42], v0
+; GCN-NEXT: v_fma_f64 v[17:18], v[17:18], v[41:42], v[39:40]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v16, v16, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[39:40], v16
+; GCN-NEXT: v_cvt_f64_f32_e32 v[41:42], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[43:44], v0
+; GCN-NEXT: v_fma_f64 v[39:40], v[39:40], v[43:44], v[41:42]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v15, v15, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GCN-NEXT: v_cvt_f64_f32_e32 v[41:42], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[43:44], v0
+; GCN-NEXT: v_fma_f64 v[15:16], v[15:16], v[43:44], v[41:42]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v14, v14, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[41:42], v14
+; GCN-NEXT: v_cvt_f64_f32_e32 v[43:44], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[45:46], v0
+; GCN-NEXT: v_fma_f64 v[41:42], v[41:42], v[45:46], v[43:44]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v13, v13, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
+; GCN-NEXT: v_cvt_f64_f32_e32 v[43:44], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[45:46], v0
+; GCN-NEXT: v_fma_f64 v[13:14], v[13:14], v[45:46], v[43:44]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v12, v12, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[43:44], v12
+; GCN-NEXT: v_cvt_f64_f32_e32 v[45:46], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[56:57], v0
+; GCN-NEXT: v_fma_f64 v[43:44], v[43:44], v[56:57], v[45:46]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v11, v11, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; GCN-NEXT: v_cvt_f64_f32_e32 v[45:46], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[56:57], v0
+; GCN-NEXT: v_fma_f64 v[11:12], v[11:12], v[56:57], v[45:46]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v10, v10, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[45:46], v10
+; GCN-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[58:59], v0
+; GCN-NEXT: v_fma_f64 v[45:46], v[45:46], v[58:59], v[56:57]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v9, v9, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; GCN-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[58:59], v0
+; GCN-NEXT: v_fma_f64 v[9:10], v[9:10], v[58:59], v[56:57]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v8, v8, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[56:57], v8
+; GCN-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[60:61], v0
+; GCN-NEXT: v_fma_f64 v[56:57], v[56:57], v[60:61], v[58:59]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v7, v7, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
+; GCN-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[60:61], v0
+; GCN-NEXT: v_fma_f64 v[7:8], v[7:8], v[60:61], v[58:59]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v6, v6, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[58:59], v6
+; GCN-NEXT: v_cvt_f64_f32_e32 v[60:61], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v0
+; GCN-NEXT: v_fma_f64 v[58:59], v[58:59], v[62:63], v[60:61]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v5, v5, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GCN-NEXT: v_cvt_f64_f32_e32 v[60:61], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v0
+; GCN-NEXT: v_fma_f64 v[5:6], v[5:6], v[62:63], v[60:61]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v4, v4, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[60:61], v4
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[31:32], v0
+; GCN-NEXT: v_fma_f64 v[60:61], v[60:61], v[31:32], v[62:63]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v3, v3, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GCN-NEXT: v_cvt_f64_f32_e32 v[31:32], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v0
+; GCN-NEXT: v_fma_f64 v[3:4], v[3:4], v[62:63], v[31:32]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v2, v2, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[31:32], v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v0
+; GCN-NEXT: v_fma_f64 v[31:32], v[31:32], v[33:34], v[62:63]
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v1, v1, v32, v33
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v1
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GCN-NEXT: v_fma_f64 v[0:1], v[33:34], v[0:1], v[62:63]
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v47
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_fma_f32 v0, v0, v32, v33
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[33:34], v30
+; GCN-NEXT: v_cvt_f64_f32_e32 v[62:63], v29
+; GCN-NEXT: v_cvt_f64_f32_e32 v[29:30], v2
+; GCN-NEXT: v_fma_f64 v[29:30], v[33:34], v[29:30], v[62:63]
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f64_e32 v33, v[33:34]
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f64_e32 v34, v[62:63]
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f64_e32 v47, v[62:63]
+; GCN-NEXT: v_cvt_f32_f64_e32 v35, v[35:36]
+; GCN-NEXT: v_cvt_f32_f64_e32 v27, v[27:28]
+; GCN-NEXT: v_cvt_f32_f64_e32 v28, v[37:38]
+; GCN-NEXT: v_cvt_f32_f64_e32 v25, v[25:26]
+; GCN-NEXT: v_cvt_f32_f64_e32 v26, v[48:49]
+; GCN-NEXT: v_cvt_f32_f64_e32 v23, v[23:24]
+; GCN-NEXT: v_cvt_f32_f64_e32 v24, v[50:51]
+; GCN-NEXT: v_cvt_f32_f64_e32 v21, v[21:22]
+; GCN-NEXT: v_cvt_f32_f64_e32 v22, v[52:53]
+; GCN-NEXT: v_cvt_f32_f64_e32 v19, v[19:20]
+; GCN-NEXT: v_cvt_f32_f64_e32 v20, v[54:55]
+; GCN-NEXT: v_cvt_f32_f64_e32 v17, v[17:18]
+; GCN-NEXT: v_cvt_f32_f64_e32 v18, v[39:40]
+; GCN-NEXT: v_cvt_f32_f64_e32 v15, v[15:16]
+; GCN-NEXT: v_cvt_f32_f64_e32 v16, v[41:42]
+; GCN-NEXT: v_cvt_f32_f64_e32 v13, v[13:14]
+; GCN-NEXT: v_cvt_f32_f64_e32 v14, v[43:44]
+; GCN-NEXT: v_cvt_f32_f64_e32 v11, v[11:12]
+; GCN-NEXT: v_cvt_f32_f64_e32 v12, v[45:46]
+; GCN-NEXT: v_cvt_f32_f64_e32 v9, v[9:10]
+; GCN-NEXT: v_cvt_f32_f64_e32 v10, v[56:57]
+; GCN-NEXT: v_cvt_f32_f64_e32 v7, v[7:8]
+; GCN-NEXT: v_cvt_f32_f64_e32 v8, v[58:59]
+; GCN-NEXT: v_cvt_f32_f64_e32 v5, v[5:6]
+; GCN-NEXT: v_cvt_f32_f64_e32 v6, v[60:61]
+; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[3:4]
+; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[31:32]
+; GCN-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[29:30]
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v10
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v47
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v34
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v33
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_v32bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256
+; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX7-NEXT: v_mov_b32_e32 v47, v0
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:256
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v31
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v0
+; GFX7-NEXT: v_fma_f64 v[0:1], v[35:36], v[33:34], v[31:32]
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v30
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:252
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[37:38], v1
+; GFX7-NEXT: v_fma_f64 v[0:1], v[33:34], v[37:38], v[35:36]
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v29
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:248
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v29
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[29:30], v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v31
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[37:38], v1
+; GFX7-NEXT: v_fma_f64 v[0:1], v[29:30], v[37:38], v[35:36]
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116
+; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[35:36], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v27
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[48:49], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112
+; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[37:38], v28
+; GFX7-NEXT: v_fma_f64 v[35:36], v[35:36], v[48:49], v[37:38]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[48:49], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[50:51], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v25
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[37:38], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
+; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[27:28], v27
+; GFX7-NEXT: v_fma_f64 v[27:28], v[48:49], v[37:38], v[27:28]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v27, v[27:28]
+; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[48:49], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
+; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:232
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[37:38], v26
+; GFX7-NEXT: v_fma_f64 v[37:38], v[50:51], v[48:49], v[37:38]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[50:51], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v24
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[52:53], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v28, v[37:38]
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[48:49], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100
+; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:228
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[25:26], v25
+; GFX7-NEXT: v_fma_f64 v[25:26], v[50:51], v[48:49], v[25:26]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v25, v[25:26]
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[50:51], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
+; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[48:49], v24
+; GFX7-NEXT: v_fma_f64 v[48:49], v[52:53], v[50:51], v[48:49]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[52:53], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[54:55], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v26, v[48:49]
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[50:51], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:220
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[23:24], v23
+; GFX7-NEXT: v_fma_f64 v[23:24], v[52:53], v[50:51], v[23:24]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v23, v[23:24]
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[52:53], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
+; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[50:51], v22
+; GFX7-NEXT: v_fma_f64 v[50:51], v[54:55], v[52:53], v[50:51]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[54:55], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[39:40], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v24, v[50:51]
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[52:53], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84
+; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:212
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21
+; GFX7-NEXT: v_fma_f64 v[21:22], v[54:55], v[52:53], v[21:22]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v21, v[21:22]
+; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[54:55], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80
+; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[52:53], v20
+; GFX7-NEXT: v_fma_f64 v[52:53], v[39:40], v[54:55], v[52:53]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[39:40], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[41:42], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v17
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v22, v[52:53]
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[54:55], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76
+; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[19:20], v19
+; GFX7-NEXT: v_fma_f64 v[19:20], v[39:40], v[54:55], v[19:20]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v19, v[19:20]
+; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[39:40], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72
+; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[54:55], v18
+; GFX7-NEXT: v_fma_f64 v[54:55], v[41:42], v[39:40], v[54:55]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[41:42], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[43:44], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v20, v[54:55]
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[39:40], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68
+; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:196
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[17:18], v17
+; GFX7-NEXT: v_fma_f64 v[17:18], v[41:42], v[39:40], v[17:18]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v17, v[17:18]
+; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[41:42], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
+; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[39:40], v16
+; GFX7-NEXT: v_fma_f64 v[39:40], v[43:44], v[41:42], v[39:40]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[43:44], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[45:46], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v18, v[39:40]
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[41:42], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60
+; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188
+; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[15:16], v15
+; GFX7-NEXT: v_fma_f64 v[15:16], v[43:44], v[41:42], v[15:16]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v15, v[15:16]
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[43:44], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56
+; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184
; GFX7-NEXT: s_waitcnt vmcnt(2)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_fma_f32 v31, v31, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v30, v30, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248
-; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v29, v29, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244
-; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v28, v28, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240
-; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v27, v27, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236
-; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v26, v26, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232
-; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v25, v25, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v24, v24, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224
-; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v23, v23, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220
-; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v22, v22, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v21, v21, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212
-; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v20, v20, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208
-; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v19, v19, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204
-; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v18, v18, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200
-; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v17, v17, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196
-; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v16, v16, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192
-; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v15, v15, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v14, v14, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[41:42], v14
+; GFX7-NEXT: v_fma_f64 v[41:42], v[45:46], v[43:44], v[41:42]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[45:46], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[56:57], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v16, v[41:42]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v13, v13, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[43:44], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52
+; GFX7-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[13:14], v13
+; GFX7-NEXT: v_fma_f64 v[13:14], v[45:46], v[43:44], v[13:14]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v13, v[13:14]
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v12, v12, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[45:46], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48
+; GFX7-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[43:44], v12
+; GFX7-NEXT: v_fma_f64 v[43:44], v[56:57], v[45:46], v[43:44]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[56:57], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v10
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[58:59], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v14, v[43:44]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v11, v11, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[45:46], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44
+; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
+; GFX7-NEXT: v_fma_f64 v[11:12], v[56:57], v[45:46], v[11:12]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v11, v[11:12]
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v10, v10, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40
+; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[45:46], v10
+; GFX7-NEXT: v_fma_f64 v[45:46], v[58:59], v[56:57], v[45:46]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[58:59], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[60:61], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v12, v[45:46]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v9, v9, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[56:57], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36
+; GFX7-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
+; GFX7-NEXT: v_fma_f64 v[9:10], v[58:59], v[56:57], v[9:10]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v9, v[9:10]
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v8, v8, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32
+; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[56:57], v8
+; GFX7-NEXT: v_fma_f64 v[56:57], v[60:61], v[58:59], v[56:57]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[60:61], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v6
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[62:63], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v10, v[56:57]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v7, v7, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[58:59], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
+; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:156
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
+; GFX7-NEXT: v_fma_f64 v[7:8], v[60:61], v[58:59], v[7:8]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v7, v[7:8]
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v6, v6, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[60:61], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24
+; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[58:59], v6
+; GFX7-NEXT: v_fma_f64 v[58:59], v[62:63], v[60:61], v[58:59]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[62:63], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v8, v[58:59]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v5, v5, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[60:61], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20
+; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
+; GFX7-NEXT: v_fma_f64 v[5:6], v[62:63], v[60:61], v[5:6]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v5, v[5:6]
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v4, v4, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[62:63], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16
+; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[60:61], v4
+; GFX7-NEXT: v_fma_f64 v[60:61], v[31:32], v[62:63], v[60:61]
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[62:63], v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f32_f64_e32 v6, v[60:61]
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v3, v3, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v1
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3
+; GFX7-NEXT: v_fma_f64 v[3:4], v[62:63], v[31:32], v[3:4]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[3:4]
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v10
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v16
+; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v18
+; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v20
+; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v24
+; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v26
+; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v2, v2, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[62:63], v1
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX7-NEXT: v_fma_f64 v[62:63], v[0:1], v[62:63], v[31:32]
+; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8
+; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v1, v1, v32, v33
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v0
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v29
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v30
+; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:4
+; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:132
+; GFX7-NEXT: s_waitcnt vmcnt(2)
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v47
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_fma_f64 v[0:1], v[33:34], v[31:32], v[0:1]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v1, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[31:32], v30
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[33:34], v29
+; GFX7-NEXT: v_cvt_f64_f32_e32 v[29:30], v2
+; GFX7-NEXT: v_fma_f64 v[29:30], v[29:30], v[33:34], v[31:32]
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[29:30]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v29, v[62:63]
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f64_e32 v31, v[31:32]
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f64_e32 v32, v[32:33]
+; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v32
+; GFX7-NEXT: s_waitcnt vmcnt(14)
+; GFX7-NEXT: v_cvt_f32_f64_e32 v33, v[33:34]
+; GFX7-NEXT: v_cvt_f32_f64_e32 v34, v[35:36]
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v33
+; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v34
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_fma_f32 v0, v0, v32, v33
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_v32bf16:
diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll
index 1037d8e20cc10..bd5961a2d7c0e 100644
--- a/llvm/test/CodeGen/Generic/half-op.ll
+++ b/llvm/test/CodeGen/Generic/half-op.ll
@@ -8,7 +8,7 @@
; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %}
; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
-; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN %}
; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
; FIXME: BPF has a compiler error
More information about the llvm-commits
mailing list