[llvm] Adding support for G_STRICT_FMA in new reg bank select (PR #170330)

Abhinav Garg via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 2 09:16:02 PST 2025


https://github.com/abhigargrepo created https://github.com/llvm/llvm-project/pull/170330

This patch adds register bank legalization rules for the G_STRICT_FMA opcode.

>From 8da77aaa3095644cdab937f1640994d459a9366e Mon Sep 17 00:00:00 2001
From: Abhinav Garg <abhigarg at amd.com>
Date: Tue, 2 Dec 2025 15:10:42 +0000
Subject: [PATCH] Adding support for G_STRICT_FMA in new reg bank select

---
 .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp    |    3 +
 .../AMDGPU/AMDGPURegBankLegalizeRules.cpp     |   16 +
 .../AMDGPU/GlobalISel/strict_fma.f16.ll       |  821 ++++++++++--
 .../AMDGPU/GlobalISel/strict_fma.f32.ll       |  966 +++++++++++++-
 .../AMDGPU/GlobalISel/strict_fma.f64.ll       | 1119 ++++++++++++++++-
 5 files changed, 2696 insertions(+), 229 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 123fc5bf37a19..dd9ea1e11a9af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -910,6 +910,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
     return LLT::fixed_vector(2, 16);
   case SgprV2S32:
   case VgprV2S32:
+  case UniInVgprV2S32:
     return LLT::fixed_vector(2, 32);
   case SgprV4S32:
   case SgprV4S32_WF:
@@ -1013,6 +1014,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
   case UniInVgprS32:
   case UniInVgprS64:
   case UniInVgprV2S16:
+  case UniInVgprV2S32:
   case UniInVgprV4S32:
   case UniInVgprB32:
   case UniInVgprB64:
@@ -1148,6 +1150,7 @@ void RegBankLegalizeHelper::applyMappingDst(
     case UniInVgprS32:
     case UniInVgprS64:
     case UniInVgprV2S16:
+    case UniInVgprV2S32:
     case UniInVgprV4S32: {
       assert(Ty == getTyFromID(MethodIDs[OpIdx]));
       assert(RB == SgprRB);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 0c1daefd493b6..feda4058149ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -120,6 +120,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
   case UniV2S16:
     return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
+  case UniV2S32:
+    return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
   case UniB32:
     return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
   case UniB64:
@@ -160,6 +162,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
     return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
   case DivV2S16:
     return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
+  case DivV2S32:
+    return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
   case DivB32:
     return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
   case DivB64:
@@ -926,6 +930,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
   addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
       .Uni(S64, {{Sgpr64}, {}});
 
+  addRulesForGOpcs({G_STRICT_FMA}, Standard)
+    .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
+    .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
+    .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+    .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+    .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+    .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
+    .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
+    .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
+    .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}) 
+    .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}});
+  
   addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
 
   addRulesForGOpcs({G_GLOBAL_VALUE})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
index 15740ee5476e8..5fbbc8c8f8dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
@@ -1,152 +1,783 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-
-define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 {
-; GCN-LABEL: v_constained_fma_f16_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define void @v_constained_fma_f16_fpexcept_strict_uni(half inreg %x, half inreg %y, half inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f16 v2, s16, v2, v3
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f16 v2, s16, v2, v3
+; GFX900-NEXT:    global_store_short v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f16 v2, s0, v2, v3
+; GFX942-NEXT:    global_store_short v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b16_e32 v2.l, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f16 v2.l, s0, s1, v2.l
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f16 v2, s0, s1, v2
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret half %val
+  store half %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f16_fpexcept_strict_div(half %x, half %y, half %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT:    flat_store_short v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX900-NEXT:    global_store_short v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX942-NEXT:    global_store_short v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f16_fpexcept_strict_uni(<2 x half> inreg %x, <2 x half> inreg %y, <2 x half> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    s_lshr_b32 s5, s17, 16
+; GFX8-NEXT:    s_lshr_b32 s6, s18, 16
+; GFX8-NEXT:    v_fma_f16 v2, s16, v2, v3
+; GFX8-NEXT:    s_lshr_b32 s4, s16, 16
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v3, s6
+; GFX8-NEXT:    v_fma_f16 v2, s4, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s7
+; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX8-NEXT:    s_or_b32 s4, s5, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_pk_fma_f16 v2, s16, v2, v3
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_pk_fma_f16 v2, s0, v2, v3
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_fma_f16 v2, s0, s1, v2
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_fma_f16 v2, s0, s1, v2
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x half> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f16_fpexcept_strict_div(<2 x half> %x, <2 x half> %y, <2 x half> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
+; GFX8-NEXT:    v_fma_f16 v1, v5, v6, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <2 x half> %val
+  store <2 x half> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
+;define void @v_constained_fma_v3f16_fpexcept_strict_uni(<3 x half> inreg %x, <3 x half> inreg %y, <3 x half> inreg %z, ptr addrspace(1) %out) #0 {
+;  %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;  store <3 x half> %val, ptr addrspace(1) %out
+;  ret void
+;}
+
+define void @v_constained_fma_v3f16_fpexcept_strict_div(<3 x half> %x, <3 x half> %y, <3 x half> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f16 v2, v6, v7, v8
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX8-NEXT:    flat_store_short v[6:7], v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 2, v6
+; GFX8-NEXT:    v_fma_f16 v2, v8, v9, v10
+; GFX8-NEXT:    v_fma_f16 v3, v1, v3, v5
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT:    flat_store_short v[0:1], v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-  %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <3 x half> %val
-}
-
-define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
+;
+; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
 ; GFX9-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT:    global_store_short v[6:7], v0, off
+; GFX9-NEXT:    global_store_short_d16_hi v[6:7], v0, off offset:2
+; GFX9-NEXT:    global_store_short v[6:7], v1, off offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
+; GFX11-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX11-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    global_store_b16 v[6:7], v0, off
+; GFX11-NEXT:    global_store_d16_hi_b16 v[6:7], v0, off offset:2
+; GFX11-NEXT:    global_store_b16 v[6:7], v1, off offset:4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_fma_f16 v0, v0, v2, v4
+; GFX12-NEXT:    v_pk_fma_f16 v1, v1, v3, v5
+; GFX12-NEXT:    s_clause 0x2
+; GFX12-NEXT:    global_store_b16 v[6:7], v0, off
+; GFX12-NEXT:    global_store_d16_hi_b16 v[6:7], v0, off offset:2
+; GFX12-NEXT:    global_store_b16 v[6:7], v1, off offset:4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <3 x half> %val, ptr addrspace(1) %out
+  ret void
+}
+
+;define void @v_constained_fma_v4f16_fpexcept_strict_uni(<4 x half> inreg %x, <4 x half> inreg %y, <4 x half> inreg %z, ptr addrspace(1) %out) #0 {
+;  %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;  store <4 x half> %val, ptr addrspace(1) %out
+;  ret void
+;}
+;
+;define void @v_constained_fma_v4f16_fpexcept_strict_div(<4 x half> %x, <4 x half> %y, <4 x half> %z, ptr addrspace(1) %out) #0 {
+;  %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;  store <4 x half> %val, ptr addrspace(1) %out
+;  ret void
+;}
+
+define void @v_constained_fma_f16_fpexcept_strict_fneg_uni(half inreg %x, half inreg %y, half inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_uni:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX8-NEXT:    v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT:    v_fma_f16 v2, v6, v8, v10
-; GFX8-NEXT:    v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT:    v_fma_f16 v3, v7, v9, v11
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    v_mov_b32_e32 v3, s17
+; GFX8-NEXT:    v_fma_f16 v2, v2, v3, -s18
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-  %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <4 x half> %val
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s16
+; GFX900-NEXT:    v_mov_b32_e32 v3, s17
+; GFX900-NEXT:    v_fma_f16 v2, v2, v3, -s18
+; GFX900-NEXT:    global_store_short v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_fma_f16 v2, v2, v3, -s2
+; GFX942-NEXT:    global_store_short v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b16_e32 v2.l, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f16 v2.l, s0, v2.l, -s2
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f16 v2, s0, v2, -s2
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.z = fneg half %z
+  %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store half %val, ptr addrspace(1) %out
+  ret void
 }
 
-define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z) #0 {
-; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f16 v0, v0, v1, -v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f16_fpexcept_strict_fneg_div(half %x, half %y, half %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX8-NEXT:    flat_store_short v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX900-NEXT:    global_store_short v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX942-NEXT:    global_store_short v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f16 v0, v0, v1, -v2
+; GFX12-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg half %z
   %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret half %val
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f16_fpexcept_strict_fneg_fneg_uni(half inreg %x, half inreg %y, half inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f16 v2, -s16, -v2, v3
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f16 v2, -s16, -v2, v3
+; GFX900-NEXT:    global_store_short v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f16 v2, -s0, -v2, v3
+; GFX942-NEXT:    global_store_short v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b16_e32 v2.l, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f16 v2.l, -s0, -s1, v2.l
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f16 v2, -s0, -s1, v2
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg half %x
+  %neg.y = fneg half %y
+  %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store half %val, ptr addrspace(1) %out
+  ret void
 }
 
-define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, half %z) #0 {
-; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f16 v0, -v0, -v1, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f16_fpexcept_strict_fneg_fneg_div(half %x, half %y, half %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX8-NEXT:    flat_store_short v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX900-NEXT:    global_store_short v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX942-NEXT:    global_store_short v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f16 v0.l, -v0.l, -v1.l, v2.l
+; GFX11-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f16 v0, -v0, -v1, v2
+; GFX12-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg half %x
   %neg.y = fneg half %y
   %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret half %val
+  store half %val, ptr addrspace(1) %out
+  ret void
 }
 
-define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, half %z) #0 {
-; GCN-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f16_fpexcept_strict_fabs_fabs_uni(half inreg %x, half inreg %y, half inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f16 v2, |s16|, |v2|, v3
+; GFX8-NEXT:    flat_store_short v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f16 v2, |s16|, |v2|, v3
+; GFX900-NEXT:    global_store_short v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f16 v2, |s0|, |v2|, v3
+; GFX942-NEXT:    global_store_short v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b16_e32 v2.l, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f16 v2.l, |s0|, |s1|, v2.l
+; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f16 v2, |s0|, |s1|, v2
+; GFX12-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call half @llvm.fabs.f16(half %x) #0
   %neg.y = call half @llvm.fabs.f16(half %y) #0
   %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret half %val
+  store half %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x half> @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f16_fpexcept_strict_fabs_fabs_div(half %x, half %y, half %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GFX8-NEXT:    flat_store_short v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GFX900-NEXT:    global_store_short v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GFX942-NEXT:    global_store_short v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg:
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f16 v0.l, |v0.l|, |v1.l|, v2.l
+; GFX11-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f16 v0, |v0|, |v1|, v2
+; GFX12-NEXT:    global_store_b16 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = call half @llvm.fabs.f16(half %x) #0
+  %neg.y = call half @llvm.fabs.f16(half %y) #0
+  %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store half %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_uni(<2 x half> inreg %x, <2 x half> inreg %y, <2 x half> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    s_lshr_b32 s7, s5, 16
+; GFX8-NEXT:    s_lshr_b32 s8, s18, 16
+; GFX8-NEXT:    v_fma_f16 v2, s4, v2, v3
+; GFX8-NEXT:    s_lshr_b32 s6, s4, 16
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s7
+; GFX8-NEXT:    v_mov_b32_e32 v3, s8
+; GFX8-NEXT:    v_fma_f16 v2, s6, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX8-NEXT:    s_or_b32 s4, s4, s5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_pk_fma_f16 v2, s16, v2, v3 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_pk_fma_f16 v2, s0, v2, v3 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_fma_f16 v2, s0, s1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_fma_f16 v2, s0, s1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg <2 x half> %x
+  %neg.y = fneg <2 x half> %y
+  %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %neg.x, <2 x half> %neg.y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x half> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_div(<2 x half> %x, <2 x half> %y, <2 x half> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_div:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; GFX8-NEXT:    v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT:    v_fma_f16 v1, v3, v4, v5
+; GFX8-NEXT:    v_fma_f16 v1, v5, v6, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f16_fpexcept_strict_fneg_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x half> %x
   %neg.y = fneg <2 x half> %y
   %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %neg.x, <2 x half> %neg.y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <2 x half> %val
+  store <2 x half> %val, ptr addrspace(1) %out
+  ret void
 }
 
 declare half @llvm.fabs.f16(half)
@@ -156,3 +787,5 @@ declare <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half>, <3 x hal
 declare <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half>, <4 x half>, <4 x half>, metadata, metadata)
 
 attributes #0 = { strictfp }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll
index 5955c590e9d1c..2bf4816e2ca5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll
@@ -1,98 +1,927 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
-
-define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z) #0 {
-; GCN-LABEL: v_constained_fma_f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define void @v_constained_fma_f32_fpexcept_strict_uni(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f32 v2, s0, v2, v3
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, s0, s1, v2
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f32 v2, s0, s1, v2
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f32_fpexcept_strict_div(float %x, float %y, float %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret float %val
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f32_fpexcept_strict_uni(<2 x float> inreg %x, <2 x float> inreg %y, <2 x float> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, s20
+; GFX8-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NEXT:    v_fma_f32 v2, s17, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s18
+; GFX900-NEXT:    v_mov_b32_e32 v3, s20
+; GFX900-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s19
+; GFX900-NEXT:    v_mov_b32_e32 v3, s21
+; GFX900-NEXT:    v_fma_f32 v2, s17, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s5
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
+; GFX942-NEXT:    v_pk_fma_f32 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v2, s0, s2, v2
+; GFX11-NEXT:    v_fma_f32 v3, s1, s3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fma_f32 v2, s0, s2, v2
+; GFX12-NEXT:    v_fma_f32 v3, s1, s3, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x float> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x float> @v_constained_fma_v2f32_fpexcept_strict(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
-; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, v0, v2, v4
-; GCN-NEXT:    v_fma_f32 v1, v1, v3, v5
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_v2f32_fpexcept_strict_div(<2 x float> %x, <2 x float> %y, <2 x float> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f32_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX8-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f32_fpexcept_strict_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX900-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX900-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f32_fpexcept_strict_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f32_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX11-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX11-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f32_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, v0, v2, v4
+; GFX12-NEXT:    v_fma_f32 v1, v1, v3, v5
+; GFX12-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <2 x float> %val
+  store <2 x float> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <3 x float> @v_constained_fma_v3f32_fpexcept_strict(<3 x float> %x, <3 x float> %y, <3 x float> %z) #0 {
-; GCN-LABEL: v_constained_fma_v3f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, v0, v3, v6
-; GCN-NEXT:    v_fma_f32 v1, v1, v4, v7
-; GCN-NEXT:    v_fma_f32 v2, v2, v5, v8
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_v3f32_fpexcept_strict_uni(<3 x float> inreg %x, <3 x float> inreg %y, <3 x float> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v3f32_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    v_mov_b32_e32 v3, s22
+; GFX8-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    v_mov_b32_e32 v3, s23
+; GFX8-NEXT:    v_fma_f32 v2, s17, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s21
+; GFX8-NEXT:    v_mov_b32_e32 v3, s24
+; GFX8-NEXT:    v_fma_f32 v2, s18, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    flat_store_dwordx3 v[0:1], v[2:4]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v3f32_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s19
+; GFX900-NEXT:    v_mov_b32_e32 v3, s22
+; GFX900-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s20
+; GFX900-NEXT:    v_mov_b32_e32 v3, s23
+; GFX900-NEXT:    v_fma_f32 v2, s17, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s21
+; GFX900-NEXT:    v_mov_b32_e32 v3, s24
+; GFX900-NEXT:    v_fma_f32 v2, s18, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s5
+; GFX900-NEXT:    v_mov_b32_e32 v4, s6
+; GFX900-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v3f32_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s4, s3
+; GFX942-NEXT:    s_mov_b32 s5, s16
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[18:19]
+; GFX942-NEXT:    s_mov_b32 s6, s17
+; GFX942-NEXT:    v_pk_fma_f32 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT:    v_pk_fma_f32 v[2:3], s[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v3f32_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v2, s0, s3, v2
+; GFX11-NEXT:    v_fma_f32 v3, s1, s16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    v_mov_b32_e32 v4, s20
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v4, s2, s17, v4
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v3f32_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fma_f32 v2, s0, s3, v2
+; GFX12-NEXT:    v_fma_f32 v3, s1, s16, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    v_mov_b32_e32 v4, s20
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f32 v4, s2, s17, v4
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    global_store_b96 v[0:1], v[2:4], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float> %x, <3 x float> %y, <3 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <3 x float> %val
+  store <3 x float> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <4 x float> @v_constained_fma_v4f32_fpexcept_strict(<4 x float> %x, <4 x float> %y, <4 x float> %z) #0 {
-; GCN-LABEL: v_constained_fma_v4f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, v0, v4, v8
-; GCN-NEXT:    v_fma_f32 v1, v1, v5, v9
-; GCN-NEXT:    v_fma_f32 v2, v2, v6, v10
-; GCN-NEXT:    v_fma_f32 v3, v3, v7, v11
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <4 x float> %val
+define void @v_constained_fma_v3f32_fpexcept_strict_div(<3 x float> %x, <3 x float> %y, <3 x float> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v3f32_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GFX8-NEXT:    v_fma_f32 v1, v1, v4, v7
+; GFX8-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX8-NEXT:    flat_store_dwordx3 v[9:10], v[0:2]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v3f32_fpexcept_strict_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GFX900-NEXT:    v_fma_f32 v1, v1, v4, v7
+; GFX900-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX900-NEXT:    global_store_dwordx3 v[9:10], v[0:2], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v3f32_fpexcept_strict_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, v3
+; GFX942-NEXT:    v_mov_b32_e32 v13, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, v5
+; GFX942-NEXT:    v_mov_b32_e32 v14, v9
+; GFX942-NEXT:    v_mov_b32_e32 v15, v10
+; GFX942-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], v[12:13], v[6:7]
+; GFX942-NEXT:    v_pk_fma_f32 v[2:3], v[2:3], v[4:5], v[8:9]
+; GFX942-NEXT:    global_store_dwordx3 v[14:15], v[0:2], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v3f32_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GFX11-NEXT:    v_fma_f32 v1, v1, v4, v7
+; GFX11-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX11-NEXT:    global_store_b96 v[9:10], v[0:2], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v3f32_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, v0, v3, v6
+; GFX12-NEXT:    v_fma_f32 v1, v1, v4, v7
+; GFX12-NEXT:    v_fma_f32 v2, v2, v5, v8
+; GFX12-NEXT:    global_store_b96 v[9:10], v[0:2], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float> %x, <3 x float> %y, <3 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <3 x float> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define float @v_constained_fma_f32_fpexcept_strict_fneg(float %x, float %y, float %z) #0 {
-; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, v0, v1, -v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+;define void @v_constained_fma_v4f32_fpexcept_strict_uni(<4 x float> inreg %x, <4 x float> inreg %y, <4 x float> inreg %z, ptr addrspace(1) %out) #0 {
+;  %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;  store <4 x float> %val, ptr addrspace(1) %out
+;  ret void
+;}
+;
+;define void @v_constained_fma_v4f32_fpexcept_strict_div(<4 x float> %x, <4 x float> %y, <4 x float> %z, ptr addrspace(1) %out) #0 {
+;  %val = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+;  store <4 x float> %val, ptr addrspace(1) %out
+;  ret void
+;}
+
+define void @v_constained_fma_f32_fpexcept_strict_fneg_uni(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    v_mov_b32_e32 v3, s17
+; GFX8-NEXT:    v_fma_f32 v2, v2, v3, -s18
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s16
+; GFX900-NEXT:    v_mov_b32_e32 v3, s17
+; GFX900-NEXT:    v_fma_f32 v2, v2, v3, -s18
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_fma_f32 v2, v2, v3, -s2
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, s0, v2, -s2
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f32 v2, s0, s1, -v2
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg float %z
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret float %val
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f32_fpexcept_strict_fneg_div(float %x, float %y, float %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, v0, v1, -v2
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.z = fneg float %z
+  %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f32_fpexcept_strict_fneg_fneg_uni(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f32 v2, -s16, -v2, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f32 v2, -s16, -v2, v3
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f32 v2, -s0, -v2, v3
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, -s0, -s1, v2
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f32 v2, -s0, -v2, s2
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg float %x
+  %neg.y = fneg float %y
+  %val = call float @llvm.experimental.constrained.fma.f32(float %neg.x, float %neg.y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %val, ptr addrspace(1) %out
+  ret void
 }
 
-define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y, float %z) #0 {
-; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, -v0, -v1, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f32_fpexcept_strict_fneg_fneg_div(float %x, float %y, float %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_fneg_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, -v0, -v1, v2
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg float %x
   %neg.y = fneg float %y
   %val = call float @llvm.experimental.constrained.fma.f32(float %neg.x, float %neg.y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret float %val
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f32_fpexcept_strict_fabs_fabs_uni(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f32 v2, |s16|, |v2|, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f32 v2, |s16|, |v2|, v3
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f32 v2, |s0|, |v2|, v3
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, |s0|, |s1|, v2
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f32 v2, |s0|, |v2|, s2
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = call float @llvm.fabs.f32(float %x) #0
+  %neg.y = call float @llvm.fabs.f32(float %y) #0
+  %val = call float @llvm.experimental.constrained.fma.f32(float %neg.x, float %neg.y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %val, ptr addrspace(1) %out
+  ret void
 }
 
-define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y, float %z) #0 {
-; GCN-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f32_fpexcept_strict_fabs_fabs_div(float %x, float %y, float %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_fabs_fabs_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, |v0|, |v1|, v2
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call float @llvm.fabs.f32(float %x) #0
   %neg.y = call float @llvm.fabs.f32(float %y) #0
   %val = call float @llvm.experimental.constrained.fma.f32(float %neg.x, float %neg.y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret float %val
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_uni(<2 x float> inreg %x, <2 x float> inreg %y, <2 x float> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_fma_f32 v2, -s16, -v2, v4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s21
+; GFX8-NEXT:    v_fma_f32 v2, -s17, -v3, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s18
+; GFX900-NEXT:    v_mov_b32_e32 v4, s20
+; GFX900-NEXT:    v_fma_f32 v2, -s16, -v2, v4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s19
+; GFX900-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s21
+; GFX900-NEXT:    v_fma_f32 v2, -s17, -v3, v2
+; GFX900-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s5
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_pk_fma_f32 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f32 v2, -s0, -s2, v2
+; GFX11-NEXT:    v_fma_f32 v3, -s1, -s3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fma_f32 v2, -s0, -v2, s16
+; GFX12-NEXT:    v_fma_f32 v3, -s1, -v3, s17
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg <2 x float> %x
+  %neg.y = fneg <2 x float> %y
+  %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %neg.x, <2 x float> %neg.y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x float> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x float> @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float> %x, <2 x float> %y, <2 x float> %z) #0 {
-; GCN-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, -v0, -v2, v4
-; GCN-NEXT:    v_fma_f32 v1, -v1, -v3, v5
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_div(<2 x float> %x, <2 x float> %y, <2 x float> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, -v0, -v2, v4
+; GFX8-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, -v0, -v2, v4
+; GFX900-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GFX900-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX942-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX942-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX942-NEXT:    v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, -v0, -v2, v4
+; GFX11-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GFX11-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f32_fpexcept_strict_fneg_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, -v0, -v2, v4
+; GFX12-NEXT:    v_fma_f32 v1, -v1, -v3, v5
+; GFX12-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x float> %x
   %neg.y = fneg <2 x float> %y
   %val = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %neg.x, <2 x float> %neg.y, <2 x float> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <2 x float> %val
+  store <2 x float> %val, ptr addrspace(1) %out
+  ret void
 }
 
 declare float @llvm.fabs.f32(float)
@@ -102,3 +931,6 @@ declare <3 x float> @llvm.experimental.constrained.fma.v3f32(<3 x float>, <3 x f
 declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
 
 attributes #0 = { strictfp }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll
index 04a07c42c934c..3a581d384f1f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll
@@ -1,98 +1,1079 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
-
-define double @v_constained_fma_f64_fpexcept_strict(double %x, double %y, double %z) #0 {
-; GCN-LABEL: v_constained_fma_f64_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define void @v_constained_fma_f64_fpexcept_strict_uni(double inreg %x, double inreg %y, double inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    v_mov_b32_e32 v5, s21
+; GFX8-NEXT:    v_fma_f64 v[2:3], s[16:17], v[2:3], v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f64_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s18
+; GFX900-NEXT:    v_mov_b32_e32 v4, s20
+; GFX900-NEXT:    v_mov_b32_e32 v3, s19
+; GFX900-NEXT:    v_mov_b32_e32 v5, s21
+; GFX900-NEXT:    v_fma_f64 v[2:3], s[16:17], v[2:3], v[4:5]
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f64_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
+; GFX942-NEXT:    v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f64 v[2:3], s[0:1], s[2:3], v[2:3]
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret double %val
+  store double %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x double> @v_constained_fma_v2f64_fpexcept_strict(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
-; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
-; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f64_fpexcept_strict_div(double %x, double %y, double %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_f64_fpexcept_strict_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX11-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f64_fpexcept_strict_uni(<2 x double> inreg %x, <2 x double> inreg %y, <2 x double> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f64_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    v_mov_b32_e32 v4, s24
+; GFX8-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NEXT:    v_mov_b32_e32 v5, s25
+; GFX8-NEXT:    v_fma_f64 v[2:3], s[16:17], v[2:3], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    v_mov_b32_e32 v6, s26
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    v_mov_b32_e32 v7, s27
+; GFX8-NEXT:    v_fma_f64 v[4:5], s[18:19], v[4:5], v[6:7]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f64_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s20
+; GFX900-NEXT:    v_mov_b32_e32 v4, s24
+; GFX900-NEXT:    v_mov_b32_e32 v3, s21
+; GFX900-NEXT:    v_mov_b32_e32 v5, s25
+; GFX900-NEXT:    v_fma_f64 v[2:3], s[16:17], v[2:3], v[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, s22
+; GFX900-NEXT:    v_mov_b32_e32 v6, s26
+; GFX900-NEXT:    v_mov_b32_e32 v5, s23
+; GFX900-NEXT:    v_mov_b32_e32 v7, s27
+; GFX900-NEXT:    v_fma_f64 v[4:5], s[18:19], v[4:5], v[6:7]
+; GFX900-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX900-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX900-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX900-NEXT:    v_mov_b32_e32 v2, s4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s5
+; GFX900-NEXT:    v_mov_b32_e32 v4, s6
+; GFX900-NEXT:    v_mov_b32_e32 v5, s7
+; GFX900-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f64_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[16:17]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GFX942-NEXT:    v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[22:23]
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GFX942-NEXT:    v_fma_f64 v[2:3], s[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f64_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23
+; GFX11-NEXT:    v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f64 v[4:5], s[2:3], s[18:19], v[4:5]
+; GFX11-NEXT:    v_fma_f64 v[2:3], s[0:1], s[16:17], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f64_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23
+; GFX12-NEXT:    v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fma_f64 v[4:5], s[2:3], s[18:19], v[4:5]
+; GFX12-NEXT:    v_fma_f64 v[2:3], s[0:1], s[16:17], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <2 x double> %val
+  store <2 x double> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <3 x double> @v_constained_fma_v3f64_fpexcept_strict(<3 x double> %x, <3 x double> %y, <3 x double> %z) #0 {
-; GCN-LABEL: v_constained_fma_v3f64_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
-; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
-; GCN-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_v2f64_fpexcept_strict_div(<2 x double> %x, <2 x double> %y, <2 x double> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f64_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX8-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_v2f64_fpexcept_strict_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX9-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f64_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX11-NEXT:    global_store_b128 v[12:13], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f64_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9]
+; GFX12-NEXT:    v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11]
+; GFX12-NEXT:    global_store_b128 v[12:13], v[0:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x double> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v3f64_fpexcept_strict_uni(<3 x double> inreg %x, <3 x double> inreg %y, <3 x double> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v3f64_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, s22
+; GFX8-NEXT:    v_mov_b32_e32 v8, s28
+; GFX8-NEXT:    v_mov_b32_e32 v7, s23
+; GFX8-NEXT:    v_mov_b32_e32 v9, s29
+; GFX8-NEXT:    v_fma_f64 v[6:7], s[16:17], v[6:7], v[8:9]
+; GFX8-NEXT:    v_mov_b32_e32 v8, s24
+; GFX8-NEXT:    v_mov_b32_e32 v9, s25
+; GFX8-NEXT:    v_fma_f64 v[8:9], s[18:19], v[8:9], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s26
+; GFX8-NEXT:    v_mov_b32_e32 v1, s27
+; GFX8-NEXT:    v_fma_f64 v[0:1], s[20:21], v[0:1], v[2:3]
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 16, v4
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[6:9]
+; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v3f64_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, s22
+; GFX900-NEXT:    v_mov_b32_e32 v8, s28
+; GFX900-NEXT:    v_mov_b32_e32 v7, s23
+; GFX900-NEXT:    v_mov_b32_e32 v9, s29
+; GFX900-NEXT:    v_fma_f64 v[6:7], s[16:17], v[6:7], v[8:9]
+; GFX900-NEXT:    v_mov_b32_e32 v8, s24
+; GFX900-NEXT:    v_mov_b32_e32 v9, s25
+; GFX900-NEXT:    v_fma_f64 v[8:9], s[18:19], v[8:9], v[0:1]
+; GFX900-NEXT:    v_mov_b32_e32 v0, s26
+; GFX900-NEXT:    v_mov_b32_e32 v1, s27
+; GFX900-NEXT:    v_fma_f64 v[0:1], s[20:21], v[0:1], v[2:3]
+; GFX900-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
+; GFX900-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off offset:16
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v3f64_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[24:25]
+; GFX942-NEXT:    v_fma_f64 v[2:3], s[0:1], v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[26:27]
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[20:21]
+; GFX942-NEXT:    v_fma_f64 v[2:3], s[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
+; GFX942-NEXT:    v_fma_f64 v[6:7], s[16:17], v[2:3], v[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v3f64_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s26 :: v_dual_mov_b32 v5, s27
+; GFX11-NEXT:    v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25
+; GFX11-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f64 v[4:5], s[2:3], s[20:21], v[4:5]
+; GFX11-NEXT:    v_fma_f64 v[2:3], s[0:1], s[18:19], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fma_f64 v[6:7], s[16:17], s[22:23], v[6:7]
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v3f64_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v4, s26 :: v_dual_mov_b32 v5, s27
+; GFX12-NEXT:    v_dual_mov_b32 v2, s24 :: v_dual_mov_b32 v3, s25
+; GFX12-NEXT:    v_dual_mov_b32 v6, s28 :: v_dual_mov_b32 v7, s29
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_fma_f64 v[4:5], s[2:3], s[20:21], v[4:5]
+; GFX12-NEXT:    v_fma_f64 v[2:3], s[0:1], s[18:19], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_fma_f64 v[6:7], s[16:17], s[22:23], v[6:7]
+; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT:    global_store_b64 v[0:1], v[6:7], off offset:16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double> %x, <3 x double> %y, <3 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <3 x double> %val
-}
-
-define <4 x double> @v_constained_fma_v4f64_fpexcept_strict(<4 x double> %x, <4 x double> %y, <4 x double> %z) #0 {
-; GCN-LABEL: v_constained_fma_v4f64_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
-; GCN-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
-; GCN-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
-; GCN-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+  store <3 x double> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v3f64_fpexcept_strict_div(<3 x double> %x, <3 x double> %y, <3 x double> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v3f64_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX8-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX8-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v18
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v19, vcc
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_v3f64_fpexcept_strict_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX9-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX9-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v3f64_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX11-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[18:19], v[0:3], off
+; GFX11-NEXT:    global_store_b64 v[18:19], v[4:5], off offset:16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v3f64_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], v[0:1], v[6:7], v[12:13]
+; GFX12-NEXT:    v_fma_f64 v[2:3], v[2:3], v[8:9], v[14:15]
+; GFX12-NEXT:    v_fma_f64 v[4:5], v[4:5], v[10:11], v[16:17]
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v[18:19], v[0:3], off
+; GFX12-NEXT:    global_store_b64 v[18:19], v[4:5], off offset:16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double> %x, <3 x double> %y, <3 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <3 x double> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v4f64_fpexcept_strict_uni(<4 x double> inreg %x, <4 x double> inreg %y, <4 x double> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v4f64_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v12, s24
+; GFX8-NEXT:    v_mov_b32_e32 v13, s25
+; GFX8-NEXT:    v_fma_f64 v[2:3], s[16:17], v[12:13], v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v12, s26
+; GFX8-NEXT:    v_mov_b32_e32 v13, s27
+; GFX8-NEXT:    v_fma_f64 v[4:5], s[18:19], v[12:13], v[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v12, s28
+; GFX8-NEXT:    v_mov_b32_e32 v13, s29
+; GFX8-NEXT:    v_fma_f64 v[6:7], s[20:21], v[12:13], v[6:7]
+; GFX8-NEXT:    v_fma_f64 v[8:9], s[22:23], v[0:1], v[8:9]
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v10
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v11, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[2:5]
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[6:9]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v4f64_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v12, s24
+; GFX900-NEXT:    v_mov_b32_e32 v13, s25
+; GFX900-NEXT:    v_fma_f64 v[2:3], s[16:17], v[12:13], v[2:3]
+; GFX900-NEXT:    v_mov_b32_e32 v12, s26
+; GFX900-NEXT:    v_mov_b32_e32 v13, s27
+; GFX900-NEXT:    v_fma_f64 v[4:5], s[18:19], v[12:13], v[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v12, s28
+; GFX900-NEXT:    v_mov_b32_e32 v13, s29
+; GFX900-NEXT:    v_fma_f64 v[6:7], s[20:21], v[12:13], v[6:7]
+; GFX900-NEXT:    v_fma_f64 v[8:9], s[22:23], v[0:1], v[8:9]
+; GFX900-NEXT:    global_store_dwordx4 v[10:11], v[2:5], off
+; GFX900-NEXT:    global_store_dwordx4 v[10:11], v[6:9], off offset:16
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v4f64_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], s[28:29]
+; GFX942-NEXT:    v_fma_f64 v[8:9], s[0:1], v[8:9], v[10:11]
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
+; GFX942-NEXT:    v_fma_f64 v[10:11], s[2:3], v[10:11], v[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GFX942-NEXT:    v_fma_f64 v[0:1], s[16:17], v[0:1], v[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GFX942-NEXT:    v_fma_f64 v[2:3], s[18:19], v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx4 v[6:7], v[8:11], off
+; GFX942-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off offset:16
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v4f64_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
+; GFX11-NEXT:    v_fma_f64 v[10:11], s[2:3], s[22:23], v[0:1]
+; GFX11-NEXT:    v_fma_f64 v[0:1], s[16:17], s[24:25], v[2:3]
+; GFX11-NEXT:    v_fma_f64 v[2:3], s[18:19], s[26:27], v[4:5]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT:    v_fma_f64 v[8:9], s[0:1], s[20:21], v[8:9]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[6:7], v[8:11], off
+; GFX11-NEXT:    global_store_b128 v[6:7], v[0:3], off offset:16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v4f64_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
+; GFX12-NEXT:    v_fma_f64 v[10:11], s[2:3], s[22:23], v[0:1]
+; GFX12-NEXT:    v_fma_f64 v[0:1], s[16:17], s[24:25], v[2:3]
+; GFX12-NEXT:    v_fma_f64 v[2:3], s[18:19], s[26:27], v[4:5]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT:    v_fma_f64 v[8:9], s[0:1], s[20:21], v[8:9]
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v[6:7], v[8:11], off
+; GFX12-NEXT:    global_store_b128 v[6:7], v[0:3], off offset:16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <4 x double> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v4f64_fpexcept_strict_div(<4 x double> %x, <4 x double> %y, <4 x double> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v4f64_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX8-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX8-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX8-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
+; GFX8-NEXT:    s_nop 0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v24
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v25, vcc
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_v4f64_fpexcept_strict_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX9-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX9-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX9-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX9-NEXT:    global_store_dwordx4 v[24:25], v[0:3], off
+; GFX9-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v4f64_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX11-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX11-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX11-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v[24:25], v[0:3], off
+; GFX11-NEXT:    global_store_b128 v[24:25], v[4:7], off offset:16
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v4f64_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17]
+; GFX12-NEXT:    v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19]
+; GFX12-NEXT:    v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21]
+; GFX12-NEXT:    v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23]
+; GFX12-NEXT:    s_clause 0x1
+; GFX12-NEXT:    global_store_b128 v[24:25], v[0:3], off
+; GFX12-NEXT:    global_store_b128 v[24:25], v[4:7], off offset:16
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <4 x double> %val
+  store <4 x double> %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f64_fpexcept_strict_fneg_uni(double inreg %x, double inreg %y, double inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s16
+; GFX8-NEXT:    v_mov_b32_e32 v4, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, s17
+; GFX8-NEXT:    v_mov_b32_e32 v5, s19
+; GFX8-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -s[20:21]
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s16
+; GFX900-NEXT:    v_mov_b32_e32 v4, s18
+; GFX900-NEXT:    v_mov_b32_e32 v3, s17
+; GFX900-NEXT:    v_mov_b32_e32 v5, s19
+; GFX900-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -s[20:21]
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], -s[16:17]
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[2:3], s[0:1], v[2:3], -s[16:17]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f64 v[2:3], s[0:1], v[2:3], -s[16:17]
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.z = fneg double %z
+  %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %val, ptr addrspace(1) %out
+  ret void
 }
 
-define double @v_constained_fma_f64_fpexcept_strict_fneg(double %x, double %y, double %z) #0 {
-; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f64_fpexcept_strict_fneg_div(double %x, double %y, double %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX11-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
+; GFX12-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.z = fneg double %z
   %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret double %val
+  store double %val, ptr addrspace(1) %out
+  ret void
 }
 
-define double @v_constained_fma_f64_fpexcept_strict_fneg_fneg(double %x, double %y, double %z) #0 {
-; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f64_fpexcept_strict_fneg_fneg_uni(double inreg %x, double inreg %y, double inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    v_mov_b32_e32 v5, s21
+; GFX8-NEXT:    v_fma_f64 v[2:3], -s[16:17], -v[2:3], v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s18
+; GFX900-NEXT:    v_mov_b32_e32 v4, s20
+; GFX900-NEXT:    v_mov_b32_e32 v3, s19
+; GFX900-NEXT:    v_mov_b32_e32 v5, s21
+; GFX900-NEXT:    v_fma_f64 v[2:3], -s[16:17], -v[2:3], v[4:5]
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
+; GFX942-NEXT:    v_fma_f64 v[2:3], -s[0:1], -v[2:3], v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[2:3], -s[0:1], -s[2:3], v[2:3]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f64 v[2:3], -s[0:1], -s[2:3], v[2:3]
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg double %x
   %neg.y = fneg double %y
   %val = call double @llvm.experimental.constrained.fma.f64(double %neg.x, double %neg.y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret double %val
+  store double %val, ptr addrspace(1) %out
+  ret void
 }
 
-define double @v_constained_fma_f64_fpexcept_strict_fabs_fabs(double %x, double %y, double %z) #0 {
-; GCN-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_f64_fpexcept_strict_fneg_fneg_div(double %x, double %y, double %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GFX11-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_fneg_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[2:3], v[4:5]
+; GFX12-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg double %x
+  %neg.y = fneg double %y
+  %val = call double @llvm.experimental.constrained.fma.f64(double %neg.x, double %neg.y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f64_fpexcept_strict_fabs_fabs_uni(double inreg %x, double inreg %y, double inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v4, s20
+; GFX8-NEXT:    v_mov_b32_e32 v3, s19
+; GFX8-NEXT:    v_mov_b32_e32 v5, s21
+; GFX8-NEXT:    v_fma_f64 v[2:3], |s[16:17]|, |v[2:3]|, v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s18
+; GFX900-NEXT:    v_mov_b32_e32 v4, s20
+; GFX900-NEXT:    v_mov_b32_e32 v3, s19
+; GFX900-NEXT:    v_mov_b32_e32 v5, s21
+; GFX900-NEXT:    v_fma_f64 v[2:3], |s[16:17]|, |v[2:3]|, v[4:5]
+; GFX900-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
+; GFX942-NEXT:    v_fma_f64 v[2:3], |s[0:1]|, |v[2:3]|, v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f64 v[2:3], |s[0:1]|, |s[2:3]|, v[2:3]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v2, s16 :: v_dual_mov_b32 v3, s17
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f64 v[2:3], |s[0:1]|, |s[2:3]|, v[2:3]
+; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = call double @llvm.fabs.f64(double %x) #0
   %neg.y = call double @llvm.fabs.f64(double %y) #0
   %val = call double @llvm.experimental.constrained.fma.f64(double %neg.x, double %neg.y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret double %val
+  store double %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f64_fpexcept_strict_fabs_fabs_div(double %x, double %y, double %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GFX8-NEXT:    flat_store_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v[6:7], v[0:1], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GFX11-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f64_fpexcept_strict_fabs_fabs_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], |v[0:1]|, |v[2:3]|, v[4:5]
+; GFX12-NEXT:    global_store_b64 v[6:7], v[0:1], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = call double @llvm.fabs.f64(double %x) #0
+  %neg.y = call double @llvm.fabs.f64(double %y) #0
+  %val = call double @llvm.experimental.constrained.fma.f64(double %neg.x, double %neg.y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store double %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_uni(<2 x double> inreg %x, <2 x double> inreg %y, <2 x double> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s20
+; GFX8-NEXT:    v_mov_b32_e32 v6, s24
+; GFX8-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NEXT:    v_mov_b32_e32 v7, s25
+; GFX8-NEXT:    v_mov_b32_e32 v4, s22
+; GFX8-NEXT:    v_fma_f64 v[2:3], -s[16:17], -v[2:3], v[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v6, s26
+; GFX8-NEXT:    v_mov_b32_e32 v5, s23
+; GFX8-NEXT:    v_mov_b32_e32 v7, s27
+; GFX8-NEXT:    v_fma_f64 v[4:5], -s[18:19], -v[4:5], v[6:7]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX8-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v4, s6
+; GFX8-NEXT:    v_mov_b32_e32 v5, s7
+; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s20
+; GFX900-NEXT:    v_mov_b32_e32 v6, s24
+; GFX900-NEXT:    v_mov_b32_e32 v3, s21
+; GFX900-NEXT:    v_mov_b32_e32 v7, s25
+; GFX900-NEXT:    v_mov_b32_e32 v4, s22
+; GFX900-NEXT:    v_fma_f64 v[2:3], -s[16:17], -v[2:3], v[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v6, s26
+; GFX900-NEXT:    v_mov_b32_e32 v5, s23
+; GFX900-NEXT:    v_mov_b32_e32 v7, s27
+; GFX900-NEXT:    v_fma_f64 v[4:5], -s[18:19], -v[4:5], v[6:7]
+; GFX900-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX900-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX900-NEXT:    v_readfirstlane_b32 s7, v5
+; GFX900-NEXT:    v_mov_b32_e32 v2, s4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s5
+; GFX900-NEXT:    v_mov_b32_e32 v4, s6
+; GFX900-NEXT:    v_mov_b32_e32 v5, s7
+; GFX900-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[16:17]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[20:21]
+; GFX942-NEXT:    v_fma_f64 v[2:3], -s[0:1], -v[2:3], v[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[18:19]
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[22:23]
+; GFX942-NEXT:    v_fma_f64 v[2:3], -s[2:3], -v[4:5], v[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23
+; GFX11-NEXT:    v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_f64 v[4:5], -s[2:3], -s[18:19], v[4:5]
+; GFX11-NEXT:    v_fma_f64 v[2:3], -s[0:1], -s[16:17], v[2:3]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v5, s23
+; GFX12-NEXT:    v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s21
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_fma_f64 v[4:5], -s[2:3], -s[18:19], v[4:5]
+; GFX12-NEXT:    v_fma_f64 v[2:3], -s[0:1], -s[16:17], v[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX12-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX12-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT:    global_store_b128 v[0:1], v[2:5], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %neg.x = fneg <2 x double> %x
+  %neg.y = fneg <2 x double> %y
+  %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %neg.x, <2 x double> %neg.y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store <2 x double> %val, ptr addrspace(1) %out
+  ret void
 }
 
-define <2 x double> @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg(<2 x double> %x, <2 x double> %y, <2 x double> %z) #0 {
-; GCN-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
-; GCN-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define void @v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_div(<2 x double> %x, <2 x double> %y, <2 x double> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
+; GFX8-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_div:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
+; GFX9-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GFX9-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
+; GFX11-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GFX11-NEXT:    global_store_b128 v[12:13], v[0:3], off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f64_fpexcept_strict_fneg_fneg_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f64 v[0:1], -v[0:1], -v[4:5], v[8:9]
+; GFX12-NEXT:    v_fma_f64 v[2:3], -v[2:3], -v[6:7], v[10:11]
+; GFX12-NEXT:    global_store_b128 v[12:13], v[0:3], off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %neg.x = fneg <2 x double> %x
   %neg.y = fneg <2 x double> %y
   %val = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %neg.x, <2 x double> %neg.y, <2 x double> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret <2 x double> %val
+  store <2 x double> %val, ptr addrspace(1) %out
+  ret void
 }
 
 declare double @llvm.fabs.f64(double)
@@ -102,3 +1083,5 @@ declare <3 x double> @llvm.experimental.constrained.fma.v3f64(<3 x double>, <3 x
 declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata)
 
 attributes #0 = { strictfp }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}



More information about the llvm-commits mailing list