[llvm] Adding support for G_STRICT_FMA in new reg bank select (PR #170330)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 09:16:34 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Abhinav Garg (abhigargrepo)
<details>
<summary>Changes</summary>
This patch adds legalization rules for the G_STRICT_FMA opcode.
---
Patch is 142.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170330.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp (+16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll (+727-94)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f32.ll (+899-67)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f64.ll (+1051-68)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 123fc5bf37a19..dd9ea1e11a9af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -910,6 +910,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::fixed_vector(2, 16);
case SgprV2S32:
case VgprV2S32:
+ case UniInVgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
case SgprV4S32_WF:
@@ -1013,6 +1014,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
+ case UniInVgprV2S32:
case UniInVgprV4S32:
case UniInVgprB32:
case UniInVgprB64:
@@ -1148,6 +1150,7 @@ void RegBankLegalizeHelper::applyMappingDst(
case UniInVgprS32:
case UniInVgprS64:
case UniInVgprV2S16:
+ case UniInVgprV2S32:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == SgprRB);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 0c1daefd493b6..feda4058149ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -120,6 +120,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
case UniV2S16:
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
+ case UniV2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
case UniB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
case UniB64:
@@ -160,6 +162,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
case DivV2S16:
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
+ case DivV2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
case DivB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
case DivB64:
@@ -926,6 +930,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
.Uni(S64, {{Sgpr64}, {}});
+ addRulesForGOpcs({G_STRICT_FMA}, Standard)
+ .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16, VgprV2S16}})
+ .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+ .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32, VgprV2S32}}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
+ .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
+ .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
+ .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}});
+
addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
addRulesForGOpcs({G_GLOBAL_VALUE})
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
index 15740ee5476e8..5fbbc8c8f8dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/strict_fma.f16.ll
@@ -1,152 +1,783 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-
-define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 {
-; GCN-LABEL: v_constained_fma_f16_fpexcept_strict:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_fma_f16 v0, v0, v1, v2
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define void @v_constained_fma_f16_fpexcept_strict_uni(half inreg %x, half inreg %y, half inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s17
+; GFX8-NEXT: v_mov_b32_e32 v3, s18
+; GFX8-NEXT: v_fma_f16 v2, s16, v2, v3
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, s17
+; GFX900-NEXT: v_mov_b32_e32 v3, s18
+; GFX900-NEXT: v_fma_f16 v2, s16, v2, v3
+; GFX900-NEXT: global_store_short v[0:1], v2, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s1
+; GFX942-NEXT: v_mov_b32_e32 v3, s2
+; GFX942-NEXT: v_fma_f16 v2, s0, v2, v3
+; GFX942-NEXT: global_store_short v[0:1], v2, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b16_e32 v2.l, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_fma_f16 v2.l, s0, s1, v2.l
+; GFX11-NEXT: global_store_b16 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_uni:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_fma_f16 v2, s0, s1, v2
+; GFX12-NEXT: global_store_b16 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
- ret half %val
+ store half %val, ptr addrspace(1) %out
+ ret void
}
-define <2 x half> @v_constained_fma_v2f16_fpexcept_strict(<2 x half> %x, <2 x half> %y, <2 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v2f16_fpexcept_strict:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+define void @v_constained_fma_f16_fpexcept_strict_div(half %x, half %y, half %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX8-NEXT: flat_store_short v[3:4], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX900-NEXT: global_store_short v[3:4], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict:
+; GFX942-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX942-NEXT: global_store_short v[6:7], v0, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-NEXT: global_store_b16 v[3:4], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f16_fpexcept_strict_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT: global_store_b16 v[3:4], v0, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ store half %val, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_constained_fma_v2f16_fpexcept_strict_uni(<2 x half> inreg %x, <2 x half> inreg %y, <2 x half> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s17
+; GFX8-NEXT: v_mov_b32_e32 v3, s18
+; GFX8-NEXT: s_lshr_b32 s5, s17, 16
+; GFX8-NEXT: s_lshr_b32 s6, s18, 16
+; GFX8-NEXT: v_fma_f16 v2, s16, v2, v3
+; GFX8-NEXT: s_lshr_b32 s4, s16, 16
+; GFX8-NEXT: v_readfirstlane_b32 s7, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_fma_f16 v2, s4, v2, v3
+; GFX8-NEXT: v_readfirstlane_b32 s4, v2
+; GFX8-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NEXT: s_and_b32 s5, 0xffff, s7
+; GFX8-NEXT: s_lshl_b32 s4, s4, 16
+; GFX8-NEXT: s_or_b32 s4, s5, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v2, s17
+; GFX900-NEXT: v_mov_b32_e32 v3, s18
+; GFX900-NEXT: v_pk_fma_f16 v2, s16, v2, v3
+; GFX900-NEXT: global_store_dword v[0:1], v2, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v2, s1
+; GFX942-NEXT: v_mov_b32_e32 v3, s2
+; GFX942-NEXT: v_pk_fma_f16 v2, s0, v2, v3
+; GFX942-NEXT: global_store_dword v[0:1], v2, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_fma_f16 v2, s0, s1, v2
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f16_fpexcept_strict_uni:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_fma_f16 v2, s0, s1, v2
+; GFX12-NEXT: global_store_b32 v[0:1], v2, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ store <2 x half> %val, ptr addrspace(1) %out
+ ret void
+}
+
+define void @v_constained_fma_v2f16_fpexcept_strict_div(<2 x half> %x, <2 x half> %y, <2 x half> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5
+; GFX8-NEXT: v_fma_f16 v1, v5, v6, v7
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: flat_store_dword v[3:4], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX900-NEXT: global_store_dword v[3:4], v0, off
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v6, v3
+; GFX942-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX942-NEXT: global_store_dword v[6:7], v0, off
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT: global_store_b32 v[3:4], v0, off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v2f16_fpexcept_strict_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v1, v2
+; GFX12-NEXT: global_store_b32 v[3:4], v0, off
+; GFX12-NEXT: s_setpc_b64 s[30:31]
%val = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
- ret <2 x half> %val
+ store <2 x half> %val, ptr addrspace(1) %out
+ ret void
}
-define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x half> %y, <3 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
-; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict:
+;define void @v_constained_fma_v3f16_fpexcept_strict_uni(<3 x half> inreg %x, <3 x half> inreg %y, <3 x half> inreg %z, ptr addrspace(1) %out) #0 {
+; %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+; store <3 x half> %val, ptr addrspace(1) %out
+; ret void
+;}
+
+define void @v_constained_fma_v3f16_fpexcept_strict_div(<3 x half> %x, <3 x half> %y, <3 x half> %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4
-; GFX8-NEXT: v_fma_f16 v2, v6, v7, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4
+; GFX8-NEXT: flat_store_short v[6:7], v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v6
+; GFX8-NEXT: v_fma_f16 v2, v8, v9, v10
+; GFX8-NEXT: v_fma_f16 v3, v1, v3, v5
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
- %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
- ret <3 x half> %val
-}
-
-define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x half> %y, <4 x half> %z) #0 {
-; GFX9-LABEL: v_constained_fma_v4f16_fpexcept_strict:
+;
+; GFX9-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX9-NEXT: global_store_short v[6:7], v0, off
+; GFX9-NEXT: global_store_short_d16_hi v[6:7], v0, off offset:2
+; GFX9-NEXT: global_store_short v[6:7], v1, off offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_constained_fma_v4f16_fpexcept_strict:
+; GFX11-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX11-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: global_store_b16 v[6:7], v0, off
+; GFX11-NEXT: global_store_d16_hi_b16 v[6:7], v0, off offset:2
+; GFX11-NEXT: global_store_b16 v[6:7], v1, off offset:4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_v3f16_fpexcept_strict_div:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_fma_f16 v0, v0, v2, v4
+; GFX12-NEXT: v_pk_fma_f16 v1, v1, v3, v5
+; GFX12-NEXT: s_clause 0x2
+; GFX12-NEXT: global_store_b16 v[6:7], v0, off
+; GFX12-NEXT: global_store_d16_hi_b16 v[6:7], v0, off offset:2
+; GFX12-NEXT: global_store_b16 v[6:7], v1, off offset:4
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ store <3 x half> %val, ptr addrspace(1) %out
+ ret void
+}
+
+;define void @v_constained_fma_v4f16_fpexcept_strict_uni(<4 x half> inreg %x, <4 x half> inreg %y, <4 x half> inreg %z, ptr addrspace(1) %out) #0 {
+; %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+; store <4 x half> %val, ptr addrspace(1) %out
+; ret void
+;}
+;
+;define void @v_constained_fma_v4f16_fpexcept_strict_div(<4 x half> %x, <4 x half> %y, <4 x half> %z, ptr addrspace(1) %out) #0 {
+; %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+; store <4 x half> %val, ptr addrspace(1) %out
+; ret void
+;}
+
+define void @v_constained_fma_f16_fpexcept_strict_fneg_uni(half inreg %x, half inreg %y, half inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f16_fpexce...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/170330
More information about the llvm-commits mailing list