[llvm] Adding support for G_STRICT_FMA in new reg bank select (PR #170330)

Tue Dec 2 10:24:26 PST 2025

================
@@ -1,98 +1,927 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
-
-define float @v_constained_fma_f32_fpexcept_strict(float %x, float %y, float %z) #0 {
-; GCN-LABEL: v_constained_fma_f32_fpexcept_strict:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_fma_f32 v0, v0, v1, v2
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+
+define void @v_constained_fma_f32_fpexcept_strict_uni(float inreg %x, float inreg %y, float inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_mov_b32_e32 v3, s18
+; GFX8-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s17
+; GFX900-NEXT:    v_mov_b32_e32 v3, s18
+; GFX900-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX900-NEXT:    global_store_dword v[0:1], v2, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_fma_f32 v2, s0, v2, v3
+; GFX942-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_f32 v2, s0, s1, v2
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_uni:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_fma_f32 v2, s0, s1, v2
+; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_f32_fpexcept_strict_div(float %x, float %y, float %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT:    global_store_dword v[3:4], v0, off
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX942-NEXT:    global_store_dword v[6:7], v0, off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX11-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_constained_fma_f32_fpexcept_strict_div:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GFX12-NEXT:    global_store_b32 v[3:4], v0, off
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret float %val
+  store float %val, ptr addrspace(1) %out
+  ret void
+}
+
+define void @v_constained_fma_v2f32_fpexcept_strict_uni(<2 x float> inreg %x, <2 x float> inreg %y, <2 x float> inreg %z, ptr addrspace(1) %out) #0 {
+; GFX8-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v2, s18
+; GFX8-NEXT:    v_mov_b32_e32 v3, s20
+; GFX8-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s19
+; GFX8-NEXT:    v_mov_b32_e32 v3, s21
+; GFX8-NEXT:    v_fma_f32 v2, s17, v2, v3
+; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_constained_fma_v2f32_fpexcept_strict_uni:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v2, s18
+; GFX900-NEXT:    v_mov_b32_e32 v3, s20
+; GFX900-NEXT:    v_fma_f32 v2, s16, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s19
+; GFX900-NEXT:    v_mov_b32_e32 v3, s21
+; GFX900-NEXT:    v_fma_f32 v2, s17, v2, v3
+; GFX900-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX900-NEXT:    v_mov_b32_e32 v2, s4
+; GFX900-NEXT:    v_mov_b32_e32 v3, s5
----------------
petar-avramovic wrote:

These readanylanes should be combined away, need to look through build_vector for readanylane sources and replace with vgpr build_vector

https://github.com/llvm/llvm-project/pull/170330