[llvm] [AMDGPU][GFX11] Do not rewrite V_FMA/FMAC_* to V_FMAAK_F16_t16 on operand legalization. (PR #66202)

Wed Sep 13 05:15:53 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu
            
<details>
<summary>Changes</summary>
V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands whereas the original instructions would have VGPR_32 operands. Switching the opcodes without updating the operands' register classes leads to the MachineVerifier complaining about the classes not matching the instruction definitions. The problem only reveals itself in builds with expensive checks enabled because of the missing -verify-machineinstrs in the test.

This is the third attempt to update CodeGen/AMDGPU/fma.f16.ll to run for GFX11, following the second attempt in a1e38e0b8e3e, partially reverted in eaf737a4e004.
--
Full diff: https://github.com/llvm/llvm-project/pull/66202.diff

2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/fma.f16.ll (+95-4) 


<pre>

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38b5e0114903cdf..a887f7567d76c89 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3318,6 +3318,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
       if (pseudoToMCOpcode(NewOpc) == -1)
         return false;
 
+      // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+      // would also require restricting their register classes. For now
+      // just bail out.
+      if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
+        return false;
+
       const int64_t Imm = ImmOp->getImm();
 
       // FIXME: This would be a lot easier if we could return a new instruction
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 23971f2b681cb0c..e8423ce9fbc36a2 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
-; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
-; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL
 
 declare half @llvm.fma.f16(half, half, half)
 declare half @llvm.maxnum.f16(half, half)
@@ -19,6 +21,12 @@ define half @test_fma(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fma:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fma_f16 v0, v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half %z)
   ret half %r
 }
@@ -36,6 +44,12 @@ define half @test_fmac(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmac_f16_e32 v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fmac:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fmac_f16_e32 v0, v1, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %y, half %z, half %x)
   ret half %r
 }
@@ -61,6 +75,12 @@ define half @test_fmaak(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fmaak:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
   ret half %r
 }
@@ -86,6 +106,12 @@ define half @test_fmamk(half %x, half %y, half %z) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_fmamk:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
   ret half %r
 }
@@ -139,6 +165,33 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_D139469_f16:
+; GFX11-SDAG:       ; %bb.0: ; %bb
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX11-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-SDAG-NEXT:    v_min_f16_e32 v0, v2, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_D139469_f16:
+; GFX11-GISEL:       ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x291e
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
+; GFX11-GISEL-NEXT:    v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract half %arg, 0xH291E
   %i1 = fcmp olt half %i, 0xH0000
@@ -213,6 +266,44 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, s6, s5
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: test_D139469_v2f16:
+; GFX11-SDAG:       ; %bb.0: ; %bb
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x211e
+; GFX11-SDAG-NEXT:    v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    v_pk_min_f16 v0, v1, v0
+; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: test_D139469_v2f16:
+; GFX11-GISEL:       ; %bb.0: ; %bb
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0x291e291e
+; GFX11-GISEL-NEXT:    v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0]
+; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v0, s0, 0x211e op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s1, 0, v2
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s2, 0, v3
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_or_b32 s0, s1, s2
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
   %i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66202