[llvm] [AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10 (PR #79598)

Fri Feb 9 03:21:12 PST 2024

https://github.com/ankurepa updated https://github.com/llvm/llvm-project/pull/79598

>From d3e1954f7f9c053b2af472900944e1badfa04f65 Mon Sep 17 00:00:00 2001
From: ankurepa <anja.kurepa at syrmia.com>
Date: Fri, 26 Jan 2024 15:09:08 +0100
Subject: [PATCH 1/4] [AMDGPU][MC] Add dpp for V_PK_FMAC_F16 for GFX10

Adds DPP variants of v_pk_fmac_f16 for GFX10 and removes the DPP variants for GFX11 and GFX12.
---
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    | 15 ++++++--
 llvm/test/MC/AMDGPU/gfx10_asm_vop2.s          |  6 +++
 llvm/test/MC/AMDGPU/literalv216.s             |  2 +-
 .../MC/Disassembler/AMDGPU/gfx10_vop2.txt     | 38 +++++++++++--------
 .../AMDGPU/gfx10_vop3p_literalv216.txt        |  2 +-
 5 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index eba9bf64884ec..fa18a09b3831d 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1615,6 +1615,9 @@ multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName,
 multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> :
   VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>;
 
+multiclass VOP2_V_PK_FMAC_F16_gfx11_gfx12<bits<6> op> :
+  VOP2Only_Real_e32<GFX11Gen, op>, VOP2Only_Real_e32<GFX12Gen, op>;
+
 multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
   VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
 
@@ -1661,7 +1664,8 @@ defm V_SUBREV_CO_CI_U32 :
 
 defm V_CVT_PK_RTZ_F16_F32  : VOP2_Real_FULL_with_name_gfx11_gfx12<0x02f,
   "V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">;
-defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx11_gfx12<0x03c>;
+
+defm V_PK_FMAC_F16     : VOP2_V_PK_FMAC_F16_gfx11_gfx12<0x03c>;
 
 defm V_ADD_F16_t16         : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
 defm V_ADD_F16_fake16      : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
@@ -1945,6 +1949,11 @@ multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> :
   VOP2be_Real_dpp_gfx10<op, opName, asmName>,
   VOP2be_Real_dpp8_gfx10<op, opName, asmName>;
 
+multiclass VOP2_FMAC_Real<bits<6> op> :
+  VOP2_Real_e32_gfx10<op>,
+  VOP2_Real_dpp_gfx10<op>,
+  VOP2_Real_dpp8_gfx10<op>;
+
 multiclass VOP2_Real_gfx10<bits<6> op> :
   VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
   VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
@@ -1988,9 +1997,7 @@ defm V_MAX_F16         : VOP2_Real_gfx10<0x039>;
 defm V_MIN_F16         : VOP2_Real_gfx10<0x03a>;
 defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 
-let IsSingle = 1 in {
-  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
-}
+defm V_PK_FMAC_F16     : VOP2_FMAC_Real<0x03c>;
 
 // VOP2 no carry-in, carry-out.
 defm V_ADD_NC_U32 :
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index bf8e18ec14512..f92b505cbd083 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13185,3 +13185,9 @@ v_pk_fmac_f16 v5, -4.0, v2
 
 v_pk_fmac_f16 v5, v1, v255
 // GFX10: encoding: [0x01,0xff,0x0b,0x78]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
\ No newline at end of file
diff --git a/llvm/test/MC/AMDGPU/literalv216.s b/llvm/test/MC/AMDGPU/literalv216.s
index c695bc3600c38..f5afaa6bd6181 100644
--- a/llvm/test/MC/AMDGPU/literalv216.s
+++ b/llvm/test/MC/AMDGPU/literalv216.s
@@ -291,4 +291,4 @@ v_pk_add_u16 v5, v1, 123456.0
 // FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid
 v_pk_fmac_f16 v5, 0x12345678, v2
 // NOGFX9: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
-// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
+// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
index b759912204db8..33d89da3b3ae9 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2.txt
@@ -1779,54 +1779,60 @@
 # GFX10: v_or_b32_e32 v5, vcc_lo, v2             ; encoding: [0x6a,0x04,0x0a,0x38]
 0x6a,0x04,0x0a,0x38
 
-# GFX10: v_pk_fmac_f16 v255, v1, v2              ; encoding: [0x01,0x05,0xfe,0x79]
+# GFX10: v_pk_fmac_f16_e32 v255, v1, v2          ; encoding: [0x01,0x05,0xfe,0x79]
 0x01,0x05,0xfe,0x79
 
-# GFX10: v_pk_fmac_f16 v5, -1, v2                ; encoding: [0xc1,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, -1, v2            ; encoding: [0xc1,0x04,0x0a,0x78]
 0xc1,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, -4.0, v2              ; encoding: [0xf7,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, -4.0, v2          ; encoding: [0xf7,0x04,0x0a,0x78]
 0xf7,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, 0, v2                 ; encoding: [0x80,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, 0, v2             ; encoding: [0x80,0x04,0x0a,0x78]
 0x80,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, 0.5, v2               ; encoding: [0xf0,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, 0.5, v2           ; encoding: [0xf0,0x04,0x0a,0x78]
 0xf0,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, exec_hi, v2           ; encoding: [0x7f,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, exec_hi, v2       ; encoding: [0x7f,0x04,0x0a,0x78]
 0x7f,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, exec_lo, v2           ; encoding: [0x7e,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, exec_lo, v2       ; encoding: [0x7e,0x04,0x0a,0x78]
 0x7e,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, m0, v2                ; encoding: [0x7c,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, m0, v2            ; encoding: [0x7c,0x04,0x0a,0x78]
 0x7c,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, s1, v2                ; encoding: [0x01,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, s1, v2            ; encoding: [0x01,0x04,0x0a,0x78]
 0x01,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, s103, v2              ; encoding: [0x67,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, s103, v2          ; encoding: [0x67,0x04,0x0a,0x78]
 0x67,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, ttmp11, v2            ; encoding: [0x77,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, ttmp11, v2        ; encoding: [0x77,0x04,0x0a,0x78]
 0x77,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, v1, v2                ; encoding: [0x01,0x05,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, v1, v2            ; encoding: [0x01,0x05,0x0a,0x78]
 0x01,0x05,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, v1, v255              ; encoding: [0x01,0xff,0x0b,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, v1, v255          ; encoding: [0x01,0xff,0x0b,0x78]
 0x01,0xff,0x0b,0x78
 
-# GFX10: v_pk_fmac_f16 v5, v255, v2              ; encoding: [0xff,0x05,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, v255, v2          ; encoding: [0xff,0x05,0x0a,0x78]
 0xff,0x05,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, vcc_hi, v2            ; encoding: [0x6b,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, vcc_hi, v2        ; encoding: [0x6b,0x04,0x0a,0x78]
 0x6b,0x04,0x0a,0x78
 
-# GFX10: v_pk_fmac_f16 v5, vcc_lo, v2            ; encoding: [0x6a,0x04,0x0a,0x78]
+# GFX10: v_pk_fmac_f16_e32 v5, vcc_lo, v2        ; encoding: [0x6a,0x04,0x0a,0x78]
 0x6a,0x04,0x0a,0x78
 
+#GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+#GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03
+
 # W32: v_sub_co_ci_u32_e32 v255, vcc_lo, v1, v2, vcc_lo ; encoding: [0x01,0x05,0xfe,0x53]
 # W64: v_sub_co_ci_u32_e32 v255, vcc, v1, v2, vcc ; encoding: [0x01,0x05,0xfe,0x53]
 0x01,0x05,0xfe,0x53
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt
index a022c79fe97e6..97c81ed1a629a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3p_literalv216.txt
@@ -144,5 +144,5 @@
 # Packed VOP2
 #===----------------------------------------------------------------------===//
 
-# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
+# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
 0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12

>From 7bf323cdd128594b936846e93d65c1003bc1eeb0 Mon Sep 17 00:00:00 2001
From: ankurepa <anja.kurepa at syrmia.com>
Date: Wed, 7 Feb 2024 13:47:04 +0100
Subject: [PATCH 2/4] Fix test: add trailing newline; add GFX11/GFX12 DPP error checks

---
 llvm/test/MC/AMDGPU/gfx10_asm_vop2.s         | 2 +-
 llvm/test/MC/AMDGPU/gfx11_asm_err.s          | 3 +++
 llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s | 3 +++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index f92b505cbd083..cd71ac7edaef1 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13190,4 +13190,4 @@ v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
 // GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
 
 v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
\ No newline at end of file
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x03]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index da1989e2ee237..047267af44b98 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -187,3 +187,6 @@ v_mov_b16 v0.l, ttmp0.h
 
 v_mov_b16 v0.l, a0.h
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
index d25411b5bfd29..95836f3c897f4 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop2_t16_err.s
@@ -224,3 +224,6 @@ v_sub_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 
 v_subrev_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// GFX12: :[[@LINE-1]]:{{[0-9]+}}: error: dpp variant of this instruction is not supported

>From a6998cebfd196818a254f2d8a3fb836fd04811c2 Mon Sep 17 00:00:00 2001
From: ankurepa <anja.kurepa at syrmia.com>
Date: Fri, 9 Feb 2024 10:25:50 +0100
Subject: [PATCH 3/4] codegen dpp combine test

---
 llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index c48231f3851a7..446094ed34024 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -166,6 +166,22 @@ body: |
     %6:vgpr_32 = V_FMAC_F32_e64 2, %4, 2, %1, 2, %2, 1, 2, implicit $mode, implicit $exec
 ...
 
+# GCN-LABEL: name: v_pk_fmac_f16
+# GCN: %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
+# GCN: %3:vgpr_32 = V_PK_FMAC_F16_e32 %3, %1, implicit $mode, implicit $exec
+name: v_pk_fmac_f16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    %0:vgpr_32 = COPY $vgpr0
+    %1:vgpr_32 = COPY $vgpr1
+
+    %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
+    %3:vgpr_32 = V_PK_FMAC_F16_e32 %3, %1, implicit $mode, implicit $exec
+...
+
 # when the DPP source isn't a src0 operand the operation should be commuted if possible
 # GCN-LABEL: name: dpp_commute_shrink
 # GCN: %4:vgpr_32 = V_MUL_U32_U24_dpp %1, %0, %1, 1, 14, 15, 0, implicit $exec

>From 9bfd6fa5d506c38865ea43ec72684324c6dcf064 Mon Sep 17 00:00:00 2001
From: ankurepa <anja.kurepa at syrmia.com>
Date: Fri, 9 Feb 2024 12:20:55 +0100
Subject: [PATCH 4/4] Fix v_pk_fmac_f16 dpp combine test to use %2 instead of undefined %3

---
 llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 446094ed34024..44be207dd882a 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -168,7 +168,7 @@ body: |
 
 # GCN-LABEL: name: v_pk_fmac_f16
 # GCN: %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
-# GCN: %3:vgpr_32 = V_PK_FMAC_F16_e32 %3, %1, implicit $mode, implicit $exec
+# GCN: %3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec
 name: v_pk_fmac_f16
 tracksRegLiveness: true
 body: |
@@ -179,7 +179,7 @@ body: |
     %1:vgpr_32 = COPY $vgpr1
 
     %2:vgpr_32 = V_MOV_B32_dpp %0, %1, 1, 15, 15, 1, implicit $exec
-    %3:vgpr_32 = V_PK_FMAC_F16_e32 %3, %1, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_PK_FMAC_F16_e32 %2, %1, implicit $mode, implicit $exec
 ...
 
 # when the DPP source isn't a src0 operand the operation should be commuted if possible