[llvm] [AMDGPU] Extend v2i16 & v2f16 support for llvm.amdgcn.update.dpp intrinsic (PR #65318)

Tue Sep 5 21:00:39 PDT 2023

https://github.com/pravinjagtap updated https://github.com/llvm/llvm-project/pull/65318:

>From 2dd5b5b3f34314b6bc06b53da95bde99d96d99b9 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Tue, 5 Sep 2023 08:10:34 -0400
Subject: [PATCH] [AMDGPU] Extend v2i16 & v2f16 support for
 llvm.amdgcn.update.dpp intrinsic

---
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |   2 +
 .../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll  | 236 ++++++++++++++++++
 2 files changed, 238 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index cf47b2e1cd2cf31..af55d32eaeaad12 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1214,6 +1214,8 @@ class UpdateDPPPat<ValueType vt> : GCNPat <
 
 def : UpdateDPPPat<i32>;
 def : UpdateDPPPat<f32>;
+def : UpdateDPPPat<v2i16>;
+def : UpdateDPPPat<v2f16>;
 
 } // End OtherPredicates = [isGFX8Plus]
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e1dd29962947265..c06cef4be4d7498 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -221,9 +221,245 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %
   ret void
 }
 
+; GCN-LABEL: {{^}}dpp_test_v2i16:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb1:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb2:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb3:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb4:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb5:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 62, i32 61, i1 true)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb6:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 63, i32 63, i1 true)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb7:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 64, i32 64, i1 true)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb8:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+  %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 63, i32 128, i1 true)
+  store <2 x i16> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb1:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb2:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb3:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb4:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb5:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 62, i32 61, i1 true)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb6:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 63, i32 63, i1 true)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb7:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 64, i32 64, i1 true)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb8:
+; GCN:  v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN:  v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN:  v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+  %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 63, i32 128, i1 true)
+  store <2 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare void @llvm.amdgcn.s.barrier()
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
+declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32, i32, i1) #0
+declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0
 declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0