[llvm] [AMDGPU] Extend v2i16 & v2f16 support for llvm.amdgcn.update.dpp intrinsic (PR #65318)
Pravin Jagtap via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 5 05:36:37 PDT 2023
https://github.com/pravinjagtap created https://github.com/llvm/llvm-project/pull/65318:
[AMDGPU] Extend v2i16 & v2f16 support for llvm.amdgcn.update.dpp intrinsic.
From ac1aa90673702f26969a439786a0695f69af2a44 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Tue, 5 Sep 2023 08:10:34 -0400
Subject: [PATCH] [AMDGPU] Extend v2i16 & v2f16 support for
llvm.amdgcn.update.dpp intrinsic
---
llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +
.../CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll | 236 ++++++++++++++++++
2 files changed, 238 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index cf47b2e1cd2cf31..af55d32eaeaad12 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1214,6 +1214,8 @@ class UpdateDPPPat<ValueType vt> : GCNPat <
def : UpdateDPPPat<i32>;
def : UpdateDPPPat<f32>;
+def : UpdateDPPPat<v2i16>;
+def : UpdateDPPPat<v2f16>;
} // End OtherPredicates = [isGFX8Plus]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index e1dd29962947265..c06cef4be4d7498 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -221,9 +221,245 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %
ret void
}
+; GCN-LABEL: {{^}}dpp_test_v2i16:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb1:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb2:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb3:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb4:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb5:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 62, i32 61, i1 true)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb6:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 63, i32 63, i1 true)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb7:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 64, i32 64, i1 true)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb8:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
+ %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 63, i32 128, i1 true)
+ store <2 x i16> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb1:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb2:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb3:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb4:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb5:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 62, i32 61, i1 true)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb6:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 63, i32 63, i1 true)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb7:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 64, i32 64, i1 true)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb8:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
+ %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 63, i32 128, i1 true)
+ store <2 x half> %tmp0, ptr addrspace(1) %out
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
+declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32, i32, i1) #0
+declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0
declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
More information about the llvm-commits mailing list