[llvm] [AMDGPU] Fix lowering of abs for i16 vectors with more than 2 elements (PR #95413)
Tim Gymnich via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 13 07:23:54 PDT 2024
https://github.com/tgymnich updated https://github.com/llvm/llvm-project/pull/95413
>From 4328161383ffee11ae1e230408c659f71b204d37 Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Mon, 10 Jun 2024 14:47:15 +0200
Subject: [PATCH 1/4] add tests
---
llvm/test/CodeGen/AMDGPU/abs.ll | 1648 +++++++++++++++++++++++++++++++
1 file changed, 1648 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/abs.ll
diff --git a/llvm/test/CodeGen/AMDGPU/abs.ll b/llvm/test/CodeGen/AMDGPU/abs.ll
new file mode 100644
index 0000000000000..a28d4ef07babc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/abs.ll
@@ -0,0 +1,1648 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+
+define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) { ; Codegen test: llvm.abs applied to a <4 x i16> argument; the per-subtarget assertions below are autogenerated, do not hand-edit them.
+; GFX6-LABEL: v_abs_v4i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v4i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v4i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v3, 0, v2
+; GFX8-NEXT: v_max_i16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v4, 0, v3
+; GFX8-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v4, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v5, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v5
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX9-NEXT: v_sub_u16_e32 v3, 0, v2
+; GFX9-NEXT: v_max_i16_e32 v2, v2, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_sub_u16_e32 v4, 0, v3
+; GFX9-NEXT: v_max_i16_e32 v3, v3, v4
+; GFX9-NEXT: v_sub_u16_e32 v4, 0, v1
+; GFX9-NEXT: v_max_i16_e32 v1, v1, v4
+; GFX9-NEXT: v_sub_u16_e32 v4, 0, v0
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v4
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v4i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX10-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX10-NEXT: v_sub_nc_u16 v5, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v6, 0, v2
+; GFX10-NEXT: v_sub_nc_u16 v7, 0, v3
+; GFX10-NEXT: v_max_i16 v1, v1, v4
+; GFX10-NEXT: v_max_i16 v0, v0, v5
+; GFX10-NEXT: v_max_i16 v2, v2, v6
+; GFX10-NEXT: v_max_i16 v3, v3, v7
+; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX11-NEXT: v_sub_nc_u16 v5, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_sub_nc_u16 v6, 0, v2
+; GFX11-NEXT: v_sub_nc_u16 v7, 0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_i16 v1, v1, v4
+; GFX11-NEXT: v_max_i16 v0, v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_i16 v2, v2, v6
+; GFX11-NEXT: v_max_i16 v3, v3, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX12-NEXT: v_sub_nc_u16 v4, 0, v1
+; GFX12-NEXT: v_sub_nc_u16 v5, 0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_sub_nc_u16 v6, 0, v2
+; GFX12-NEXT: v_sub_nc_u16 v7, 0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_max_i16 v1, v1, v4
+; GFX12-NEXT: v_max_i16 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_max_i16 v2, v2, v6
+; GFX12-NEXT: v_max_i16 v3, v3, v7
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %arg, i1 false) ; i1 false is the is_int_min_poison flag: per the LangRef, an INT_MIN lane yields INT_MIN rather than poison.
+ ret <4 x i16> %res
+}
+
+define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) { ; Codegen test: llvm.abs applied to an <8 x i16> argument; the per-subtarget assertions below are autogenerated, do not hand-edit them.
+; GFX6-LABEL: v_abs_v8i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v7, v8
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v8i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v7, v8
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v8i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_sub_u16_e32 v5, 0, v4
+; GFX8-NEXT: v_max_i16_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v6, 0, v5
+; GFX8-NEXT: v_max_i16_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v7, 0, v6
+; GFX8-NEXT: v_max_i16_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v7
+; GFX8-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v3
+; GFX8-NEXT: v_sub_u16_e32 v9, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v10, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v11, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v11
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v10
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v9
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v8
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v8i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_sub_u16_e32 v5, 0, v4
+; GFX9-NEXT: v_max_i16_e32 v4, v4, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT: v_sub_u16_e32 v6, 0, v5
+; GFX9-NEXT: v_max_i16_e32 v5, v5, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT: v_sub_u16_e32 v7, 0, v6
+; GFX9-NEXT: v_max_i16_e32 v6, v6, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX9-NEXT: v_sub_u16_e32 v8, 0, v7
+; GFX9-NEXT: v_max_i16_e32 v7, v7, v8
+; GFX9-NEXT: v_sub_u16_e32 v8, 0, v3
+; GFX9-NEXT: v_max_i16_e32 v3, v3, v8
+; GFX9-NEXT: v_sub_u16_e32 v8, 0, v2
+; GFX9-NEXT: v_max_i16_e32 v2, v2, v8
+; GFX9-NEXT: v_sub_u16_e32 v8, 0, v1
+; GFX9-NEXT: v_max_i16_e32 v1, v1, v8
+; GFX9-NEXT: v_sub_u16_e32 v8, 0, v0
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v8
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4
+; GFX9-NEXT: v_perm_b32 v2, v5, v2, s4
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v8i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX10-NEXT: v_sub_nc_u16 v11, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v8, 0, v4
+; GFX10-NEXT: v_sub_nc_u16 v9, 0, v5
+; GFX10-NEXT: v_sub_nc_u16 v10, 0, v6
+; GFX10-NEXT: v_sub_nc_u16 v12, 0, v7
+; GFX10-NEXT: v_max_i16 v0, v0, v11
+; GFX10-NEXT: v_max_i16 v4, v4, v8
+; GFX10-NEXT: v_max_i16 v5, v5, v9
+; GFX10-NEXT: v_max_i16 v6, v6, v10
+; GFX10-NEXT: v_sub_nc_u16 v8, 0, v1
+; GFX10-NEXT: v_sub_nc_u16 v9, 0, v2
+; GFX10-NEXT: v_sub_nc_u16 v10, 0, v3
+; GFX10-NEXT: v_max_i16 v7, v7, v12
+; GFX10-NEXT: v_max_i16 v1, v1, v8
+; GFX10-NEXT: v_max_i16 v2, v2, v9
+; GFX10-NEXT: v_max_i16 v3, v3, v10
+; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
+; GFX10-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v8i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-NEXT: v_sub_nc_u16 v11, 0, v0
+; GFX11-NEXT: v_sub_nc_u16 v8, 0, v4
+; GFX11-NEXT: v_sub_nc_u16 v9, 0, v5
+; GFX11-NEXT: v_sub_nc_u16 v10, 0, v6
+; GFX11-NEXT: v_sub_nc_u16 v12, 0, v7
+; GFX11-NEXT: v_max_i16 v0, v0, v11
+; GFX11-NEXT: v_max_i16 v4, v4, v8
+; GFX11-NEXT: v_max_i16 v5, v5, v9
+; GFX11-NEXT: v_max_i16 v6, v6, v10
+; GFX11-NEXT: v_sub_nc_u16 v8, 0, v1
+; GFX11-NEXT: v_sub_nc_u16 v9, 0, v2
+; GFX11-NEXT: v_sub_nc_u16 v10, 0, v3
+; GFX11-NEXT: v_max_i16 v7, v7, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_i16 v1, v1, v8
+; GFX11-NEXT: v_max_i16 v2, v2, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_i16 v3, v3, v10
+; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x5040100
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v8i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX12-NEXT: v_sub_nc_u16 v11, 0, v0
+; GFX12-NEXT: v_sub_nc_u16 v8, 0, v4
+; GFX12-NEXT: v_sub_nc_u16 v9, 0, v5
+; GFX12-NEXT: v_sub_nc_u16 v10, 0, v6
+; GFX12-NEXT: v_sub_nc_u16 v12, 0, v7
+; GFX12-NEXT: v_max_i16 v0, v0, v11
+; GFX12-NEXT: v_max_i16 v4, v4, v8
+; GFX12-NEXT: v_max_i16 v5, v5, v9
+; GFX12-NEXT: v_max_i16 v6, v6, v10
+; GFX12-NEXT: v_sub_nc_u16 v8, 0, v1
+; GFX12-NEXT: v_sub_nc_u16 v9, 0, v2
+; GFX12-NEXT: v_sub_nc_u16 v10, 0, v3
+; GFX12-NEXT: v_max_i16 v7, v7, v12
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_max_i16 v1, v1, v8
+; GFX12-NEXT: v_max_i16 v2, v2, v9
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_max_i16 v3, v3, v10
+; GFX12-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
+; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x5040100
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %arg, i1 false) ; i1 false is the is_int_min_poison flag: per the LangRef, an INT_MIN lane yields INT_MIN rather than poison.
+ ret <8 x i16> %res
+}
+
+
+define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
+; GFX6-LABEL: v_abs_v16i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v15, v16
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v16i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v15, v16
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v16i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT: v_sub_u16_e32 v9, 0, v8
+; GFX8-NEXT: v_max_i16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX8-NEXT: v_sub_u16_e32 v10, 0, v9
+; GFX8-NEXT: v_max_i16_sdwa v9, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: v_sub_u16_e32 v11, 0, v10
+; GFX8-NEXT: v_max_i16_sdwa v10, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX8-NEXT: v_sub_u16_e32 v12, 0, v11
+; GFX8-NEXT: v_max_i16_sdwa v11, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX8-NEXT: v_sub_u16_e32 v13, 0, v12
+; GFX8-NEXT: v_max_i16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v14, 0, v13
+; GFX8-NEXT: v_max_i16_sdwa v13, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v15, 0, v14
+; GFX8-NEXT: v_max_i16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v16, 0, v15
+; GFX8-NEXT: v_max_i16_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v16, 0, v7
+; GFX8-NEXT: v_sub_u16_e32 v17, 0, v6
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v3
+; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v22, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v23, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v23
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v22
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v21
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v20
+; GFX8-NEXT: v_max_i16_e32 v4, v4, v19
+; GFX8-NEXT: v_max_i16_e32 v5, v5, v18
+; GFX8-NEXT: v_max_i16_e32 v6, v6, v17
+; GFX8-NEXT: v_max_i16_e32 v7, v7, v16
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v14
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v13
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v12
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v11
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v10
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v16i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_sub_u16_e32 v9, 0, v8
+; GFX9-NEXT: v_max_i16_e32 v8, v8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX9-NEXT: v_sub_u16_e32 v10, 0, v9
+; GFX9-NEXT: v_max_i16_e32 v9, v9, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX9-NEXT: v_sub_u16_e32 v11, 0, v10
+; GFX9-NEXT: v_max_i16_e32 v10, v10, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX9-NEXT: v_sub_u16_e32 v12, 0, v11
+; GFX9-NEXT: v_max_i16_e32 v11, v11, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX9-NEXT: v_sub_u16_e32 v13, 0, v12
+; GFX9-NEXT: v_max_i16_e32 v12, v12, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX9-NEXT: v_sub_u16_e32 v14, 0, v13
+; GFX9-NEXT: v_max_i16_e32 v13, v13, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX9-NEXT: v_sub_u16_e32 v15, 0, v14
+; GFX9-NEXT: v_max_i16_e32 v14, v14, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v15
+; GFX9-NEXT: v_max_i16_e32 v15, v15, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v7
+; GFX9-NEXT: v_max_i16_e32 v7, v7, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v6
+; GFX9-NEXT: v_max_i16_e32 v6, v6, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v5
+; GFX9-NEXT: v_max_i16_e32 v5, v5, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v4
+; GFX9-NEXT: v_max_i16_e32 v4, v4, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v3
+; GFX9-NEXT: v_max_i16_e32 v3, v3, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v2
+; GFX9-NEXT: v_max_i16_e32 v2, v2, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v1
+; GFX9-NEXT: v_max_i16_e32 v1, v1, v16
+; GFX9-NEXT: v_sub_u16_e32 v16, 0, v0
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v16
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v15, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v14, v1, s4
+; GFX9-NEXT: v_perm_b32 v2, v13, v2, s4
+; GFX9-NEXT: v_perm_b32 v3, v12, v3, s4
+; GFX9-NEXT: v_perm_b32 v4, v11, v4, s4
+; GFX9-NEXT: v_perm_b32 v5, v10, v5, s4
+; GFX9-NEXT: v_perm_b32 v6, v9, v6, s4
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v16i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX10-NEXT: v_sub_nc_u16 v11, 0, v8
+; GFX10-NEXT: v_sub_nc_u16 v13, 0, v9
+; GFX10-NEXT: v_sub_nc_u16 v14, 0, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX10-NEXT: v_sub_nc_u16 v16, 0, v15
+; GFX10-NEXT: v_max_i16 v8, v8, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX10-NEXT: v_max_i16 v9, v9, v13
+; GFX10-NEXT: v_max_i16 v10, v10, v14
+; GFX10-NEXT: v_sub_nc_u16 v13, 0, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_sub_nc_u16 v14, 0, v11
+; GFX10-NEXT: v_sub_nc_u16 v19, 0, v6
+; GFX10-NEXT: v_sub_nc_u16 v20, 0, v5
+; GFX10-NEXT: v_max_i16 v12, v12, v13
+; GFX10-NEXT: v_max_i16 v13, v15, v16
+; GFX10-NEXT: v_max_i16 v11, v11, v14
+; GFX10-NEXT: v_sub_nc_u16 v14, 0, v17
+; GFX10-NEXT: v_sub_nc_u16 v15, 0, v18
+; GFX10-NEXT: v_sub_nc_u16 v16, 0, v7
+; GFX10-NEXT: v_max_i16 v6, v6, v19
+; GFX10-NEXT: v_max_i16 v5, v5, v20
+; GFX10-NEXT: v_max_i16 v14, v17, v14
+; GFX10-NEXT: v_max_i16 v15, v18, v15
+; GFX10-NEXT: v_max_i16 v7, v7, v16
+; GFX10-NEXT: v_sub_nc_u16 v16, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v17, 0, v1
+; GFX10-NEXT: v_sub_nc_u16 v18, 0, v2
+; GFX10-NEXT: v_sub_nc_u16 v19, 0, v3
+; GFX10-NEXT: v_sub_nc_u16 v20, 0, v4
+; GFX10-NEXT: v_max_i16 v0, v0, v16
+; GFX10-NEXT: v_max_i16 v1, v1, v17
+; GFX10-NEXT: v_max_i16 v2, v2, v18
+; GFX10-NEXT: v_max_i16 v3, v3, v19
+; GFX10-NEXT: v_max_i16 v4, v4, v20
+; GFX10-NEXT: v_perm_b32 v0, v15, v0, 0x5040100
+; GFX10-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
+; GFX10-NEXT: v_perm_b32 v4, v12, v4, 0x5040100
+; GFX10-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v16i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX11-NEXT: v_sub_nc_u16 v11, 0, v8
+; GFX11-NEXT: v_sub_nc_u16 v13, 0, v9
+; GFX11-NEXT: v_sub_nc_u16 v14, 0, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX11-NEXT: v_sub_nc_u16 v16, 0, v15
+; GFX11-NEXT: v_max_i16 v8, v8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-NEXT: v_max_i16 v9, v9, v13
+; GFX11-NEXT: v_max_i16 v10, v10, v14
+; GFX11-NEXT: v_sub_nc_u16 v13, 0, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX11-NEXT: v_sub_nc_u16 v14, 0, v11
+; GFX11-NEXT: v_sub_nc_u16 v19, 0, v6
+; GFX11-NEXT: v_sub_nc_u16 v20, 0, v5
+; GFX11-NEXT: v_max_i16 v12, v12, v13
+; GFX11-NEXT: v_max_i16 v13, v15, v16
+; GFX11-NEXT: v_max_i16 v11, v11, v14
+; GFX11-NEXT: v_sub_nc_u16 v14, 0, v17
+; GFX11-NEXT: v_sub_nc_u16 v15, 0, v18
+; GFX11-NEXT: v_sub_nc_u16 v16, 0, v7
+; GFX11-NEXT: v_max_i16 v6, v6, v19
+; GFX11-NEXT: v_max_i16 v5, v5, v20
+; GFX11-NEXT: v_max_i16 v14, v17, v14
+; GFX11-NEXT: v_max_i16 v15, v18, v15
+; GFX11-NEXT: v_max_i16 v7, v7, v16
+; GFX11-NEXT: v_sub_nc_u16 v16, 0, v0
+; GFX11-NEXT: v_sub_nc_u16 v17, 0, v1
+; GFX11-NEXT: v_sub_nc_u16 v18, 0, v2
+; GFX11-NEXT: v_sub_nc_u16 v19, 0, v3
+; GFX11-NEXT: v_sub_nc_u16 v20, 0, v4
+; GFX11-NEXT: v_max_i16 v0, v0, v16
+; GFX11-NEXT: v_max_i16 v1, v1, v17
+; GFX11-NEXT: v_max_i16 v2, v2, v18
+; GFX11-NEXT: v_max_i16 v3, v3, v19
+; GFX11-NEXT: v_max_i16 v4, v4, v20
+; GFX11-NEXT: v_perm_b32 v0, v15, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v4, v12, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v16i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GFX12-NEXT: v_sub_nc_u16 v11, 0, v8
+; GFX12-NEXT: v_sub_nc_u16 v13, 0, v9
+; GFX12-NEXT: v_sub_nc_u16 v14, 0, v10
+; GFX12-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GFX12-NEXT: v_sub_nc_u16 v16, 0, v15
+; GFX12-NEXT: v_max_i16 v8, v8, v11
+; GFX12-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX12-NEXT: v_max_i16 v9, v9, v13
+; GFX12-NEXT: v_max_i16 v10, v10, v14
+; GFX12-NEXT: v_sub_nc_u16 v13, 0, v12
+; GFX12-NEXT: v_lshrrev_b32_e32 v18, 16, v0
+; GFX12-NEXT: v_sub_nc_u16 v14, 0, v11
+; GFX12-NEXT: v_sub_nc_u16 v19, 0, v6
+; GFX12-NEXT: v_sub_nc_u16 v20, 0, v5
+; GFX12-NEXT: v_max_i16 v12, v12, v13
+; GFX12-NEXT: v_max_i16 v13, v15, v16
+; GFX12-NEXT: v_max_i16 v11, v11, v14
+; GFX12-NEXT: v_sub_nc_u16 v14, 0, v17
+; GFX12-NEXT: v_sub_nc_u16 v15, 0, v18
+; GFX12-NEXT: v_sub_nc_u16 v16, 0, v7
+; GFX12-NEXT: v_max_i16 v6, v6, v19
+; GFX12-NEXT: v_max_i16 v5, v5, v20
+; GFX12-NEXT: v_max_i16 v14, v17, v14
+; GFX12-NEXT: v_max_i16 v15, v18, v15
+; GFX12-NEXT: v_max_i16 v7, v7, v16
+; GFX12-NEXT: v_sub_nc_u16 v16, 0, v0
+; GFX12-NEXT: v_sub_nc_u16 v17, 0, v1
+; GFX12-NEXT: v_sub_nc_u16 v18, 0, v2
+; GFX12-NEXT: v_sub_nc_u16 v19, 0, v3
+; GFX12-NEXT: v_sub_nc_u16 v20, 0, v4
+; GFX12-NEXT: v_max_i16 v0, v0, v16
+; GFX12-NEXT: v_max_i16 v1, v1, v17
+; GFX12-NEXT: v_max_i16 v2, v2, v18
+; GFX12-NEXT: v_max_i16 v3, v3, v19
+; GFX12-NEXT: v_max_i16 v4, v4, v20
+; GFX12-NEXT: v_perm_b32 v0, v15, v0, 0x5040100
+; GFX12-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
+; GFX12-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
+; GFX12-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
+; GFX12-NEXT: v_perm_b32 v4, v12, v4, 0x5040100
+; GFX12-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
+; GFX12-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
+; GFX12-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
+; GFX6-LABEL: v_abs_v32i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
+; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
+; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v30, v30, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
+; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v26, v26, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
+; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v27, v27, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v24, v24, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
+; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v25, v25, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
+; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v22, v22, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v23, v31
+; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX6-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
+; GFX6-NEXT: v_max_i32_e32 v20, v20, v29
+; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16
+; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX6-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v23, v25
+; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX6-NEXT: v_or_b32_e32 v30, v30, v23
+; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
+; GFX6-NEXT: v_max_i32_e32 v21, v21, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
+; GFX6-NEXT: v_max_i32_e32 v18, v18, v21
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
+; GFX6-NEXT: v_max_i32_e32 v19, v19, v21
+; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v19
+; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v14, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v15, v17
+; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX6-NEXT: v_alignbit_b32 v17, v18, v16, 16
+; GFX6-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GFX6-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GFX6-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX6-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX6-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v32i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
+; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
+; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v30, v30, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
+; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v26, v26, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
+; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v27, v27, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v24, v24, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
+; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v25, v25, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
+; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v22, v22, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
+; GFX7-NEXT: v_max_i32_e32 v20, v20, v29
+; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16
+; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v23, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v30, v30, v23
+; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
+; GFX7-NEXT: v_max_i32_e32 v21, v21, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
+; GFX7-NEXT: v_max_i32_e32 v18, v18, v21
+; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
+; GFX7-NEXT: v_max_i32_e32 v19, v19, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v18, v18, v19
+; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
+; GFX7-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
+; GFX7-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v14, v17
+; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v15, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v18, v16, 16
+; GFX7-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GFX7-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GFX7-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v32i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_sub_u16_e32 v17, 0, v16
+; GFX8-NEXT: v_max_i16_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v17
+; GFX8-NEXT: v_max_i16_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v18
+; GFX8-NEXT: v_max_i16_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v19
+; GFX8-NEXT: v_max_i16_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX8-NEXT: v_sub_u16_e32 v21, 0, v20
+; GFX8-NEXT: v_max_i16_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX8-NEXT: v_sub_u16_e32 v22, 0, v21
+; GFX8-NEXT: v_max_i16_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX8-NEXT: v_sub_u16_e32 v23, 0, v22
+; GFX8-NEXT: v_max_i16_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX8-NEXT: v_sub_u16_e32 v24, 0, v23
+; GFX8-NEXT: v_max_i16_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v7
+; GFX8-NEXT: v_sub_u16_e32 v25, 0, v24
+; GFX8-NEXT: v_max_i16_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX8-NEXT: v_sub_u16_e32 v26, 0, v25
+; GFX8-NEXT: v_max_i16_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX8-NEXT: v_sub_u16_e32 v27, 0, v26
+; GFX8-NEXT: v_max_i16_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX8-NEXT: v_sub_u16_e32 v28, 0, v27
+; GFX8-NEXT: v_max_i16_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX8-NEXT: v_sub_u16_e32 v29, 0, v28
+; GFX8-NEXT: v_max_i16_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v30, 0, v29
+; GFX8-NEXT: v_max_i16_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v31, 0, v30
+; GFX8-NEXT: v_max_i16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v32, 0, v31
+; GFX8-NEXT: v_max_i16_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v32, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v32
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v31
+; GFX8-NEXT: v_sub_u16_e32 v31, 0, v1
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v31
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v30
+; GFX8-NEXT: v_sub_u16_e32 v30, 0, v2
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v30
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v29
+; GFX8-NEXT: v_sub_u16_e32 v29, 0, v3
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v29
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v28
+; GFX8-NEXT: v_sub_u16_e32 v28, 0, v4
+; GFX8-NEXT: v_max_i16_e32 v4, v4, v28
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v27
+; GFX8-NEXT: v_sub_u16_e32 v27, 0, v5
+; GFX8-NEXT: v_max_i16_e32 v5, v5, v27
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v26
+; GFX8-NEXT: v_sub_u16_e32 v26, 0, v6
+; GFX8-NEXT: v_max_i16_e32 v6, v6, v26
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v25
+; GFX8-NEXT: v_sub_u16_e32 v25, 0, v7
+; GFX8-NEXT: v_max_i16_e32 v7, v7, v25
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v24
+; GFX8-NEXT: v_sub_u16_e32 v24, 0, v8
+; GFX8-NEXT: v_max_i16_e32 v8, v8, v24
+; GFX8-NEXT: v_or_b32_e32 v8, v8, v23
+; GFX8-NEXT: v_sub_u16_e32 v23, 0, v9
+; GFX8-NEXT: v_max_i16_e32 v9, v9, v23
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v22
+; GFX8-NEXT: v_sub_u16_e32 v22, 0, v10
+; GFX8-NEXT: v_max_i16_e32 v10, v10, v22
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX8-NEXT: v_sub_u16_e32 v21, 0, v11
+; GFX8-NEXT: v_max_i16_e32 v11, v11, v21
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v20
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v12
+; GFX8-NEXT: v_max_i16_e32 v12, v12, v20
+; GFX8-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v13
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v15
+; GFX8-NEXT: v_max_i16_e32 v13, v13, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v14
+; GFX8-NEXT: v_max_i16_e32 v14, v14, v19
+; GFX8-NEXT: v_max_i16_e32 v15, v15, v20
+; GFX8-NEXT: v_or_b32_e32 v13, v13, v18
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v17
+; GFX8-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX9-NEXT: v_sub_u16_e32 v17, 0, v16
+; GFX9-NEXT: v_max_i16_e32 v16, v16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX9-NEXT: v_sub_u16_e32 v18, 0, v17
+; GFX9-NEXT: v_max_i16_e32 v17, v17, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX9-NEXT: v_sub_u16_e32 v19, 0, v18
+; GFX9-NEXT: v_max_i16_e32 v18, v18, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX9-NEXT: v_sub_u16_e32 v20, 0, v19
+; GFX9-NEXT: v_max_i16_e32 v19, v19, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX9-NEXT: v_sub_u16_e32 v21, 0, v20
+; GFX9-NEXT: v_max_i16_e32 v20, v20, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX9-NEXT: v_sub_u16_e32 v22, 0, v21
+; GFX9-NEXT: v_max_i16_e32 v21, v21, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX9-NEXT: v_sub_u16_e32 v23, 0, v22
+; GFX9-NEXT: v_max_i16_e32 v22, v22, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX9-NEXT: v_sub_u16_e32 v24, 0, v23
+; GFX9-NEXT: v_max_i16_e32 v23, v23, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v7
+; GFX9-NEXT: v_sub_u16_e32 v25, 0, v24
+; GFX9-NEXT: v_max_i16_e32 v24, v24, v25
+; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX9-NEXT: v_sub_u16_e32 v26, 0, v25
+; GFX9-NEXT: v_max_i16_e32 v25, v25, v26
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX9-NEXT: v_sub_u16_e32 v27, 0, v26
+; GFX9-NEXT: v_max_i16_e32 v26, v26, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX9-NEXT: v_sub_u16_e32 v28, 0, v27
+; GFX9-NEXT: v_max_i16_e32 v27, v27, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX9-NEXT: v_sub_u16_e32 v29, 0, v28
+; GFX9-NEXT: v_max_i16_e32 v28, v28, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX9-NEXT: v_sub_u16_e32 v30, 0, v29
+; GFX9-NEXT: v_max_i16_e32 v29, v29, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX9-NEXT: v_sub_u16_e32 v31, 0, v30
+; GFX9-NEXT: v_max_i16_e32 v30, v30, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v31
+; GFX9-NEXT: v_max_i16_e32 v31, v31, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v15
+; GFX9-NEXT: v_max_i16_e32 v15, v15, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v14
+; GFX9-NEXT: v_max_i16_e32 v14, v14, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v13
+; GFX9-NEXT: v_max_i16_e32 v13, v13, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v12
+; GFX9-NEXT: v_max_i16_e32 v12, v12, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v11
+; GFX9-NEXT: v_max_i16_e32 v11, v11, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v10
+; GFX9-NEXT: v_max_i16_e32 v10, v10, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v9
+; GFX9-NEXT: v_max_i16_e32 v9, v9, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v8
+; GFX9-NEXT: v_max_i16_e32 v8, v8, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v7
+; GFX9-NEXT: v_max_i16_e32 v7, v7, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v6
+; GFX9-NEXT: v_max_i16_e32 v6, v6, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v5
+; GFX9-NEXT: v_max_i16_e32 v5, v5, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v4
+; GFX9-NEXT: v_max_i16_e32 v4, v4, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v3
+; GFX9-NEXT: v_max_i16_e32 v3, v3, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v2
+; GFX9-NEXT: v_max_i16_e32 v2, v2, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v1
+; GFX9-NEXT: v_max_i16_e32 v1, v1, v32
+; GFX9-NEXT: v_sub_u16_e32 v32, 0, v0
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v32
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v31, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v30, v1, s4
+; GFX9-NEXT: v_perm_b32 v2, v29, v2, s4
+; GFX9-NEXT: v_perm_b32 v3, v28, v3, s4
+; GFX9-NEXT: v_perm_b32 v4, v27, v4, s4
+; GFX9-NEXT: v_perm_b32 v5, v26, v5, s4
+; GFX9-NEXT: v_perm_b32 v6, v25, v6, s4
+; GFX9-NEXT: v_perm_b32 v7, v24, v7, s4
+; GFX9-NEXT: v_perm_b32 v8, v23, v8, s4
+; GFX9-NEXT: v_perm_b32 v9, v22, v9, s4
+; GFX9-NEXT: v_perm_b32 v10, v21, v10, s4
+; GFX9-NEXT: v_perm_b32 v11, v20, v11, s4
+; GFX9-NEXT: v_perm_b32 v12, v19, v12, s4
+; GFX9-NEXT: v_perm_b32 v13, v18, v13, s4
+; GFX9-NEXT: v_perm_b32 v14, v17, v14, s4
+; GFX9-NEXT: v_perm_b32 v15, v16, v15, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v32i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v11
+; GFX10-NEXT: v_sub_nc_u16 v17, 0, v16
+; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v10
+; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX10-NEXT: v_sub_nc_u16 v23, 0, v20
+; GFX10-NEXT: v_sub_nc_u16 v24, 0, v21
+; GFX10-NEXT: v_max_i16 v16, v16, v17
+; GFX10-NEXT: v_sub_nc_u16 v17, 0, v18
+; GFX10-NEXT: v_sub_nc_u16 v25, 0, v22
+; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX10-NEXT: v_max_i16 v17, v18, v17
+; GFX10-NEXT: v_sub_nc_u16 v18, 0, v19
+; GFX10-NEXT: v_sub_nc_u16 v30, 0, v27
+; GFX10-NEXT: v_sub_nc_u16 v36, 0, v15
+; GFX10-NEXT: v_sub_nc_u16 v35, 0, v32
+; GFX10-NEXT: v_max_i16 v18, v19, v18
+; GFX10-NEXT: v_max_i16 v19, v20, v23
+; GFX10-NEXT: v_max_i16 v20, v21, v24
+; GFX10-NEXT: v_max_i16 v21, v22, v25
+; GFX10-NEXT: v_sub_nc_u16 v22, 0, v26
+; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX10-NEXT: v_max_i16 v15, v15, v36
+; GFX10-NEXT: v_max_i16 v22, v26, v22
+; GFX10-NEXT: v_sub_nc_u16 v26, 0, v23
+; GFX10-NEXT: v_sub_nc_u16 v28, 0, v24
+; GFX10-NEXT: v_sub_nc_u16 v29, 0, v25
+; GFX10-NEXT: v_sub_nc_u16 v36, 0, v10
+; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
+; GFX10-NEXT: v_max_i16 v23, v23, v26
+; GFX10-NEXT: v_max_i16 v24, v24, v28
+; GFX10-NEXT: v_max_i16 v25, v25, v29
+; GFX10-NEXT: v_max_i16 v26, v27, v30
+; GFX10-NEXT: v_sub_nc_u16 v27, 0, v31
+; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX10-NEXT: v_max_i16 v10, v10, v36
+; GFX10-NEXT: v_max_i16 v27, v31, v27
+; GFX10-NEXT: v_sub_nc_u16 v31, 0, v28
+; GFX10-NEXT: v_sub_nc_u16 v33, 0, v29
+; GFX10-NEXT: v_sub_nc_u16 v34, 0, v30
+; GFX10-NEXT: v_sub_nc_u16 v36, 0, v5
+; GFX10-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
+; GFX10-NEXT: v_max_i16 v28, v28, v31
+; GFX10-NEXT: v_max_i16 v29, v29, v33
+; GFX10-NEXT: v_max_i16 v30, v30, v34
+; GFX10-NEXT: v_max_i16 v31, v32, v35
+; GFX10-NEXT: v_sub_nc_u16 v32, 0, v14
+; GFX10-NEXT: v_sub_nc_u16 v33, 0, v13
+; GFX10-NEXT: v_sub_nc_u16 v34, 0, v12
+; GFX10-NEXT: v_sub_nc_u16 v35, 0, v11
+; GFX10-NEXT: v_max_i16 v5, v5, v36
+; GFX10-NEXT: v_max_i16 v14, v14, v32
+; GFX10-NEXT: v_max_i16 v13, v13, v33
+; GFX10-NEXT: v_max_i16 v12, v12, v34
+; GFX10-NEXT: v_max_i16 v11, v11, v35
+; GFX10-NEXT: v_sub_nc_u16 v32, 0, v9
+; GFX10-NEXT: v_sub_nc_u16 v33, 0, v8
+; GFX10-NEXT: v_sub_nc_u16 v34, 0, v7
+; GFX10-NEXT: v_sub_nc_u16 v35, 0, v6
+; GFX10-NEXT: v_sub_nc_u16 v36, 0, v4
+; GFX10-NEXT: v_max_i16 v9, v9, v32
+; GFX10-NEXT: v_max_i16 v8, v8, v33
+; GFX10-NEXT: v_max_i16 v7, v7, v34
+; GFX10-NEXT: v_max_i16 v6, v6, v35
+; GFX10-NEXT: v_sub_nc_u16 v32, 0, v0
+; GFX10-NEXT: v_sub_nc_u16 v33, 0, v1
+; GFX10-NEXT: v_sub_nc_u16 v34, 0, v2
+; GFX10-NEXT: v_sub_nc_u16 v35, 0, v3
+; GFX10-NEXT: v_max_i16 v4, v4, v36
+; GFX10-NEXT: v_max_i16 v0, v0, v32
+; GFX10-NEXT: v_max_i16 v1, v1, v33
+; GFX10-NEXT: v_max_i16 v2, v2, v34
+; GFX10-NEXT: v_max_i16 v3, v3, v35
+; GFX10-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX10-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX10-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX10-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX10-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX10-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX10-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX10-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
+; GFX10-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
+; GFX10-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
+; GFX10-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
+; GFX10-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
+; GFX10-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
+; GFX10-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v32i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v11
+; GFX11-NEXT: v_sub_nc_u16 v17, 0, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX11-NEXT: v_sub_nc_u16 v23, 0, v20
+; GFX11-NEXT: v_sub_nc_u16 v24, 0, v21
+; GFX11-NEXT: v_max_i16 v16, v16, v17
+; GFX11-NEXT: v_sub_nc_u16 v17, 0, v18
+; GFX11-NEXT: v_sub_nc_u16 v25, 0, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX11-NEXT: v_max_i16 v17, v18, v17
+; GFX11-NEXT: v_sub_nc_u16 v18, 0, v19
+; GFX11-NEXT: v_sub_nc_u16 v30, 0, v27
+; GFX11-NEXT: v_sub_nc_u16 v36, 0, v15
+; GFX11-NEXT: v_sub_nc_u16 v35, 0, v32
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_max_i16 v18, v19, v18
+; GFX11-NEXT: v_max_i16 v19, v20, v23
+; GFX11-NEXT: v_max_i16 v20, v21, v24
+; GFX11-NEXT: v_max_i16 v21, v22, v25
+; GFX11-NEXT: v_sub_nc_u16 v22, 0, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX11-NEXT: v_max_i16 v15, v15, v36
+; GFX11-NEXT: v_max_i16 v22, v26, v22
+; GFX11-NEXT: v_sub_nc_u16 v26, 0, v23
+; GFX11-NEXT: v_sub_nc_u16 v28, 0, v24
+; GFX11-NEXT: v_sub_nc_u16 v29, 0, v25
+; GFX11-NEXT: v_sub_nc_u16 v36, 0, v10
+; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
+; GFX11-NEXT: v_max_i16 v23, v23, v26
+; GFX11-NEXT: v_max_i16 v24, v24, v28
+; GFX11-NEXT: v_max_i16 v25, v25, v29
+; GFX11-NEXT: v_max_i16 v26, v27, v30
+; GFX11-NEXT: v_sub_nc_u16 v27, 0, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX11-NEXT: v_max_i16 v10, v10, v36
+; GFX11-NEXT: v_max_i16 v27, v31, v27
+; GFX11-NEXT: v_sub_nc_u16 v31, 0, v28
+; GFX11-NEXT: v_sub_nc_u16 v33, 0, v29
+; GFX11-NEXT: v_sub_nc_u16 v34, 0, v30
+; GFX11-NEXT: v_sub_nc_u16 v36, 0, v5
+; GFX11-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
+; GFX11-NEXT: v_max_i16 v28, v28, v31
+; GFX11-NEXT: v_max_i16 v29, v29, v33
+; GFX11-NEXT: v_max_i16 v30, v30, v34
+; GFX11-NEXT: v_max_i16 v31, v32, v35
+; GFX11-NEXT: v_sub_nc_u16 v32, 0, v14
+; GFX11-NEXT: v_sub_nc_u16 v33, 0, v13
+; GFX11-NEXT: v_sub_nc_u16 v34, 0, v12
+; GFX11-NEXT: v_sub_nc_u16 v35, 0, v11
+; GFX11-NEXT: v_max_i16 v5, v5, v36
+; GFX11-NEXT: v_max_i16 v14, v14, v32
+; GFX11-NEXT: v_max_i16 v13, v13, v33
+; GFX11-NEXT: v_max_i16 v12, v12, v34
+; GFX11-NEXT: v_max_i16 v11, v11, v35
+; GFX11-NEXT: v_sub_nc_u16 v32, 0, v9
+; GFX11-NEXT: v_sub_nc_u16 v33, 0, v8
+; GFX11-NEXT: v_sub_nc_u16 v34, 0, v7
+; GFX11-NEXT: v_sub_nc_u16 v35, 0, v6
+; GFX11-NEXT: v_sub_nc_u16 v36, 0, v4
+; GFX11-NEXT: v_max_i16 v9, v9, v32
+; GFX11-NEXT: v_max_i16 v8, v8, v33
+; GFX11-NEXT: v_max_i16 v7, v7, v34
+; GFX11-NEXT: v_max_i16 v6, v6, v35
+; GFX11-NEXT: v_sub_nc_u16 v32, 0, v0
+; GFX11-NEXT: v_sub_nc_u16 v33, 0, v1
+; GFX11-NEXT: v_sub_nc_u16 v34, 0, v2
+; GFX11-NEXT: v_sub_nc_u16 v35, 0, v3
+; GFX11-NEXT: v_max_i16 v4, v4, v36
+; GFX11-NEXT: v_max_i16 v0, v0, v32
+; GFX11-NEXT: v_max_i16 v1, v1, v33
+; GFX11-NEXT: v_max_i16 v2, v2, v34
+; GFX11-NEXT: v_max_i16 v3, v3, v35
+; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
+; GFX11-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
+; GFX11-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v32i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX12-NEXT: v_lshrrev_b32_e32 v18, 16, v14
+; GFX12-NEXT: v_lshrrev_b32_e32 v19, 16, v13
+; GFX12-NEXT: v_lshrrev_b32_e32 v20, 16, v12
+; GFX12-NEXT: v_lshrrev_b32_e32 v21, 16, v11
+; GFX12-NEXT: v_sub_nc_u16 v17, 0, v16
+; GFX12-NEXT: v_lshrrev_b32_e32 v22, 16, v10
+; GFX12-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GFX12-NEXT: v_sub_nc_u16 v23, 0, v20
+; GFX12-NEXT: v_sub_nc_u16 v24, 0, v21
+; GFX12-NEXT: v_max_i16 v16, v16, v17
+; GFX12-NEXT: v_sub_nc_u16 v17, 0, v18
+; GFX12-NEXT: v_sub_nc_u16 v25, 0, v22
+; GFX12-NEXT: v_lshrrev_b32_e32 v27, 16, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v31, 16, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX12-NEXT: v_max_i16 v17, v18, v17
+; GFX12-NEXT: v_sub_nc_u16 v18, 0, v19
+; GFX12-NEXT: v_sub_nc_u16 v30, 0, v27
+; GFX12-NEXT: v_sub_nc_u16 v36, 0, v15
+; GFX12-NEXT: v_sub_nc_u16 v35, 0, v32
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_max_i16 v18, v19, v18
+; GFX12-NEXT: v_max_i16 v19, v20, v23
+; GFX12-NEXT: v_max_i16 v20, v21, v24
+; GFX12-NEXT: v_max_i16 v21, v22, v25
+; GFX12-NEXT: v_sub_nc_u16 v22, 0, v26
+; GFX12-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX12-NEXT: v_lshrrev_b32_e32 v24, 16, v7
+; GFX12-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX12-NEXT: v_max_i16 v15, v15, v36
+; GFX12-NEXT: v_max_i16 v22, v26, v22
+; GFX12-NEXT: v_sub_nc_u16 v26, 0, v23
+; GFX12-NEXT: v_sub_nc_u16 v28, 0, v24
+; GFX12-NEXT: v_sub_nc_u16 v29, 0, v25
+; GFX12-NEXT: v_sub_nc_u16 v36, 0, v10
+; GFX12-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
+; GFX12-NEXT: v_max_i16 v23, v23, v26
+; GFX12-NEXT: v_max_i16 v24, v24, v28
+; GFX12-NEXT: v_max_i16 v25, v25, v29
+; GFX12-NEXT: v_max_i16 v26, v27, v30
+; GFX12-NEXT: v_sub_nc_u16 v27, 0, v31
+; GFX12-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX12-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX12-NEXT: v_max_i16 v10, v10, v36
+; GFX12-NEXT: v_max_i16 v27, v31, v27
+; GFX12-NEXT: v_sub_nc_u16 v31, 0, v28
+; GFX12-NEXT: v_sub_nc_u16 v33, 0, v29
+; GFX12-NEXT: v_sub_nc_u16 v34, 0, v30
+; GFX12-NEXT: v_sub_nc_u16 v36, 0, v5
+; GFX12-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
+; GFX12-NEXT: v_max_i16 v28, v28, v31
+; GFX12-NEXT: v_max_i16 v29, v29, v33
+; GFX12-NEXT: v_max_i16 v30, v30, v34
+; GFX12-NEXT: v_max_i16 v31, v32, v35
+; GFX12-NEXT: v_sub_nc_u16 v32, 0, v14
+; GFX12-NEXT: v_sub_nc_u16 v33, 0, v13
+; GFX12-NEXT: v_sub_nc_u16 v34, 0, v12
+; GFX12-NEXT: v_sub_nc_u16 v35, 0, v11
+; GFX12-NEXT: v_max_i16 v5, v5, v36
+; GFX12-NEXT: v_max_i16 v14, v14, v32
+; GFX12-NEXT: v_max_i16 v13, v13, v33
+; GFX12-NEXT: v_max_i16 v12, v12, v34
+; GFX12-NEXT: v_max_i16 v11, v11, v35
+; GFX12-NEXT: v_sub_nc_u16 v32, 0, v9
+; GFX12-NEXT: v_sub_nc_u16 v33, 0, v8
+; GFX12-NEXT: v_sub_nc_u16 v34, 0, v7
+; GFX12-NEXT: v_sub_nc_u16 v35, 0, v6
+; GFX12-NEXT: v_sub_nc_u16 v36, 0, v4
+; GFX12-NEXT: v_max_i16 v9, v9, v32
+; GFX12-NEXT: v_max_i16 v8, v8, v33
+; GFX12-NEXT: v_max_i16 v7, v7, v34
+; GFX12-NEXT: v_max_i16 v6, v6, v35
+; GFX12-NEXT: v_sub_nc_u16 v32, 0, v0
+; GFX12-NEXT: v_sub_nc_u16 v33, 0, v1
+; GFX12-NEXT: v_sub_nc_u16 v34, 0, v2
+; GFX12-NEXT: v_sub_nc_u16 v35, 0, v3
+; GFX12-NEXT: v_max_i16 v4, v4, v36
+; GFX12-NEXT: v_max_i16 v0, v0, v32
+; GFX12-NEXT: v_max_i16 v1, v1, v33
+; GFX12-NEXT: v_max_i16 v2, v2, v34
+; GFX12-NEXT: v_max_i16 v3, v3, v35
+; GFX12-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX12-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
+; GFX12-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
+; GFX12-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX12-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX12-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX12-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX12-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
+; GFX12-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
+; GFX12-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
+; GFX12-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
+; GFX12-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
+; GFX12-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
+; GFX12-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false)
+ ret <32 x i16> %res
+}
>From c88c7baa9087a7e41209111ac662597d9d09b00d Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Mon, 10 Jun 2024 14:36:41 +0200
Subject: [PATCH 2/4] [AMDGPU] Fix expansion for v_abs_v4i16
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81098201e9c0f..c030a9117340a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -791,7 +791,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
- ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
+ ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
ISD::SSUBSAT},
VT, Custom);
@@ -5804,6 +5804,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
+ case ISD::ABS:
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
>From dddbc2b42f1e0bdd6db157d1d53fb305b0ff534d Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Mon, 10 Jun 2024 15:03:54 +0200
Subject: [PATCH 3/4] update tests
---
llvm/test/CodeGen/AMDGPU/abs.ll | 976 ++++++++------------------------
1 file changed, 240 insertions(+), 736 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/abs.ll b/llvm/test/CodeGen/AMDGPU/abs.ll
index a28d4ef07babc..fdea3963f3ae6 100644
--- a/llvm/test/CodeGen/AMDGPU/abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs.ll
@@ -75,57 +75,29 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX9-LABEL: v_abs_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX9-NEXT: v_sub_u16_e32 v3, 0, v2
-; GFX9-NEXT: v_max_i16_e32 v2, v2, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX9-NEXT: v_sub_u16_e32 v4, 0, v3
-; GFX9-NEXT: v_max_i16_e32 v3, v3, v4
-; GFX9-NEXT: v_sub_u16_e32 v4, 0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX9-NEXT: v_sub_u16_e32 v4, 0, v0
-; GFX9-NEXT: v_max_i16_e32 v0, v0, v4
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_abs_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT: v_sub_nc_u16 v4, 0, v1
-; GFX10-NEXT: v_sub_nc_u16 v5, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v6, 0, v2
-; GFX10-NEXT: v_sub_nc_u16 v7, 0, v3
-; GFX10-NEXT: v_max_i16 v1, v1, v4
-; GFX10-NEXT: v_max_i16 v0, v0, v5
-; GFX10-NEXT: v_max_i16 v2, v2, v6
-; GFX10-NEXT: v_max_i16 v3, v3, v7
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_abs_v4i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT: v_sub_nc_u16 v4, 0, v1
-; GFX11-NEXT: v_sub_nc_u16 v5, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_sub_nc_u16 v6, 0, v2
-; GFX11-NEXT: v_sub_nc_u16 v7, 0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_i16 v1, v1, v4
-; GFX11-NEXT: v_max_i16 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_i16 v2, v2, v6
-; GFX11-NEXT: v_max_i16 v3, v3, v7
+; GFX11-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v3, 0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_abs_v4i16:
@@ -135,22 +107,11 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX12-NEXT: v_sub_nc_u16 v4, 0, v1
-; GFX12-NEXT: v_sub_nc_u16 v5, 0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_sub_nc_u16 v6, 0, v2
-; GFX12-NEXT: v_sub_nc_u16 v7, 0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_max_i16 v1, v1, v4
-; GFX12-NEXT: v_max_i16 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_max_i16 v2, v2, v6
-; GFX12-NEXT: v_max_i16 v3, v3, v7
+; GFX12-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v3, 0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX12-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %arg, i1 false)
ret <4 x i16> %res
@@ -271,93 +232,42 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX9-LABEL: v_abs_v8i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX9-NEXT: v_sub_u16_e32 v5, 0, v4
-; GFX9-NEXT: v_max_i16_e32 v4, v4, v5
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT: v_sub_u16_e32 v6, 0, v5
-; GFX9-NEXT: v_max_i16_e32 v5, v5, v6
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX9-NEXT: v_sub_u16_e32 v7, 0, v6
-; GFX9-NEXT: v_max_i16_e32 v6, v6, v7
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX9-NEXT: v_sub_u16_e32 v8, 0, v7
-; GFX9-NEXT: v_max_i16_e32 v7, v7, v8
-; GFX9-NEXT: v_sub_u16_e32 v8, 0, v3
-; GFX9-NEXT: v_max_i16_e32 v3, v3, v8
-; GFX9-NEXT: v_sub_u16_e32 v8, 0, v2
-; GFX9-NEXT: v_max_i16_e32 v2, v2, v8
-; GFX9-NEXT: v_sub_u16_e32 v8, 0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v8
-; GFX9-NEXT: v_sub_u16_e32 v8, 0, v0
-; GFX9-NEXT: v_max_i16_e32 v0, v0, v8
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v6, v1, s4
-; GFX9-NEXT: v_perm_b32 v2, v5, v2, s4
-; GFX9-NEXT: v_perm_b32 v3, v4, v3, s4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_abs_v8i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX10-NEXT: v_sub_nc_u16 v11, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v8, 0, v4
-; GFX10-NEXT: v_sub_nc_u16 v9, 0, v5
-; GFX10-NEXT: v_sub_nc_u16 v10, 0, v6
-; GFX10-NEXT: v_sub_nc_u16 v12, 0, v7
-; GFX10-NEXT: v_max_i16 v0, v0, v11
-; GFX10-NEXT: v_max_i16 v4, v4, v8
-; GFX10-NEXT: v_max_i16 v5, v5, v9
-; GFX10-NEXT: v_max_i16 v6, v6, v10
-; GFX10-NEXT: v_sub_nc_u16 v8, 0, v1
-; GFX10-NEXT: v_sub_nc_u16 v9, 0, v2
-; GFX10-NEXT: v_sub_nc_u16 v10, 0, v3
-; GFX10-NEXT: v_max_i16 v7, v7, v12
-; GFX10-NEXT: v_max_i16 v1, v1, v8
-; GFX10-NEXT: v_max_i16 v2, v2, v9
-; GFX10-NEXT: v_max_i16 v3, v3, v10
-; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v5, v2, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
+; GFX10-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v5, 0, v1
+; GFX10-NEXT: v_pk_sub_i16 v6, 0, v2
+; GFX10-NEXT: v_pk_sub_i16 v7, 0, v3
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v5
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v6
+; GFX10-NEXT: v_pk_max_i16 v3, v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_abs_v8i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_sub_nc_u16 v11, 0, v0
-; GFX11-NEXT: v_sub_nc_u16 v8, 0, v4
-; GFX11-NEXT: v_sub_nc_u16 v9, 0, v5
-; GFX11-NEXT: v_sub_nc_u16 v10, 0, v6
-; GFX11-NEXT: v_sub_nc_u16 v12, 0, v7
-; GFX11-NEXT: v_max_i16 v0, v0, v11
-; GFX11-NEXT: v_max_i16 v4, v4, v8
-; GFX11-NEXT: v_max_i16 v5, v5, v9
-; GFX11-NEXT: v_max_i16 v6, v6, v10
-; GFX11-NEXT: v_sub_nc_u16 v8, 0, v1
-; GFX11-NEXT: v_sub_nc_u16 v9, 0, v2
-; GFX11-NEXT: v_sub_nc_u16 v10, 0, v3
-; GFX11-NEXT: v_max_i16 v7, v7, v12
+; GFX11-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v5, 0, v1
+; GFX11-NEXT: v_pk_sub_i16 v6, 0, v2
+; GFX11-NEXT: v_pk_sub_i16 v7, 0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_i16 v1, v1, v8
-; GFX11-NEXT: v_max_i16 v2, v2, v9
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_max_i16 v3, v3, v10
-; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x5040100
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v6
+; GFX11-NEXT: v_pk_max_i16 v3, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_abs_v8i16:
@@ -367,34 +277,16 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT: v_sub_nc_u16 v11, 0, v0
-; GFX12-NEXT: v_sub_nc_u16 v8, 0, v4
-; GFX12-NEXT: v_sub_nc_u16 v9, 0, v5
-; GFX12-NEXT: v_sub_nc_u16 v10, 0, v6
-; GFX12-NEXT: v_sub_nc_u16 v12, 0, v7
-; GFX12-NEXT: v_max_i16 v0, v0, v11
-; GFX12-NEXT: v_max_i16 v4, v4, v8
-; GFX12-NEXT: v_max_i16 v5, v5, v9
-; GFX12-NEXT: v_max_i16 v6, v6, v10
-; GFX12-NEXT: v_sub_nc_u16 v8, 0, v1
-; GFX12-NEXT: v_sub_nc_u16 v9, 0, v2
-; GFX12-NEXT: v_sub_nc_u16 v10, 0, v3
-; GFX12-NEXT: v_max_i16 v7, v7, v12
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_max_i16 v1, v1, v8
-; GFX12-NEXT: v_max_i16 v2, v2, v9
+; GFX12-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v5, 0, v1
+; GFX12-NEXT: v_pk_sub_i16 v6, 0, v2
+; GFX12-NEXT: v_pk_sub_i16 v7, 0, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_max_i16 v3, v3, v10
-; GFX12-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_perm_b32 v1, v6, v1, 0x5040100
-; GFX12-NEXT: v_perm_b32 v2, v5, v2, 0x5040100
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v6
+; GFX12-NEXT: v_pk_max_i16 v3, v3, v7
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %arg, i1 false)
ret <8 x i16> %res
@@ -612,161 +504,64 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX9-LABEL: v_abs_v16i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX9-NEXT: v_sub_u16_e32 v9, 0, v8
-; GFX9-NEXT: v_max_i16_e32 v8, v8, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX9-NEXT: v_sub_u16_e32 v10, 0, v9
-; GFX9-NEXT: v_max_i16_e32 v9, v9, v10
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v5
-; GFX9-NEXT: v_sub_u16_e32 v11, 0, v10
-; GFX9-NEXT: v_max_i16_e32 v10, v10, v11
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v4
-; GFX9-NEXT: v_sub_u16_e32 v12, 0, v11
-; GFX9-NEXT: v_max_i16_e32 v11, v11, v12
-; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v3
-; GFX9-NEXT: v_sub_u16_e32 v13, 0, v12
-; GFX9-NEXT: v_max_i16_e32 v12, v12, v13
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v2
-; GFX9-NEXT: v_sub_u16_e32 v14, 0, v13
-; GFX9-NEXT: v_max_i16_e32 v13, v13, v14
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1
-; GFX9-NEXT: v_sub_u16_e32 v15, 0, v14
-; GFX9-NEXT: v_max_i16_e32 v14, v14, v15
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v0
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v15
-; GFX9-NEXT: v_max_i16_e32 v15, v15, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v7
-; GFX9-NEXT: v_max_i16_e32 v7, v7, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v6
-; GFX9-NEXT: v_max_i16_e32 v6, v6, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v5
-; GFX9-NEXT: v_max_i16_e32 v5, v5, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v4
-; GFX9-NEXT: v_max_i16_e32 v4, v4, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v3
-; GFX9-NEXT: v_max_i16_e32 v3, v3, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v2
-; GFX9-NEXT: v_max_i16_e32 v2, v2, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v16
-; GFX9-NEXT: v_sub_u16_e32 v16, 0, v0
-; GFX9-NEXT: v_max_i16_e32 v0, v0, v16
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v15, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v14, v1, s4
-; GFX9-NEXT: v_perm_b32 v2, v13, v2, s4
-; GFX9-NEXT: v_perm_b32 v3, v12, v3, s4
-; GFX9-NEXT: v_perm_b32 v4, v11, v4, s4
-; GFX9-NEXT: v_perm_b32 v5, v10, v5, s4
-; GFX9-NEXT: v_perm_b32 v6, v9, v6, s4
-; GFX9-NEXT: v_perm_b32 v7, v8, v7, s4
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v4
+; GFX9-NEXT: v_pk_max_i16 v4, v4, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v5
+; GFX9-NEXT: v_pk_max_i16 v5, v5, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v6
+; GFX9-NEXT: v_pk_max_i16 v6, v6, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v7
+; GFX9-NEXT: v_pk_max_i16 v7, v7, v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_abs_v16i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v2
-; GFX10-NEXT: v_sub_nc_u16 v11, 0, v8
-; GFX10-NEXT: v_sub_nc_u16 v13, 0, v9
-; GFX10-NEXT: v_sub_nc_u16 v14, 0, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX10-NEXT: v_sub_nc_u16 v16, 0, v15
-; GFX10-NEXT: v_max_i16 v8, v8, v11
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX10-NEXT: v_max_i16 v9, v9, v13
-; GFX10-NEXT: v_max_i16 v10, v10, v14
-; GFX10-NEXT: v_sub_nc_u16 v13, 0, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX10-NEXT: v_sub_nc_u16 v14, 0, v11
-; GFX10-NEXT: v_sub_nc_u16 v19, 0, v6
-; GFX10-NEXT: v_sub_nc_u16 v20, 0, v5
-; GFX10-NEXT: v_max_i16 v12, v12, v13
-; GFX10-NEXT: v_max_i16 v13, v15, v16
-; GFX10-NEXT: v_max_i16 v11, v11, v14
-; GFX10-NEXT: v_sub_nc_u16 v14, 0, v17
-; GFX10-NEXT: v_sub_nc_u16 v15, 0, v18
-; GFX10-NEXT: v_sub_nc_u16 v16, 0, v7
-; GFX10-NEXT: v_max_i16 v6, v6, v19
-; GFX10-NEXT: v_max_i16 v5, v5, v20
-; GFX10-NEXT: v_max_i16 v14, v17, v14
-; GFX10-NEXT: v_max_i16 v15, v18, v15
-; GFX10-NEXT: v_max_i16 v7, v7, v16
-; GFX10-NEXT: v_sub_nc_u16 v16, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v17, 0, v1
-; GFX10-NEXT: v_sub_nc_u16 v18, 0, v2
-; GFX10-NEXT: v_sub_nc_u16 v19, 0, v3
-; GFX10-NEXT: v_sub_nc_u16 v20, 0, v4
-; GFX10-NEXT: v_max_i16 v0, v0, v16
-; GFX10-NEXT: v_max_i16 v1, v1, v17
-; GFX10-NEXT: v_max_i16 v2, v2, v18
-; GFX10-NEXT: v_max_i16 v3, v3, v19
-; GFX10-NEXT: v_max_i16 v4, v4, v20
-; GFX10-NEXT: v_perm_b32 v0, v15, v0, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
-; GFX10-NEXT: v_perm_b32 v4, v12, v4, 0x5040100
-; GFX10-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX10-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v9, 0, v1
+; GFX10-NEXT: v_pk_sub_i16 v10, 0, v2
+; GFX10-NEXT: v_pk_sub_i16 v11, 0, v6
+; GFX10-NEXT: v_pk_sub_i16 v12, 0, v7
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v9
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v10
+; GFX10-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX10-NEXT: v_pk_sub_i16 v9, 0, v4
+; GFX10-NEXT: v_pk_sub_i16 v10, 0, v5
+; GFX10-NEXT: v_pk_max_i16 v6, v6, v11
+; GFX10-NEXT: v_pk_max_i16 v7, v7, v12
+; GFX10-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX10-NEXT: v_pk_max_i16 v4, v4, v9
+; GFX10-NEXT: v_pk_max_i16 v5, v5, v10
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_abs_v16i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v2
-; GFX11-NEXT: v_sub_nc_u16 v11, 0, v8
-; GFX11-NEXT: v_sub_nc_u16 v13, 0, v9
-; GFX11-NEXT: v_sub_nc_u16 v14, 0, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX11-NEXT: v_sub_nc_u16 v16, 0, v15
-; GFX11-NEXT: v_max_i16 v8, v8, v11
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-NEXT: v_max_i16 v9, v9, v13
-; GFX11-NEXT: v_max_i16 v10, v10, v14
-; GFX11-NEXT: v_sub_nc_u16 v13, 0, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX11-NEXT: v_sub_nc_u16 v14, 0, v11
-; GFX11-NEXT: v_sub_nc_u16 v19, 0, v6
-; GFX11-NEXT: v_sub_nc_u16 v20, 0, v5
-; GFX11-NEXT: v_max_i16 v12, v12, v13
-; GFX11-NEXT: v_max_i16 v13, v15, v16
-; GFX11-NEXT: v_max_i16 v11, v11, v14
-; GFX11-NEXT: v_sub_nc_u16 v14, 0, v17
-; GFX11-NEXT: v_sub_nc_u16 v15, 0, v18
-; GFX11-NEXT: v_sub_nc_u16 v16, 0, v7
-; GFX11-NEXT: v_max_i16 v6, v6, v19
-; GFX11-NEXT: v_max_i16 v5, v5, v20
-; GFX11-NEXT: v_max_i16 v14, v17, v14
-; GFX11-NEXT: v_max_i16 v15, v18, v15
-; GFX11-NEXT: v_max_i16 v7, v7, v16
-; GFX11-NEXT: v_sub_nc_u16 v16, 0, v0
-; GFX11-NEXT: v_sub_nc_u16 v17, 0, v1
-; GFX11-NEXT: v_sub_nc_u16 v18, 0, v2
-; GFX11-NEXT: v_sub_nc_u16 v19, 0, v3
-; GFX11-NEXT: v_sub_nc_u16 v20, 0, v4
-; GFX11-NEXT: v_max_i16 v0, v0, v16
-; GFX11-NEXT: v_max_i16 v1, v1, v17
-; GFX11-NEXT: v_max_i16 v2, v2, v18
-; GFX11-NEXT: v_max_i16 v3, v3, v19
-; GFX11-NEXT: v_max_i16 v4, v4, v20
-; GFX11-NEXT: v_perm_b32 v0, v15, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v12, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v9, 0, v1
+; GFX11-NEXT: v_pk_sub_i16 v10, 0, v2
+; GFX11-NEXT: v_pk_sub_i16 v11, 0, v6
+; GFX11-NEXT: v_pk_sub_i16 v12, 0, v7
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v9
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v10
+; GFX11-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX11-NEXT: v_pk_sub_i16 v9, 0, v4
+; GFX11-NEXT: v_pk_sub_i16 v10, 0, v5
+; GFX11-NEXT: v_pk_max_i16 v6, v6, v11
+; GFX11-NEXT: v_pk_max_i16 v7, v7, v12
+; GFX11-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX11-NEXT: v_pk_max_i16 v4, v4, v9
+; GFX11-NEXT: v_pk_max_i16 v5, v5, v10
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_abs_v16i16:
@@ -776,54 +571,22 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v8, 16, v7
-; GFX12-NEXT: v_lshrrev_b32_e32 v9, 16, v6
-; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v5
-; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v4
-; GFX12-NEXT: v_lshrrev_b32_e32 v15, 16, v2
-; GFX12-NEXT: v_sub_nc_u16 v11, 0, v8
-; GFX12-NEXT: v_sub_nc_u16 v13, 0, v9
-; GFX12-NEXT: v_sub_nc_u16 v14, 0, v10
-; GFX12-NEXT: v_lshrrev_b32_e32 v17, 16, v1
-; GFX12-NEXT: v_sub_nc_u16 v16, 0, v15
-; GFX12-NEXT: v_max_i16 v8, v8, v11
-; GFX12-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX12-NEXT: v_max_i16 v9, v9, v13
-; GFX12-NEXT: v_max_i16 v10, v10, v14
-; GFX12-NEXT: v_sub_nc_u16 v13, 0, v12
-; GFX12-NEXT: v_lshrrev_b32_e32 v18, 16, v0
-; GFX12-NEXT: v_sub_nc_u16 v14, 0, v11
-; GFX12-NEXT: v_sub_nc_u16 v19, 0, v6
-; GFX12-NEXT: v_sub_nc_u16 v20, 0, v5
-; GFX12-NEXT: v_max_i16 v12, v12, v13
-; GFX12-NEXT: v_max_i16 v13, v15, v16
-; GFX12-NEXT: v_max_i16 v11, v11, v14
-; GFX12-NEXT: v_sub_nc_u16 v14, 0, v17
-; GFX12-NEXT: v_sub_nc_u16 v15, 0, v18
-; GFX12-NEXT: v_sub_nc_u16 v16, 0, v7
-; GFX12-NEXT: v_max_i16 v6, v6, v19
-; GFX12-NEXT: v_max_i16 v5, v5, v20
-; GFX12-NEXT: v_max_i16 v14, v17, v14
-; GFX12-NEXT: v_max_i16 v15, v18, v15
-; GFX12-NEXT: v_max_i16 v7, v7, v16
-; GFX12-NEXT: v_sub_nc_u16 v16, 0, v0
-; GFX12-NEXT: v_sub_nc_u16 v17, 0, v1
-; GFX12-NEXT: v_sub_nc_u16 v18, 0, v2
-; GFX12-NEXT: v_sub_nc_u16 v19, 0, v3
-; GFX12-NEXT: v_sub_nc_u16 v20, 0, v4
-; GFX12-NEXT: v_max_i16 v0, v0, v16
-; GFX12-NEXT: v_max_i16 v1, v1, v17
-; GFX12-NEXT: v_max_i16 v2, v2, v18
-; GFX12-NEXT: v_max_i16 v3, v3, v19
-; GFX12-NEXT: v_max_i16 v4, v4, v20
-; GFX12-NEXT: v_perm_b32 v0, v15, v0, 0x5040100
-; GFX12-NEXT: v_perm_b32 v1, v14, v1, 0x5040100
-; GFX12-NEXT: v_perm_b32 v2, v13, v2, 0x5040100
-; GFX12-NEXT: v_perm_b32 v3, v11, v3, 0x5040100
-; GFX12-NEXT: v_perm_b32 v4, v12, v4, 0x5040100
-; GFX12-NEXT: v_perm_b32 v5, v10, v5, 0x5040100
-; GFX12-NEXT: v_perm_b32 v6, v9, v6, 0x5040100
-; GFX12-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX12-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v9, 0, v1
+; GFX12-NEXT: v_pk_sub_i16 v10, 0, v2
+; GFX12-NEXT: v_pk_sub_i16 v11, 0, v6
+; GFX12-NEXT: v_pk_sub_i16 v12, 0, v7
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v9
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v10
+; GFX12-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX12-NEXT: v_pk_sub_i16 v9, 0, v4
+; GFX12-NEXT: v_pk_sub_i16 v10, 0, v5
+; GFX12-NEXT: v_pk_max_i16 v6, v6, v11
+; GFX12-NEXT: v_pk_max_i16 v7, v7, v12
+; GFX12-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX12-NEXT: v_pk_max_i16 v4, v4, v9
+; GFX12-NEXT: v_pk_max_i16 v5, v5, v10
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
ret <16 x i16> %res
@@ -1236,306 +999,112 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX9-LABEL: v_abs_v32i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX9-NEXT: v_sub_u16_e32 v17, 0, v16
-; GFX9-NEXT: v_max_i16_e32 v16, v16, v17
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v14
-; GFX9-NEXT: v_sub_u16_e32 v18, 0, v17
-; GFX9-NEXT: v_max_i16_e32 v17, v17, v18
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v13
-; GFX9-NEXT: v_sub_u16_e32 v19, 0, v18
-; GFX9-NEXT: v_max_i16_e32 v18, v18, v19
-; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v12
-; GFX9-NEXT: v_sub_u16_e32 v20, 0, v19
-; GFX9-NEXT: v_max_i16_e32 v19, v19, v20
-; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v11
-; GFX9-NEXT: v_sub_u16_e32 v21, 0, v20
-; GFX9-NEXT: v_max_i16_e32 v20, v20, v21
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v10
-; GFX9-NEXT: v_sub_u16_e32 v22, 0, v21
-; GFX9-NEXT: v_max_i16_e32 v21, v21, v22
-; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v9
-; GFX9-NEXT: v_sub_u16_e32 v23, 0, v22
-; GFX9-NEXT: v_max_i16_e32 v22, v22, v23
-; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX9-NEXT: v_sub_u16_e32 v24, 0, v23
-; GFX9-NEXT: v_max_i16_e32 v23, v23, v24
-; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v7
-; GFX9-NEXT: v_sub_u16_e32 v25, 0, v24
-; GFX9-NEXT: v_max_i16_e32 v24, v24, v25
-; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX9-NEXT: v_sub_u16_e32 v26, 0, v25
-; GFX9-NEXT: v_max_i16_e32 v25, v25, v26
-; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v5
-; GFX9-NEXT: v_sub_u16_e32 v27, 0, v26
-; GFX9-NEXT: v_max_i16_e32 v26, v26, v27
-; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v4
-; GFX9-NEXT: v_sub_u16_e32 v28, 0, v27
-; GFX9-NEXT: v_max_i16_e32 v27, v27, v28
-; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX9-NEXT: v_sub_u16_e32 v29, 0, v28
-; GFX9-NEXT: v_max_i16_e32 v28, v28, v29
-; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX9-NEXT: v_sub_u16_e32 v30, 0, v29
-; GFX9-NEXT: v_max_i16_e32 v29, v29, v30
-; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX9-NEXT: v_sub_u16_e32 v31, 0, v30
-; GFX9-NEXT: v_max_i16_e32 v30, v30, v31
-; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v0
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v31
-; GFX9-NEXT: v_max_i16_e32 v31, v31, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v15
-; GFX9-NEXT: v_max_i16_e32 v15, v15, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v14
-; GFX9-NEXT: v_max_i16_e32 v14, v14, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v13
-; GFX9-NEXT: v_max_i16_e32 v13, v13, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v12
-; GFX9-NEXT: v_max_i16_e32 v12, v12, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v11
-; GFX9-NEXT: v_max_i16_e32 v11, v11, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v10
-; GFX9-NEXT: v_max_i16_e32 v10, v10, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v9
-; GFX9-NEXT: v_max_i16_e32 v9, v9, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v8
-; GFX9-NEXT: v_max_i16_e32 v8, v8, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v7
-; GFX9-NEXT: v_max_i16_e32 v7, v7, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v6
-; GFX9-NEXT: v_max_i16_e32 v6, v6, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v5
-; GFX9-NEXT: v_max_i16_e32 v5, v5, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v4
-; GFX9-NEXT: v_max_i16_e32 v4, v4, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v3
-; GFX9-NEXT: v_max_i16_e32 v3, v3, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v2
-; GFX9-NEXT: v_max_i16_e32 v2, v2, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v1
-; GFX9-NEXT: v_max_i16_e32 v1, v1, v32
-; GFX9-NEXT: v_sub_u16_e32 v32, 0, v0
-; GFX9-NEXT: v_max_i16_e32 v0, v0, v32
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v31, v0, s4
-; GFX9-NEXT: v_perm_b32 v1, v30, v1, s4
-; GFX9-NEXT: v_perm_b32 v2, v29, v2, s4
-; GFX9-NEXT: v_perm_b32 v3, v28, v3, s4
-; GFX9-NEXT: v_perm_b32 v4, v27, v4, s4
-; GFX9-NEXT: v_perm_b32 v5, v26, v5, s4
-; GFX9-NEXT: v_perm_b32 v6, v25, v6, s4
-; GFX9-NEXT: v_perm_b32 v7, v24, v7, s4
-; GFX9-NEXT: v_perm_b32 v8, v23, v8, s4
-; GFX9-NEXT: v_perm_b32 v9, v22, v9, s4
-; GFX9-NEXT: v_perm_b32 v10, v21, v10, s4
-; GFX9-NEXT: v_perm_b32 v11, v20, v11, s4
-; GFX9-NEXT: v_perm_b32 v12, v19, v12, s4
-; GFX9-NEXT: v_perm_b32 v13, v18, v13, s4
-; GFX9-NEXT: v_perm_b32 v14, v17, v14, s4
-; GFX9-NEXT: v_perm_b32 v15, v16, v15, s4
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v4
+; GFX9-NEXT: v_pk_max_i16 v4, v4, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v5
+; GFX9-NEXT: v_pk_max_i16 v5, v5, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX9-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v7
+; GFX9-NEXT: v_pk_max_i16 v7, v7, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v8
+; GFX9-NEXT: v_pk_max_i16 v8, v8, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v9
+; GFX9-NEXT: v_pk_max_i16 v9, v9, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v10
+; GFX9-NEXT: v_pk_max_i16 v10, v10, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX9-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v12
+; GFX9-NEXT: v_pk_max_i16 v12, v12, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v13
+; GFX9-NEXT: v_pk_max_i16 v13, v13, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v14
+; GFX9-NEXT: v_pk_max_i16 v14, v14, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v15
+; GFX9-NEXT: v_pk_max_i16 v15, v15, v16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_abs_v32i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v14
-; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v13
-; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v11
-; GFX10-NEXT: v_sub_nc_u16 v17, 0, v16
-; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v10
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX10-NEXT: v_sub_nc_u16 v23, 0, v20
-; GFX10-NEXT: v_sub_nc_u16 v24, 0, v21
-; GFX10-NEXT: v_max_i16 v16, v16, v17
-; GFX10-NEXT: v_sub_nc_u16 v17, 0, v18
-; GFX10-NEXT: v_sub_nc_u16 v25, 0, v22
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v0
-; GFX10-NEXT: v_max_i16 v17, v18, v17
-; GFX10-NEXT: v_sub_nc_u16 v18, 0, v19
-; GFX10-NEXT: v_sub_nc_u16 v30, 0, v27
-; GFX10-NEXT: v_sub_nc_u16 v36, 0, v15
-; GFX10-NEXT: v_sub_nc_u16 v35, 0, v32
-; GFX10-NEXT: v_max_i16 v18, v19, v18
-; GFX10-NEXT: v_max_i16 v19, v20, v23
-; GFX10-NEXT: v_max_i16 v20, v21, v24
-; GFX10-NEXT: v_max_i16 v21, v22, v25
-; GFX10-NEXT: v_sub_nc_u16 v22, 0, v26
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX10-NEXT: v_max_i16 v15, v15, v36
-; GFX10-NEXT: v_max_i16 v22, v26, v22
-; GFX10-NEXT: v_sub_nc_u16 v26, 0, v23
-; GFX10-NEXT: v_sub_nc_u16 v28, 0, v24
-; GFX10-NEXT: v_sub_nc_u16 v29, 0, v25
-; GFX10-NEXT: v_sub_nc_u16 v36, 0, v10
-; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX10-NEXT: v_max_i16 v23, v23, v26
-; GFX10-NEXT: v_max_i16 v24, v24, v28
-; GFX10-NEXT: v_max_i16 v25, v25, v29
-; GFX10-NEXT: v_max_i16 v26, v27, v30
-; GFX10-NEXT: v_sub_nc_u16 v27, 0, v31
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX10-NEXT: v_max_i16 v10, v10, v36
-; GFX10-NEXT: v_max_i16 v27, v31, v27
-; GFX10-NEXT: v_sub_nc_u16 v31, 0, v28
-; GFX10-NEXT: v_sub_nc_u16 v33, 0, v29
-; GFX10-NEXT: v_sub_nc_u16 v34, 0, v30
-; GFX10-NEXT: v_sub_nc_u16 v36, 0, v5
-; GFX10-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
-; GFX10-NEXT: v_max_i16 v28, v28, v31
-; GFX10-NEXT: v_max_i16 v29, v29, v33
-; GFX10-NEXT: v_max_i16 v30, v30, v34
-; GFX10-NEXT: v_max_i16 v31, v32, v35
-; GFX10-NEXT: v_sub_nc_u16 v32, 0, v14
-; GFX10-NEXT: v_sub_nc_u16 v33, 0, v13
-; GFX10-NEXT: v_sub_nc_u16 v34, 0, v12
-; GFX10-NEXT: v_sub_nc_u16 v35, 0, v11
-; GFX10-NEXT: v_max_i16 v5, v5, v36
-; GFX10-NEXT: v_max_i16 v14, v14, v32
-; GFX10-NEXT: v_max_i16 v13, v13, v33
-; GFX10-NEXT: v_max_i16 v12, v12, v34
-; GFX10-NEXT: v_max_i16 v11, v11, v35
-; GFX10-NEXT: v_sub_nc_u16 v32, 0, v9
-; GFX10-NEXT: v_sub_nc_u16 v33, 0, v8
-; GFX10-NEXT: v_sub_nc_u16 v34, 0, v7
-; GFX10-NEXT: v_sub_nc_u16 v35, 0, v6
-; GFX10-NEXT: v_sub_nc_u16 v36, 0, v4
-; GFX10-NEXT: v_max_i16 v9, v9, v32
-; GFX10-NEXT: v_max_i16 v8, v8, v33
-; GFX10-NEXT: v_max_i16 v7, v7, v34
-; GFX10-NEXT: v_max_i16 v6, v6, v35
-; GFX10-NEXT: v_sub_nc_u16 v32, 0, v0
-; GFX10-NEXT: v_sub_nc_u16 v33, 0, v1
-; GFX10-NEXT: v_sub_nc_u16 v34, 0, v2
-; GFX10-NEXT: v_sub_nc_u16 v35, 0, v3
-; GFX10-NEXT: v_max_i16 v4, v4, v36
-; GFX10-NEXT: v_max_i16 v0, v0, v32
-; GFX10-NEXT: v_max_i16 v1, v1, v33
-; GFX10-NEXT: v_max_i16 v2, v2, v34
-; GFX10-NEXT: v_max_i16 v3, v3, v35
-; GFX10-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX10-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX10-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
-; GFX10-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
-; GFX10-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
-; GFX10-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
-; GFX10-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
-; GFX10-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
-; GFX10-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v17, 0, v2
+; GFX10-NEXT: v_pk_sub_i16 v18, 0, v3
+; GFX10-NEXT: v_pk_sub_i16 v19, 0, v4
+; GFX10-NEXT: v_pk_sub_i16 v20, 0, v5
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v17
+; GFX10-NEXT: v_pk_max_i16 v3, v3, v18
+; GFX10-NEXT: v_pk_max_i16 v4, v4, v19
+; GFX10-NEXT: v_pk_max_i16 v5, v5, v20
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX10-NEXT: v_pk_sub_i16 v17, 0, v7
+; GFX10-NEXT: v_pk_sub_i16 v18, 0, v8
+; GFX10-NEXT: v_pk_sub_i16 v19, 0, v9
+; GFX10-NEXT: v_pk_sub_i16 v20, 0, v10
+; GFX10-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX10-NEXT: v_pk_max_i16 v7, v7, v17
+; GFX10-NEXT: v_pk_max_i16 v8, v8, v18
+; GFX10-NEXT: v_pk_max_i16 v9, v9, v19
+; GFX10-NEXT: v_pk_max_i16 v10, v10, v20
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX10-NEXT: v_pk_sub_i16 v17, 0, v12
+; GFX10-NEXT: v_pk_sub_i16 v18, 0, v13
+; GFX10-NEXT: v_pk_sub_i16 v19, 0, v14
+; GFX10-NEXT: v_pk_sub_i16 v20, 0, v15
+; GFX10-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX10-NEXT: v_pk_max_i16 v12, v12, v17
+; GFX10-NEXT: v_pk_max_i16 v13, v13, v18
+; GFX10-NEXT: v_pk_max_i16 v14, v14, v19
+; GFX10-NEXT: v_pk_max_i16 v15, v15, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_abs_v32i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v14
-; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v13
-; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v11
-; GFX11-NEXT: v_sub_nc_u16 v17, 0, v16
-; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v10
-; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX11-NEXT: v_sub_nc_u16 v23, 0, v20
-; GFX11-NEXT: v_sub_nc_u16 v24, 0, v21
-; GFX11-NEXT: v_max_i16 v16, v16, v17
-; GFX11-NEXT: v_sub_nc_u16 v17, 0, v18
-; GFX11-NEXT: v_sub_nc_u16 v25, 0, v22
-; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v5
-; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v0
-; GFX11-NEXT: v_max_i16 v17, v18, v17
-; GFX11-NEXT: v_sub_nc_u16 v18, 0, v19
-; GFX11-NEXT: v_sub_nc_u16 v30, 0, v27
-; GFX11-NEXT: v_sub_nc_u16 v36, 0, v15
-; GFX11-NEXT: v_sub_nc_u16 v35, 0, v32
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_max_i16 v18, v19, v18
-; GFX11-NEXT: v_max_i16 v19, v20, v23
-; GFX11-NEXT: v_max_i16 v20, v21, v24
-; GFX11-NEXT: v_max_i16 v21, v22, v25
-; GFX11-NEXT: v_sub_nc_u16 v22, 0, v26
-; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v7
-; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX11-NEXT: v_max_i16 v15, v15, v36
-; GFX11-NEXT: v_max_i16 v22, v26, v22
-; GFX11-NEXT: v_sub_nc_u16 v26, 0, v23
-; GFX11-NEXT: v_sub_nc_u16 v28, 0, v24
-; GFX11-NEXT: v_sub_nc_u16 v29, 0, v25
-; GFX11-NEXT: v_sub_nc_u16 v36, 0, v10
-; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX11-NEXT: v_max_i16 v23, v23, v26
-; GFX11-NEXT: v_max_i16 v24, v24, v28
-; GFX11-NEXT: v_max_i16 v25, v25, v29
-; GFX11-NEXT: v_max_i16 v26, v27, v30
-; GFX11-NEXT: v_sub_nc_u16 v27, 0, v31
-; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX11-NEXT: v_max_i16 v10, v10, v36
-; GFX11-NEXT: v_max_i16 v27, v31, v27
-; GFX11-NEXT: v_sub_nc_u16 v31, 0, v28
-; GFX11-NEXT: v_sub_nc_u16 v33, 0, v29
-; GFX11-NEXT: v_sub_nc_u16 v34, 0, v30
-; GFX11-NEXT: v_sub_nc_u16 v36, 0, v5
-; GFX11-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
-; GFX11-NEXT: v_max_i16 v28, v28, v31
-; GFX11-NEXT: v_max_i16 v29, v29, v33
-; GFX11-NEXT: v_max_i16 v30, v30, v34
-; GFX11-NEXT: v_max_i16 v31, v32, v35
-; GFX11-NEXT: v_sub_nc_u16 v32, 0, v14
-; GFX11-NEXT: v_sub_nc_u16 v33, 0, v13
-; GFX11-NEXT: v_sub_nc_u16 v34, 0, v12
-; GFX11-NEXT: v_sub_nc_u16 v35, 0, v11
-; GFX11-NEXT: v_max_i16 v5, v5, v36
-; GFX11-NEXT: v_max_i16 v14, v14, v32
-; GFX11-NEXT: v_max_i16 v13, v13, v33
-; GFX11-NEXT: v_max_i16 v12, v12, v34
-; GFX11-NEXT: v_max_i16 v11, v11, v35
-; GFX11-NEXT: v_sub_nc_u16 v32, 0, v9
-; GFX11-NEXT: v_sub_nc_u16 v33, 0, v8
-; GFX11-NEXT: v_sub_nc_u16 v34, 0, v7
-; GFX11-NEXT: v_sub_nc_u16 v35, 0, v6
-; GFX11-NEXT: v_sub_nc_u16 v36, 0, v4
-; GFX11-NEXT: v_max_i16 v9, v9, v32
-; GFX11-NEXT: v_max_i16 v8, v8, v33
-; GFX11-NEXT: v_max_i16 v7, v7, v34
-; GFX11-NEXT: v_max_i16 v6, v6, v35
-; GFX11-NEXT: v_sub_nc_u16 v32, 0, v0
-; GFX11-NEXT: v_sub_nc_u16 v33, 0, v1
-; GFX11-NEXT: v_sub_nc_u16 v34, 0, v2
-; GFX11-NEXT: v_sub_nc_u16 v35, 0, v3
-; GFX11-NEXT: v_max_i16 v4, v4, v36
-; GFX11-NEXT: v_max_i16 v0, v0, v32
-; GFX11-NEXT: v_max_i16 v1, v1, v33
-; GFX11-NEXT: v_max_i16 v2, v2, v34
-; GFX11-NEXT: v_max_i16 v3, v3, v35
-; GFX11-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
-; GFX11-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
-; GFX11-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
-; GFX11-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
-; GFX11-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
-; GFX11-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v17, 0, v2
+; GFX11-NEXT: v_pk_sub_i16 v18, 0, v3
+; GFX11-NEXT: v_pk_sub_i16 v19, 0, v4
+; GFX11-NEXT: v_pk_sub_i16 v20, 0, v5
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v17
+; GFX11-NEXT: v_pk_max_i16 v3, v3, v18
+; GFX11-NEXT: v_pk_max_i16 v4, v4, v19
+; GFX11-NEXT: v_pk_max_i16 v5, v5, v20
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX11-NEXT: v_pk_sub_i16 v17, 0, v7
+; GFX11-NEXT: v_pk_sub_i16 v18, 0, v8
+; GFX11-NEXT: v_pk_sub_i16 v19, 0, v9
+; GFX11-NEXT: v_pk_sub_i16 v20, 0, v10
+; GFX11-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX11-NEXT: v_pk_max_i16 v7, v7, v17
+; GFX11-NEXT: v_pk_max_i16 v8, v8, v18
+; GFX11-NEXT: v_pk_max_i16 v9, v9, v19
+; GFX11-NEXT: v_pk_max_i16 v10, v10, v20
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX11-NEXT: v_pk_sub_i16 v17, 0, v12
+; GFX11-NEXT: v_pk_sub_i16 v18, 0, v13
+; GFX11-NEXT: v_pk_sub_i16 v19, 0, v14
+; GFX11-NEXT: v_pk_sub_i16 v20, 0, v15
+; GFX11-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX11-NEXT: v_pk_max_i16 v12, v12, v17
+; GFX11-NEXT: v_pk_max_i16 v13, v13, v18
+; GFX11-NEXT: v_pk_max_i16 v14, v14, v19
+; GFX11-NEXT: v_pk_max_i16 v15, v15, v20
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_abs_v32i16:
@@ -1545,103 +1114,38 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v16, 16, v15
-; GFX12-NEXT: v_lshrrev_b32_e32 v18, 16, v14
-; GFX12-NEXT: v_lshrrev_b32_e32 v19, 16, v13
-; GFX12-NEXT: v_lshrrev_b32_e32 v20, 16, v12
-; GFX12-NEXT: v_lshrrev_b32_e32 v21, 16, v11
-; GFX12-NEXT: v_sub_nc_u16 v17, 0, v16
-; GFX12-NEXT: v_lshrrev_b32_e32 v22, 16, v10
-; GFX12-NEXT: v_lshrrev_b32_e32 v26, 16, v9
-; GFX12-NEXT: v_sub_nc_u16 v23, 0, v20
-; GFX12-NEXT: v_sub_nc_u16 v24, 0, v21
-; GFX12-NEXT: v_max_i16 v16, v16, v17
-; GFX12-NEXT: v_sub_nc_u16 v17, 0, v18
-; GFX12-NEXT: v_sub_nc_u16 v25, 0, v22
-; GFX12-NEXT: v_lshrrev_b32_e32 v27, 16, v5
-; GFX12-NEXT: v_lshrrev_b32_e32 v31, 16, v4
-; GFX12-NEXT: v_lshrrev_b32_e32 v32, 16, v0
-; GFX12-NEXT: v_max_i16 v17, v18, v17
-; GFX12-NEXT: v_sub_nc_u16 v18, 0, v19
-; GFX12-NEXT: v_sub_nc_u16 v30, 0, v27
-; GFX12-NEXT: v_sub_nc_u16 v36, 0, v15
-; GFX12-NEXT: v_sub_nc_u16 v35, 0, v32
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_max_i16 v18, v19, v18
-; GFX12-NEXT: v_max_i16 v19, v20, v23
-; GFX12-NEXT: v_max_i16 v20, v21, v24
-; GFX12-NEXT: v_max_i16 v21, v22, v25
-; GFX12-NEXT: v_sub_nc_u16 v22, 0, v26
-; GFX12-NEXT: v_lshrrev_b32_e32 v23, 16, v8
-; GFX12-NEXT: v_lshrrev_b32_e32 v24, 16, v7
-; GFX12-NEXT: v_lshrrev_b32_e32 v25, 16, v6
-; GFX12-NEXT: v_max_i16 v15, v15, v36
-; GFX12-NEXT: v_max_i16 v22, v26, v22
-; GFX12-NEXT: v_sub_nc_u16 v26, 0, v23
-; GFX12-NEXT: v_sub_nc_u16 v28, 0, v24
-; GFX12-NEXT: v_sub_nc_u16 v29, 0, v25
-; GFX12-NEXT: v_sub_nc_u16 v36, 0, v10
-; GFX12-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
-; GFX12-NEXT: v_max_i16 v23, v23, v26
-; GFX12-NEXT: v_max_i16 v24, v24, v28
-; GFX12-NEXT: v_max_i16 v25, v25, v29
-; GFX12-NEXT: v_max_i16 v26, v27, v30
-; GFX12-NEXT: v_sub_nc_u16 v27, 0, v31
-; GFX12-NEXT: v_lshrrev_b32_e32 v28, 16, v3
-; GFX12-NEXT: v_lshrrev_b32_e32 v29, 16, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v30, 16, v1
-; GFX12-NEXT: v_max_i16 v10, v10, v36
-; GFX12-NEXT: v_max_i16 v27, v31, v27
-; GFX12-NEXT: v_sub_nc_u16 v31, 0, v28
-; GFX12-NEXT: v_sub_nc_u16 v33, 0, v29
-; GFX12-NEXT: v_sub_nc_u16 v34, 0, v30
-; GFX12-NEXT: v_sub_nc_u16 v36, 0, v5
-; GFX12-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
-; GFX12-NEXT: v_max_i16 v28, v28, v31
-; GFX12-NEXT: v_max_i16 v29, v29, v33
-; GFX12-NEXT: v_max_i16 v30, v30, v34
-; GFX12-NEXT: v_max_i16 v31, v32, v35
-; GFX12-NEXT: v_sub_nc_u16 v32, 0, v14
-; GFX12-NEXT: v_sub_nc_u16 v33, 0, v13
-; GFX12-NEXT: v_sub_nc_u16 v34, 0, v12
-; GFX12-NEXT: v_sub_nc_u16 v35, 0, v11
-; GFX12-NEXT: v_max_i16 v5, v5, v36
-; GFX12-NEXT: v_max_i16 v14, v14, v32
-; GFX12-NEXT: v_max_i16 v13, v13, v33
-; GFX12-NEXT: v_max_i16 v12, v12, v34
-; GFX12-NEXT: v_max_i16 v11, v11, v35
-; GFX12-NEXT: v_sub_nc_u16 v32, 0, v9
-; GFX12-NEXT: v_sub_nc_u16 v33, 0, v8
-; GFX12-NEXT: v_sub_nc_u16 v34, 0, v7
-; GFX12-NEXT: v_sub_nc_u16 v35, 0, v6
-; GFX12-NEXT: v_sub_nc_u16 v36, 0, v4
-; GFX12-NEXT: v_max_i16 v9, v9, v32
-; GFX12-NEXT: v_max_i16 v8, v8, v33
-; GFX12-NEXT: v_max_i16 v7, v7, v34
-; GFX12-NEXT: v_max_i16 v6, v6, v35
-; GFX12-NEXT: v_sub_nc_u16 v32, 0, v0
-; GFX12-NEXT: v_sub_nc_u16 v33, 0, v1
-; GFX12-NEXT: v_sub_nc_u16 v34, 0, v2
-; GFX12-NEXT: v_sub_nc_u16 v35, 0, v3
-; GFX12-NEXT: v_max_i16 v4, v4, v36
-; GFX12-NEXT: v_max_i16 v0, v0, v32
-; GFX12-NEXT: v_max_i16 v1, v1, v33
-; GFX12-NEXT: v_max_i16 v2, v2, v34
-; GFX12-NEXT: v_max_i16 v3, v3, v35
-; GFX12-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
-; GFX12-NEXT: v_perm_b32 v0, v31, v0, 0x5040100
-; GFX12-NEXT: v_perm_b32 v1, v30, v1, 0x5040100
-; GFX12-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
-; GFX12-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
-; GFX12-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
-; GFX12-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
-; GFX12-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
-; GFX12-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
-; GFX12-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
-; GFX12-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
-; GFX12-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
-; GFX12-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
-; GFX12-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v17, 0, v2
+; GFX12-NEXT: v_pk_sub_i16 v18, 0, v3
+; GFX12-NEXT: v_pk_sub_i16 v19, 0, v4
+; GFX12-NEXT: v_pk_sub_i16 v20, 0, v5
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v17
+; GFX12-NEXT: v_pk_max_i16 v3, v3, v18
+; GFX12-NEXT: v_pk_max_i16 v4, v4, v19
+; GFX12-NEXT: v_pk_max_i16 v5, v5, v20
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX12-NEXT: v_pk_sub_i16 v17, 0, v7
+; GFX12-NEXT: v_pk_sub_i16 v18, 0, v8
+; GFX12-NEXT: v_pk_sub_i16 v19, 0, v9
+; GFX12-NEXT: v_pk_sub_i16 v20, 0, v10
+; GFX12-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX12-NEXT: v_pk_max_i16 v7, v7, v17
+; GFX12-NEXT: v_pk_max_i16 v8, v8, v18
+; GFX12-NEXT: v_pk_max_i16 v9, v9, v19
+; GFX12-NEXT: v_pk_max_i16 v10, v10, v20
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX12-NEXT: v_pk_sub_i16 v17, 0, v12
+; GFX12-NEXT: v_pk_sub_i16 v18, 0, v13
+; GFX12-NEXT: v_pk_sub_i16 v19, 0, v14
+; GFX12-NEXT: v_pk_sub_i16 v20, 0, v15
+; GFX12-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX12-NEXT: v_pk_max_i16 v12, v12, v17
+; GFX12-NEXT: v_pk_max_i16 v13, v13, v18
+; GFX12-NEXT: v_pk_max_i16 v14, v14, v19
+; GFX12-NEXT: v_pk_max_i16 v15, v15, v20
; GFX12-NEXT: s_setpc_b64 s[30:31]
%res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false)
ret <32 x i16> %res
>From af0ab226bc5df8dff71f4a731263af5aaa60825d Mon Sep 17 00:00:00 2001
From: Tim Gymnich <tgymnich at icloud.com>
Date: Thu, 13 Jun 2024 16:23:40 +0200
Subject: [PATCH 4/4] clang format
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c030a9117340a..3f8b1e4217588 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -791,8 +791,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
- ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
- ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
+ ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
ISD::SSUBSAT},
VT, Custom);
More information about the llvm-commits mailing list