[llvm] 7091dd2 - [AMDGPU] Fix lowering of abs for i16 vectors with more than 2 elements (#95413)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 14 13:53:33 PDT 2024
Author: Tim Gymnich
Date: 2024-06-14T22:53:30+02:00
New Revision: 7091dd277a1e2349f33390be9f3ccf21bff39003
URL: https://github.com/llvm/llvm-project/commit/7091dd277a1e2349f33390be9f3ccf21bff39003
DIFF: https://github.com/llvm/llvm-project/commit/7091dd277a1e2349f33390be9f3ccf21bff39003.diff
LOG: [AMDGPU] Fix lowering of abs for i16 vectors with more than 2 elements (#95413)
fixes #94606
Expansion of `ABS` for `i16` vectors with more than 2 elements is
currently falling back to scalarization of the vector.
This PR adds a custom lowering for `ABS` on `i16` vectors that splits
the vector into multiple `<2 x i16>` vectors.
Added:
llvm/test/CodeGen/AMDGPU/abs_i16.ll
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 81098201e9c0f..3f8b1e4217588 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -791,8 +791,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
- ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
- ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
+ ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN,
+ ISD::UMAX, ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
ISD::SSUBSAT},
VT, Custom);
@@ -5804,6 +5804,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerTRAP(Op, DAG);
case ISD::DEBUGTRAP:
return lowerDEBUGTRAP(Op, DAG);
+ case ISD::ABS:
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
new file mode 100644
index 0000000000000..c7d0df6d0bf46
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -0,0 +1,1521 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define i16 @abs_i16(i16 %arg) {
+; GFX6-LABEL: abs_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: abs_i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: abs_i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: abs_i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sub_u16_e32 v1, 0, v0
+; GFX9-NEXT: v_max_i16_e32 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: abs_i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX10-NEXT: v_max_i16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: abs_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_max_i16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: abs_i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_max_i16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
+ ret i16 %res
+}
+
+define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
+; GFX6-LABEL: v_abs_v2i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v2i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v2i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v2, 0, v1
+; GFX8-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v2i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v1, 0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
+ ret <2 x i16> %res
+}
+
+define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
+; GFX6-LABEL: v_abs_v3i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v3i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v3i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v3, 0, v2
+; GFX8-NEXT: v_max_i16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v3, 0, v1
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v3
+; GFX8-NEXT: v_sub_u16_e32 v3, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v3i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v3i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
+ ret <3 x i16> %res
+}
+
+define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
+; GFX6-LABEL: v_abs_v4i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v4i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v4
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v4i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v3, 0, v2
+; GFX8-NEXT: v_max_i16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v4, 0, v3
+; GFX8-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v4, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v5, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v5
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v3
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX9-NEXT: v_pk_sub_i16 v2, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v4i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v4i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v2, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v2
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %arg, i1 false)
+ ret <4 x i16> %res
+}
+
+define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) {
+; GFX6-LABEL: v_abs_v6i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v6
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v3
+; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v4, v1, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v6i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v6
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v3
+; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v1, v3
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v6i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v4, 0, v3
+; GFX8-NEXT: v_max_i16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v5, 0, v4
+; GFX8-NEXT: v_max_i16_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v6, 0, v5
+; GFX8-NEXT: v_max_i16_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v6, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v7, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v8
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v7
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v6
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v5
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v6i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v3, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v3
+; GFX9-NEXT: v_pk_sub_i16 v3, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v3
+; GFX9-NEXT: v_pk_sub_i16 v3, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v3
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v6i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v3, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v4, 0, v1
+; GFX10-NEXT: v_pk_sub_i16 v5, 0, v2
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v3
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v4
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v6i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v3, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v4, 0, v1
+; GFX11-NEXT: v_pk_sub_i16 v5, 0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v3
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v6i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v3, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v4, 0, v1
+; GFX12-NEXT: v_pk_sub_i16 v5, 0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v3
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <6 x i16> @llvm.abs.v6i16(<6 x i16> %arg, i1 false)
+ ret <6 x i16> %res
+}
+
+define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
+; GFX6-LABEL: v_abs_v8i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v7, v8
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v8i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v7, v8
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v8i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX8-NEXT: v_sub_u16_e32 v5, 0, v4
+; GFX8-NEXT: v_max_i16_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v6, 0, v5
+; GFX8-NEXT: v_max_i16_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v7, 0, v6
+; GFX8-NEXT: v_max_i16_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v7
+; GFX8-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v8, 0, v3
+; GFX8-NEXT: v_sub_u16_e32 v9, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v10, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v11, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v11
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v10
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v9
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v8
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v7
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v6
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v8i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v4
+; GFX9-NEXT: v_pk_sub_i16 v4, 0, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v8i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v5, 0, v1
+; GFX10-NEXT: v_pk_sub_i16 v6, 0, v2
+; GFX10-NEXT: v_pk_sub_i16 v7, 0, v3
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v5
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v6
+; GFX10-NEXT: v_pk_max_i16 v3, v3, v7
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v8i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v5, 0, v1
+; GFX11-NEXT: v_pk_sub_i16 v6, 0, v2
+; GFX11-NEXT: v_pk_sub_i16 v7, 0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v6
+; GFX11-NEXT: v_pk_max_i16 v3, v3, v7
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v8i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v4, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v5, 0, v1
+; GFX12-NEXT: v_pk_sub_i16 v6, 0, v2
+; GFX12-NEXT: v_pk_sub_i16 v7, 0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v4
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v6
+; GFX12-NEXT: v_pk_max_i16 v3, v3, v7
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %arg, i1 false)
+ ret <8 x i16> %res
+}
+
+
+define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
+; GFX6-LABEL: v_abs_v16i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v15, v16
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v16i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v15, v16
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v16i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX8-NEXT: v_sub_u16_e32 v9, 0, v8
+; GFX8-NEXT: v_max_i16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v6
+; GFX8-NEXT: v_sub_u16_e32 v10, 0, v9
+; GFX8-NEXT: v_max_i16_sdwa v9, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: v_sub_u16_e32 v11, 0, v10
+; GFX8-NEXT: v_max_i16_sdwa v10, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v4
+; GFX8-NEXT: v_sub_u16_e32 v12, 0, v11
+; GFX8-NEXT: v_max_i16_sdwa v11, v11, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GFX8-NEXT: v_sub_u16_e32 v13, 0, v12
+; GFX8-NEXT: v_max_i16_sdwa v12, v12, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v14, 0, v13
+; GFX8-NEXT: v_max_i16_sdwa v13, v13, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v15, 0, v14
+; GFX8-NEXT: v_max_i16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v16, 0, v15
+; GFX8-NEXT: v_max_i16_sdwa v15, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v16, 0, v7
+; GFX8-NEXT: v_sub_u16_e32 v17, 0, v6
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v5
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v4
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v3
+; GFX8-NEXT: v_sub_u16_e32 v21, 0, v2
+; GFX8-NEXT: v_sub_u16_e32 v22, 0, v1
+; GFX8-NEXT: v_sub_u16_e32 v23, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v23
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v22
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v21
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v20
+; GFX8-NEXT: v_max_i16_e32 v4, v4, v19
+; GFX8-NEXT: v_max_i16_e32 v5, v5, v18
+; GFX8-NEXT: v_max_i16_e32 v6, v6, v17
+; GFX8-NEXT: v_max_i16_e32 v7, v7, v16
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v15
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v14
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v13
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v12
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v11
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v10
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v16i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v4
+; GFX9-NEXT: v_pk_max_i16 v4, v4, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v5
+; GFX9-NEXT: v_pk_max_i16 v5, v5, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v6
+; GFX9-NEXT: v_pk_max_i16 v6, v6, v8
+; GFX9-NEXT: v_pk_sub_i16 v8, 0, v7
+; GFX9-NEXT: v_pk_max_i16 v7, v7, v8
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v16i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v9, 0, v1
+; GFX10-NEXT: v_pk_sub_i16 v10, 0, v2
+; GFX10-NEXT: v_pk_sub_i16 v11, 0, v6
+; GFX10-NEXT: v_pk_sub_i16 v12, 0, v7
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v9
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v10
+; GFX10-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX10-NEXT: v_pk_sub_i16 v9, 0, v4
+; GFX10-NEXT: v_pk_sub_i16 v10, 0, v5
+; GFX10-NEXT: v_pk_max_i16 v6, v6, v11
+; GFX10-NEXT: v_pk_max_i16 v7, v7, v12
+; GFX10-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX10-NEXT: v_pk_max_i16 v4, v4, v9
+; GFX10-NEXT: v_pk_max_i16 v5, v5, v10
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v16i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v9, 0, v1
+; GFX11-NEXT: v_pk_sub_i16 v10, 0, v2
+; GFX11-NEXT: v_pk_sub_i16 v11, 0, v6
+; GFX11-NEXT: v_pk_sub_i16 v12, 0, v7
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v9
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v10
+; GFX11-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX11-NEXT: v_pk_sub_i16 v9, 0, v4
+; GFX11-NEXT: v_pk_sub_i16 v10, 0, v5
+; GFX11-NEXT: v_pk_max_i16 v6, v6, v11
+; GFX11-NEXT: v_pk_max_i16 v7, v7, v12
+; GFX11-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX11-NEXT: v_pk_max_i16 v4, v4, v9
+; GFX11-NEXT: v_pk_max_i16 v5, v5, v10
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v16i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v8, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v9, 0, v1
+; GFX12-NEXT: v_pk_sub_i16 v10, 0, v2
+; GFX12-NEXT: v_pk_sub_i16 v11, 0, v6
+; GFX12-NEXT: v_pk_sub_i16 v12, 0, v7
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v8
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v9
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v10
+; GFX12-NEXT: v_pk_sub_i16 v8, 0, v3
+; GFX12-NEXT: v_pk_sub_i16 v9, 0, v4
+; GFX12-NEXT: v_pk_sub_i16 v10, 0, v5
+; GFX12-NEXT: v_pk_max_i16 v6, v6, v11
+; GFX12-NEXT: v_pk_max_i16 v7, v7, v12
+; GFX12-NEXT: v_pk_max_i16 v3, v3, v8
+; GFX12-NEXT: v_pk_max_i16 v4, v4, v9
+; GFX12-NEXT: v_pk_max_i16 v5, v5, v10
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
+; GFX6-LABEL: v_abs_v32i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
+; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
+; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v30, v30, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
+; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v26, v26, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
+; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v27, v27, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v24, v24, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
+; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v25, v25, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
+; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v22, v22, v31
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v23, v31
+; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX6-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
+; GFX6-NEXT: v_max_i32_e32 v20, v20, v29
+; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16
+; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX6-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v23, v25
+; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX6-NEXT: v_or_b32_e32 v30, v30, v23
+; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
+; GFX6-NEXT: v_max_i32_e32 v21, v21, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
+; GFX6-NEXT: v_max_i32_e32 v18, v18, v21
+; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
+; GFX6-NEXT: v_max_i32_e32 v19, v19, v21
+; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v19
+; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
+; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
+; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v14, v17
+; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v15, v17
+; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX6-NEXT: v_alignbit_b32 v17, v18, v16, 16
+; GFX6-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GFX6-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GFX6-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX6-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX6-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_abs_v32i16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
+; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
+; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v30, v30, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
+; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v26, v26, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
+; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v27, v27, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v24, v24, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
+; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v25, v25, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
+; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v22, v22, v31
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v23, v31
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
+; GFX7-NEXT: v_max_i32_e32 v20, v20, v29
+; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16
+; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16
+; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
+; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v23, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v30, v30, v23
+; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
+; GFX7-NEXT: v_max_i32_e32 v21, v21, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
+; GFX7-NEXT: v_max_i32_e32 v18, v18, v21
+; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
+; GFX7-NEXT: v_max_i32_e32 v19, v19, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v18, v18, v19
+; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
+; GFX7-NEXT: v_max_i32_e32 v16, v16, v19
+; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
+; GFX7-NEXT: v_max_i32_e32 v17, v17, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v14, v17
+; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v15, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
+; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
+; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
+; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
+; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
+; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
+; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
+; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
+; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
+; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
+; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
+; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v18, v16, 16
+; GFX7-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GFX7-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GFX7-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_abs_v32i16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v15
+; GFX8-NEXT: v_sub_u16_e32 v17, 0, v16
+; GFX8-NEXT: v_max_i16_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX8-NEXT: v_sub_u16_e32 v18, 0, v17
+; GFX8-NEXT: v_max_i16_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v18
+; GFX8-NEXT: v_max_i16_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v12
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v19
+; GFX8-NEXT: v_max_i16_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v20, 16, v11
+; GFX8-NEXT: v_sub_u16_e32 v21, 0, v20
+; GFX8-NEXT: v_max_i16_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v10
+; GFX8-NEXT: v_sub_u16_e32 v22, 0, v21
+; GFX8-NEXT: v_max_i16_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v22, 16, v9
+; GFX8-NEXT: v_sub_u16_e32 v23, 0, v22
+; GFX8-NEXT: v_max_i16_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v8
+; GFX8-NEXT: v_sub_u16_e32 v24, 0, v23
+; GFX8-NEXT: v_max_i16_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v7
+; GFX8-NEXT: v_sub_u16_e32 v25, 0, v24
+; GFX8-NEXT: v_max_i16_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v6
+; GFX8-NEXT: v_sub_u16_e32 v26, 0, v25
+; GFX8-NEXT: v_max_i16_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v5
+; GFX8-NEXT: v_sub_u16_e32 v27, 0, v26
+; GFX8-NEXT: v_max_i16_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v4
+; GFX8-NEXT: v_sub_u16_e32 v28, 0, v27
+; GFX8-NEXT: v_max_i16_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v3
+; GFX8-NEXT: v_sub_u16_e32 v29, 0, v28
+; GFX8-NEXT: v_max_i16_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v2
+; GFX8-NEXT: v_sub_u16_e32 v30, 0, v29
+; GFX8-NEXT: v_max_i16_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GFX8-NEXT: v_sub_u16_e32 v31, 0, v30
+; GFX8-NEXT: v_max_i16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v0
+; GFX8-NEXT: v_sub_u16_e32 v32, 0, v31
+; GFX8-NEXT: v_max_i16_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_sub_u16_e32 v32, 0, v0
+; GFX8-NEXT: v_max_i16_e32 v0, v0, v32
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v31
+; GFX8-NEXT: v_sub_u16_e32 v31, 0, v1
+; GFX8-NEXT: v_max_i16_e32 v1, v1, v31
+; GFX8-NEXT: v_or_b32_e32 v1, v1, v30
+; GFX8-NEXT: v_sub_u16_e32 v30, 0, v2
+; GFX8-NEXT: v_max_i16_e32 v2, v2, v30
+; GFX8-NEXT: v_or_b32_e32 v2, v2, v29
+; GFX8-NEXT: v_sub_u16_e32 v29, 0, v3
+; GFX8-NEXT: v_max_i16_e32 v3, v3, v29
+; GFX8-NEXT: v_or_b32_e32 v3, v3, v28
+; GFX8-NEXT: v_sub_u16_e32 v28, 0, v4
+; GFX8-NEXT: v_max_i16_e32 v4, v4, v28
+; GFX8-NEXT: v_or_b32_e32 v4, v4, v27
+; GFX8-NEXT: v_sub_u16_e32 v27, 0, v5
+; GFX8-NEXT: v_max_i16_e32 v5, v5, v27
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v26
+; GFX8-NEXT: v_sub_u16_e32 v26, 0, v6
+; GFX8-NEXT: v_max_i16_e32 v6, v6, v26
+; GFX8-NEXT: v_or_b32_e32 v6, v6, v25
+; GFX8-NEXT: v_sub_u16_e32 v25, 0, v7
+; GFX8-NEXT: v_max_i16_e32 v7, v7, v25
+; GFX8-NEXT: v_or_b32_e32 v7, v7, v24
+; GFX8-NEXT: v_sub_u16_e32 v24, 0, v8
+; GFX8-NEXT: v_max_i16_e32 v8, v8, v24
+; GFX8-NEXT: v_or_b32_e32 v8, v8, v23
+; GFX8-NEXT: v_sub_u16_e32 v23, 0, v9
+; GFX8-NEXT: v_max_i16_e32 v9, v9, v23
+; GFX8-NEXT: v_or_b32_e32 v9, v9, v22
+; GFX8-NEXT: v_sub_u16_e32 v22, 0, v10
+; GFX8-NEXT: v_max_i16_e32 v10, v10, v22
+; GFX8-NEXT: v_or_b32_e32 v10, v10, v21
+; GFX8-NEXT: v_sub_u16_e32 v21, 0, v11
+; GFX8-NEXT: v_max_i16_e32 v11, v11, v21
+; GFX8-NEXT: v_or_b32_e32 v11, v11, v20
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v12
+; GFX8-NEXT: v_max_i16_e32 v12, v12, v20
+; GFX8-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v13
+; GFX8-NEXT: v_sub_u16_e32 v20, 0, v15
+; GFX8-NEXT: v_max_i16_e32 v13, v13, v19
+; GFX8-NEXT: v_sub_u16_e32 v19, 0, v14
+; GFX8-NEXT: v_max_i16_e32 v14, v14, v19
+; GFX8-NEXT: v_max_i16_e32 v15, v15, v20
+; GFX8-NEXT: v_or_b32_e32 v13, v13, v18
+; GFX8-NEXT: v_or_b32_e32 v14, v14, v17
+; GFX8-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_abs_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX9-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX9-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v2
+; GFX9-NEXT: v_pk_max_i16 v2, v2, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v3
+; GFX9-NEXT: v_pk_max_i16 v3, v3, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v4
+; GFX9-NEXT: v_pk_max_i16 v4, v4, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v5
+; GFX9-NEXT: v_pk_max_i16 v5, v5, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX9-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v7
+; GFX9-NEXT: v_pk_max_i16 v7, v7, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v8
+; GFX9-NEXT: v_pk_max_i16 v8, v8, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v9
+; GFX9-NEXT: v_pk_max_i16 v9, v9, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v10
+; GFX9-NEXT: v_pk_max_i16 v10, v10, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX9-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v12
+; GFX9-NEXT: v_pk_max_i16 v12, v12, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v13
+; GFX9-NEXT: v_pk_max_i16 v13, v13, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v14
+; GFX9-NEXT: v_pk_max_i16 v14, v14, v16
+; GFX9-NEXT: v_pk_sub_i16 v16, 0, v15
+; GFX9-NEXT: v_pk_max_i16 v15, v15, v16
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_abs_v32i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX10-NEXT: v_pk_sub_i16 v17, 0, v2
+; GFX10-NEXT: v_pk_sub_i16 v18, 0, v3
+; GFX10-NEXT: v_pk_sub_i16 v19, 0, v4
+; GFX10-NEXT: v_pk_sub_i16 v20, 0, v5
+; GFX10-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX10-NEXT: v_pk_max_i16 v2, v2, v17
+; GFX10-NEXT: v_pk_max_i16 v3, v3, v18
+; GFX10-NEXT: v_pk_max_i16 v4, v4, v19
+; GFX10-NEXT: v_pk_max_i16 v5, v5, v20
+; GFX10-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX10-NEXT: v_pk_sub_i16 v17, 0, v7
+; GFX10-NEXT: v_pk_sub_i16 v18, 0, v8
+; GFX10-NEXT: v_pk_sub_i16 v19, 0, v9
+; GFX10-NEXT: v_pk_sub_i16 v20, 0, v10
+; GFX10-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX10-NEXT: v_pk_max_i16 v7, v7, v17
+; GFX10-NEXT: v_pk_max_i16 v8, v8, v18
+; GFX10-NEXT: v_pk_max_i16 v9, v9, v19
+; GFX10-NEXT: v_pk_max_i16 v10, v10, v20
+; GFX10-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX10-NEXT: v_pk_sub_i16 v17, 0, v12
+; GFX10-NEXT: v_pk_sub_i16 v18, 0, v13
+; GFX10-NEXT: v_pk_sub_i16 v19, 0, v14
+; GFX10-NEXT: v_pk_sub_i16 v20, 0, v15
+; GFX10-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX10-NEXT: v_pk_max_i16 v12, v12, v17
+; GFX10-NEXT: v_pk_max_i16 v13, v13, v18
+; GFX10-NEXT: v_pk_max_i16 v14, v14, v19
+; GFX10-NEXT: v_pk_max_i16 v15, v15, v20
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_abs_v32i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX11-NEXT: v_pk_sub_i16 v17, 0, v2
+; GFX11-NEXT: v_pk_sub_i16 v18, 0, v3
+; GFX11-NEXT: v_pk_sub_i16 v19, 0, v4
+; GFX11-NEXT: v_pk_sub_i16 v20, 0, v5
+; GFX11-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX11-NEXT: v_pk_max_i16 v2, v2, v17
+; GFX11-NEXT: v_pk_max_i16 v3, v3, v18
+; GFX11-NEXT: v_pk_max_i16 v4, v4, v19
+; GFX11-NEXT: v_pk_max_i16 v5, v5, v20
+; GFX11-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX11-NEXT: v_pk_sub_i16 v17, 0, v7
+; GFX11-NEXT: v_pk_sub_i16 v18, 0, v8
+; GFX11-NEXT: v_pk_sub_i16 v19, 0, v9
+; GFX11-NEXT: v_pk_sub_i16 v20, 0, v10
+; GFX11-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX11-NEXT: v_pk_max_i16 v7, v7, v17
+; GFX11-NEXT: v_pk_max_i16 v8, v8, v18
+; GFX11-NEXT: v_pk_max_i16 v9, v9, v19
+; GFX11-NEXT: v_pk_max_i16 v10, v10, v20
+; GFX11-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX11-NEXT: v_pk_sub_i16 v17, 0, v12
+; GFX11-NEXT: v_pk_sub_i16 v18, 0, v13
+; GFX11-NEXT: v_pk_sub_i16 v19, 0, v14
+; GFX11-NEXT: v_pk_sub_i16 v20, 0, v15
+; GFX11-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX11-NEXT: v_pk_max_i16 v12, v12, v17
+; GFX11-NEXT: v_pk_max_i16 v13, v13, v18
+; GFX11-NEXT: v_pk_max_i16 v14, v14, v19
+; GFX11-NEXT: v_pk_max_i16 v15, v15, v20
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_abs_v32i16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v0
+; GFX12-NEXT: v_pk_sub_i16 v17, 0, v2
+; GFX12-NEXT: v_pk_sub_i16 v18, 0, v3
+; GFX12-NEXT: v_pk_sub_i16 v19, 0, v4
+; GFX12-NEXT: v_pk_sub_i16 v20, 0, v5
+; GFX12-NEXT: v_pk_max_i16 v0, v0, v16
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v1
+; GFX12-NEXT: v_pk_max_i16 v2, v2, v17
+; GFX12-NEXT: v_pk_max_i16 v3, v3, v18
+; GFX12-NEXT: v_pk_max_i16 v4, v4, v19
+; GFX12-NEXT: v_pk_max_i16 v5, v5, v20
+; GFX12-NEXT: v_pk_max_i16 v1, v1, v16
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v6
+; GFX12-NEXT: v_pk_sub_i16 v17, 0, v7
+; GFX12-NEXT: v_pk_sub_i16 v18, 0, v8
+; GFX12-NEXT: v_pk_sub_i16 v19, 0, v9
+; GFX12-NEXT: v_pk_sub_i16 v20, 0, v10
+; GFX12-NEXT: v_pk_max_i16 v6, v6, v16
+; GFX12-NEXT: v_pk_max_i16 v7, v7, v17
+; GFX12-NEXT: v_pk_max_i16 v8, v8, v18
+; GFX12-NEXT: v_pk_max_i16 v9, v9, v19
+; GFX12-NEXT: v_pk_max_i16 v10, v10, v20
+; GFX12-NEXT: v_pk_sub_i16 v16, 0, v11
+; GFX12-NEXT: v_pk_sub_i16 v17, 0, v12
+; GFX12-NEXT: v_pk_sub_i16 v18, 0, v13
+; GFX12-NEXT: v_pk_sub_i16 v19, 0, v14
+; GFX12-NEXT: v_pk_sub_i16 v20, 0, v15
+; GFX12-NEXT: v_pk_max_i16 v11, v11, v16
+; GFX12-NEXT: v_pk_max_i16 v12, v12, v17
+; GFX12-NEXT: v_pk_max_i16 v13, v13, v18
+; GFX12-NEXT: v_pk_max_i16 v14, v14, v19
+; GFX12-NEXT: v_pk_max_i16 v15, v15, v20
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %arg, i1 false)
+ ret <32 x i16> %res
+}
More information about the llvm-commits
mailing list