[llvm] [AMDGPU] 32-bit ABS is a legal DAG node (PR #163907)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 16 21:23:04 PDT 2025
https://github.com/LU-JOHN created https://github.com/llvm/llvm-project/pull/163907
32-bit ABS can be lowered legally.
From 18930539c72e24cfcc26cd37fed46e5e0843a7af Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 16 Oct 2025 23:20:04 -0500
Subject: [PATCH] Make 32-bit ABS a legal DAG node
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +
llvm/test/CodeGen/AMDGPU/abs_i16.ll | 980 +++++++++---------
...amdgpu-codegenprepare-fold-binop-select.ll | 4 +-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 140 ++-
llvm/test/CodeGen/AMDGPU/bypass-div.ll | 8 +-
.../CodeGen/AMDGPU/divergence-driven-abs.ll | 8 +-
llvm/test/CodeGen/AMDGPU/sdiv.ll | 788 +++++++-------
llvm/test/CodeGen/AMDGPU/sminmax.ll | 16 +-
llvm/test/CodeGen/AMDGPU/srem.ll | 26 +-
10 files changed, 967 insertions(+), 1014 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1b559a628be08..8ed4062e43946 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -514,8 +514,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
- setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
- Legal);
+ setOperationAction({ISD::ABS, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX},
+ MVT::i32, Legal);
setOperationAction(
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a2841c114a698..8ded201f03055 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -14945,6 +14945,13 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
}
}
+ // max(x, neg(x)) -> abs(x)
+ if (Opc == ISD::SMAX && VT == MVT::i32) {
+ SDValue Value;
+ if (sd_match(N, m_SMax(m_Value(Value), m_Neg(m_Deferred(Value)))))
+ return DAG.getNode(ISD::ABS, SDLoc(N), VT, Value);
+ }
+
// min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
// max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index 7633ba0eb4f9c..66cc7f3db03c2 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -15,7 +15,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX6-NEXT: v_max_i32_e32 v0, v1, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: abs_i16:
@@ -23,7 +23,7 @@ define i16 @abs_i16(i16 %arg) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v1
+; GFX7-NEXT: v_max_i32_e32 v0, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_i16:
@@ -97,9 +97,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX6-NEXT: v_max_i32_e32 v0, v2, v0
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX6-NEXT: v_max_i32_e32 v1, v2, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -110,9 +110,9 @@ define <2 x i16> @v_abs_v2i16(<2 x i16> %arg) {
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v2
+; GFX7-NEXT: v_max_i32_e32 v0, v2, v0
; GFX7-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX7-NEXT: v_max_i32_e32 v1, v2, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -172,15 +172,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v3, v0
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX6-NEXT: v_max_i32_e32 v1, v3, v1
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_max_i32_e32 v2, v3, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v3i16:
@@ -189,15 +189,15 @@ define <3 x i16> @v_abs_v3i16(<3 x i16> %arg) {
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v3, v0
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GFX7-NEXT: v_max_i32_e32 v1, v3, v1
+; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_max_i32_e32 v2, v3, v2
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v3i16:
@@ -262,47 +262,45 @@ define <4 x i16> @v_abs_v4i16(<4 x i16> %arg) {
; GFX6-LABEL: v_abs_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v4, v0
+; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX6-NEXT: v_max_i32_e32 v2, v4, v2
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v4
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v4i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v4, v0
+; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v4
+; GFX7-NEXT: v_max_i32_e32 v2, v4, v2
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v4
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v4i16:
@@ -370,63 +368,61 @@ define <6 x i16> @v_abs_v6i16(<6 x i16> %arg) {
; GFX6-LABEL: v_abs_v6i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v6, v0
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GFX6-NEXT: v_max_i32_e32 v4, v6, v4
+; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
+; GFX6-NEXT: v_max_i32_e32 v5, v6, v5
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v6
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX6-NEXT: v_max_i32_e32 v2, v6, v2
; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v6
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v3, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v6
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v3
-; GFX6-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v3
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v6i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v6, v0
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v6, v1
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v4
+; GFX7-NEXT: v_max_i32_e32 v4, v6, v4
+; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v5
+; GFX7-NEXT: v_max_i32_e32 v5, v6, v5
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v6
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v6
+; GFX7-NEXT: v_max_i32_e32 v2, v6, v2
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v6
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v3, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v3
-; GFX7-NEXT: v_max_i32_e32 v1, v4, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v1, v3
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v6i16:
@@ -509,83 +505,79 @@ define <8 x i16> @v_abs_v8i16(<8 x i16> %arg) {
; GFX6-LABEL: v_abs_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v8, v0
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v8, v1
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v8, v5
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX6-NEXT: v_max_i32_e32 v6, v8, v6
; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v8
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v8, v7
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v8, v2
+; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v8
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v8i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v8, v0
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v8, v1
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v8, v4
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v5
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v8, v5
; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v8
+; GFX7-NEXT: v_max_i32_e32 v6, v8, v6
; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v8
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v8, v7
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v8
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v8, v2
+; GFX7-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v8, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v8
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v8i16:
@@ -682,155 +674,147 @@ define <16 x i16> @v_abs_v16i16(<16 x i16> %arg) {
; GFX6-LABEL: v_abs_v16i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v16, v0
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v16, v1
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v16, v4
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v16, v5
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v8, v16, v8
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v9
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v9, v16, v9
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v12
; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v12, v16, v12
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v13
; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v13, v16, v13
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX6-NEXT: v_max_i32_e32 v14, v16, v14
; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
-; GFX6-NEXT: v_max_i32_e32 v15, v15, v16
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v16, v15
; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v16, v10
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v16, v11
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v11
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v16, v6
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v16, v7
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v16
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v16, v2
+; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v16, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v16
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v16i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v16, v0
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v16, v1
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v16, v4
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v16, v5
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v8
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v8, v16, v8
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v9
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v9, v16, v9
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v12
; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v12, v16, v12
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v13
; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v13, v16, v13
; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v14
-; GFX7-NEXT: v_max_i32_e32 v14, v14, v16
+; GFX7-NEXT: v_max_i32_e32 v14, v16, v14
; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v15
-; GFX7-NEXT: v_max_i32_e32 v15, v15, v16
-; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v16, v15
; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v16, v10
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v16, v11
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v11
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v16, v6
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v16, v7
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v7
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v16
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v16, v2
+; GFX7-NEXT: v_sub_i32_e32 v16, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v16, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v16
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v16i16:
@@ -974,303 +958,287 @@ define <32 x i16> @v_abs_v32i16(<32 x i16> %arg) {
; GFX6-LABEL: v_abs_v32i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v0, v31, v0
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v1
+; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v1, v31, v1
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v4
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v4, v31, v4
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v5
+; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v5, v31, v5
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v8
+; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v8, v31, v8
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v9
+; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v9, v31, v9
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v12
+; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v12, v31, v12
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v13
+; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v13, v31, v13
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v16
+; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v16, v31, v16
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v17
+; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v17, v31, v17
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v20
+; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v20, v31, v20
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v21
+; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v21, v31, v21
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v24, v31, v24
+; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX6-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v25, v31, v25
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
; GFX6-NEXT: v_bfe_i32 v29, v29, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX6-NEXT: v_max_i32_e32 v28, v31, v28
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
; GFX6-NEXT: v_bfe_i32 v30, v30, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX6-NEXT: v_max_i32_e32 v29, v31, v29
; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX6-NEXT: v_max_i32_e32 v30, v31, v30
+; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX6-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v30, v30, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
; GFX6-NEXT: v_bfe_i32 v27, v27, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v26, v26, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
-; GFX6-NEXT: v_bfe_i32 v24, v24, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v27, v27, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
-; GFX6-NEXT: v_bfe_i32 v25, v25, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v24, v24, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX6-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v25, v25, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
; GFX6-NEXT: v_bfe_i32 v23, v23, 0, 16
-; GFX6-NEXT: v_max_i32_e32 v22, v22, v31
-; GFX6-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
-; GFX6-NEXT: v_max_i32_e32 v23, v23, v31
-; GFX6-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX6-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
-; GFX6-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GFX6-NEXT: v_bfe_i32 v20, v20, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
-; GFX6-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
-; GFX6-NEXT: v_max_i32_e32 v20, v20, v29
; GFX6-NEXT: v_bfe_i32 v18, v18, 0, 16
; GFX6-NEXT: v_bfe_i32 v19, v19, 0, 16
-; GFX6-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX6-NEXT: v_bfe_i32 v17, v17, 0, 16
; GFX6-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX6-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX6-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX6-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX6-NEXT: v_bfe_i32 v10, v10, 0, 16
; GFX6-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX6-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX6-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX6-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX6-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_bfe_i32 v23, v31, 0, 16
-; GFX6-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
-; GFX6-NEXT: v_max_i32_e32 v23, v23, v25
-; GFX6-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX6-NEXT: v_or_b32_e32 v30, v30, v23
-; GFX6-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
-; GFX6-NEXT: v_max_i32_e32 v21, v21, v23
-; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
-; GFX6-NEXT: v_max_i32_e32 v18, v18, v21
-; GFX6-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
-; GFX6-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX6-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX6-NEXT: v_or_b32_e32 v18, v18, v19
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
-; GFX6-NEXT: v_max_i32_e32 v16, v16, v19
-; GFX6-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
-; GFX6-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
-; GFX6-NEXT: v_max_i32_e32 v14, v14, v17
-; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
-; GFX6-NEXT: v_max_i32_e32 v15, v15, v17
-; GFX6-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX6-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX6-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX6-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX6-NEXT: v_max_i32_e32 v13, v13, v15
-; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX6-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX6-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX6-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX6-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX6-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX6-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX6-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX6-NEXT: v_max_i32_e32 v9, v9, v11
-; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX6-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX6-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX6-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX6-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX6-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX6-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX6-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX6-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX6-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX6-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX6-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX6-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX6-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX6-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX6-NEXT: v_alignbit_b32 v17, v18, v16, 16
-; GFX6-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GFX6-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GFX6-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX6-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GFX6-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GFX6-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX6-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX6-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX6-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX6-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX6-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX6-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v31
+; GFX6-NEXT: v_max_i32_e32 v31, v32, v31
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GFX6-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v26
+; GFX6-NEXT: v_max_i32_e32 v26, v32, v26
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v27
+; GFX6-NEXT: v_max_i32_e32 v27, v32, v27
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GFX6-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v22
+; GFX6-NEXT: v_max_i32_e32 v22, v32, v22
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v23
+; GFX6-NEXT: v_max_i32_e32 v23, v32, v23
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GFX6-NEXT: v_or_b32_e32 v22, v22, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v18
+; GFX6-NEXT: v_max_i32_e32 v18, v32, v18
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v19
+; GFX6-NEXT: v_max_i32_e32 v19, v32, v19
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GFX6-NEXT: v_or_b32_e32 v18, v18, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v14
+; GFX6-NEXT: v_max_i32_e32 v14, v32, v14
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v15
+; GFX6-NEXT: v_max_i32_e32 v15, v32, v15
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX6-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v10
+; GFX6-NEXT: v_max_i32_e32 v10, v32, v10
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v11
+; GFX6-NEXT: v_max_i32_e32 v11, v32, v11
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX6-NEXT: v_or_b32_e32 v10, v10, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v6
+; GFX6-NEXT: v_max_i32_e32 v6, v32, v6
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v7
+; GFX6-NEXT: v_max_i32_e32 v7, v32, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v6, v32
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v2
+; GFX6-NEXT: v_max_i32_e32 v2, v32, v2
+; GFX6-NEXT: v_sub_i32_e32 v32, vcc, 0, v3
+; GFX6-NEXT: v_max_i32_e32 v3, v32, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v32
+; GFX6-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX6-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX6-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GFX6-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX6-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX6-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_abs_v32i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v0
+; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v0, v31, v0
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v1
+; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v1, v31, v1
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v4
+; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v4, v31, v4
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v5
+; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v5, v31, v5
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v8
+; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v8, v31, v8
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v9
+; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v9, v31, v9
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v12
+; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v12, v31, v12
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v13
+; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v13, v31, v13
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v16
+; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v16, v31, v16
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v17
+; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v17, v31, v17
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v20
+; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v20, v31, v20
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v21
+; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v21, v31, v21
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
+; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v24, v31, v24
+; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX7-NEXT: v_bfe_i32 v28, v28, 0, 16
+; GFX7-NEXT: v_max_i32_e32 v25, v31, v25
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v28
; GFX7-NEXT: v_bfe_i32 v29, v29, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v28, v28, v31
+; GFX7-NEXT: v_max_i32_e32 v28, v31, v28
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v29
; GFX7-NEXT: v_bfe_i32 v30, v30, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v29, v29, v31
+; GFX7-NEXT: v_max_i32_e32 v29, v31, v29
; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v30
+; GFX7-NEXT: v_max_i32_e32 v30, v31, v30
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
; GFX7-NEXT: v_bfe_i32 v26, v26, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v30, v30, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v26
; GFX7-NEXT: v_bfe_i32 v27, v27, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v26, v26, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v27
-; GFX7-NEXT: v_bfe_i32 v24, v24, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v27, v27, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v24
-; GFX7-NEXT: v_bfe_i32 v25, v25, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v24, v24, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v25
; GFX7-NEXT: v_bfe_i32 v22, v22, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v25, v25, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v22
; GFX7-NEXT: v_bfe_i32 v23, v23, 0, 16
-; GFX7-NEXT: v_max_i32_e32 v22, v22, v31
-; GFX7-NEXT: v_sub_i32_e32 v31, vcc, 0, v23
-; GFX7-NEXT: v_max_i32_e32 v23, v23, v31
-; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GFX7-NEXT: v_or_b32_e32 v22, v22, v23
-; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
-; GFX7-NEXT: v_bfe_i32 v21, v21, 0, 16
-; GFX7-NEXT: v_bfe_i32 v20, v20, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
-; GFX7-NEXT: v_sub_i32_e32 v29, vcc, 0, v20
-; GFX7-NEXT: v_max_i32_e32 v20, v20, v29
; GFX7-NEXT: v_bfe_i32 v18, v18, 0, 16
; GFX7-NEXT: v_bfe_i32 v19, v19, 0, 16
-; GFX7-NEXT: v_bfe_i32 v16, v16, 0, 16
-; GFX7-NEXT: v_bfe_i32 v17, v17, 0, 16
; GFX7-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX7-NEXT: v_bfe_i32 v15, v15, 0, 16
-; GFX7-NEXT: v_bfe_i32 v12, v12, 0, 16
-; GFX7-NEXT: v_bfe_i32 v13, v13, 0, 16
; GFX7-NEXT: v_bfe_i32 v10, v10, 0, 16
; GFX7-NEXT: v_bfe_i32 v11, v11, 0, 16
-; GFX7-NEXT: v_bfe_i32 v8, v8, 0, 16
-; GFX7-NEXT: v_bfe_i32 v9, v9, 0, 16
; GFX7-NEXT: v_bfe_i32 v6, v6, 0, 16
; GFX7-NEXT: v_bfe_i32 v7, v7, 0, 16
-; GFX7-NEXT: v_bfe_i32 v4, v4, 0, 16
-; GFX7-NEXT: v_bfe_i32 v5, v5, 0, 16
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX7-NEXT: v_bfe_i32 v3, v3, 0, 16
-; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX7-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GFX7-NEXT: v_or_b32_e32 v26, v26, v27
-; GFX7-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_bfe_i32 v23, v31, 0, 16
-; GFX7-NEXT: v_sub_i32_e32 v25, vcc, 0, v23
-; GFX7-NEXT: v_max_i32_e32 v23, v23, v25
-; GFX7-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GFX7-NEXT: v_or_b32_e32 v30, v30, v23
-; GFX7-NEXT: v_sub_i32_e32 v23, vcc, 0, v21
-; GFX7-NEXT: v_max_i32_e32 v21, v21, v23
-; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
-; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
-; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v18
-; GFX7-NEXT: v_max_i32_e32 v18, v18, v21
-; GFX7-NEXT: v_sub_i32_e32 v21, vcc, 0, v19
-; GFX7-NEXT: v_max_i32_e32 v19, v19, v21
-; GFX7-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; GFX7-NEXT: v_or_b32_e32 v18, v18, v19
-; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v16
-; GFX7-NEXT: v_max_i32_e32 v16, v16, v19
-; GFX7-NEXT: v_sub_i32_e32 v19, vcc, 0, v17
-; GFX7-NEXT: v_max_i32_e32 v17, v17, v19
-; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
-; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v14
-; GFX7-NEXT: v_max_i32_e32 v14, v14, v17
-; GFX7-NEXT: v_sub_i32_e32 v17, vcc, 0, v15
-; GFX7-NEXT: v_max_i32_e32 v15, v15, v17
-; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GFX7-NEXT: v_or_b32_e32 v14, v14, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v12
-; GFX7-NEXT: v_max_i32_e32 v12, v12, v15
-; GFX7-NEXT: v_sub_i32_e32 v15, vcc, 0, v13
-; GFX7-NEXT: v_max_i32_e32 v13, v13, v15
-; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v10
-; GFX7-NEXT: v_max_i32_e32 v10, v10, v13
-; GFX7-NEXT: v_sub_i32_e32 v13, vcc, 0, v11
-; GFX7-NEXT: v_max_i32_e32 v11, v11, v13
-; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GFX7-NEXT: v_or_b32_e32 v10, v10, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v8
-; GFX7-NEXT: v_max_i32_e32 v8, v8, v11
-; GFX7-NEXT: v_sub_i32_e32 v11, vcc, 0, v9
-; GFX7-NEXT: v_max_i32_e32 v9, v9, v11
-; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v6
-; GFX7-NEXT: v_max_i32_e32 v6, v6, v9
-; GFX7-NEXT: v_sub_i32_e32 v9, vcc, 0, v7
-; GFX7-NEXT: v_max_i32_e32 v7, v7, v9
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v6, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
-; GFX7-NEXT: v_max_i32_e32 v4, v4, v7
-; GFX7-NEXT: v_sub_i32_e32 v7, vcc, 0, v5
-; GFX7-NEXT: v_max_i32_e32 v5, v5, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GFX7-NEXT: v_max_i32_e32 v2, v2, v5
-; GFX7-NEXT: v_sub_i32_e32 v5, vcc, 0, v3
-; GFX7-NEXT: v_max_i32_e32 v3, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
-; GFX7-NEXT: v_max_i32_e32 v0, v0, v3
-; GFX7-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; GFX7-NEXT: v_max_i32_e32 v1, v1, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX7-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GFX7-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GFX7-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GFX7-NEXT: v_alignbit_b32 v17, v18, v16, 16
-; GFX7-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GFX7-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GFX7-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX7-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX7-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX7-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX7-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX7-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX7-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_bfe_i32 v31, v31, 0, 16
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v31
+; GFX7-NEXT: v_max_i32_e32 v31, v32, v31
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GFX7-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v26
+; GFX7-NEXT: v_max_i32_e32 v26, v32, v26
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v27
+; GFX7-NEXT: v_max_i32_e32 v27, v32, v27
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GFX7-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v22
+; GFX7-NEXT: v_max_i32_e32 v22, v32, v22
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v23
+; GFX7-NEXT: v_max_i32_e32 v23, v32, v23
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GFX7-NEXT: v_or_b32_e32 v22, v22, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v18
+; GFX7-NEXT: v_max_i32_e32 v18, v32, v18
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v19
+; GFX7-NEXT: v_max_i32_e32 v19, v32, v19
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GFX7-NEXT: v_or_b32_e32 v18, v18, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v14
+; GFX7-NEXT: v_max_i32_e32 v14, v32, v14
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v15
+; GFX7-NEXT: v_max_i32_e32 v15, v32, v15
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX7-NEXT: v_or_b32_e32 v14, v14, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v10
+; GFX7-NEXT: v_max_i32_e32 v10, v32, v10
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v11
+; GFX7-NEXT: v_max_i32_e32 v11, v32, v11
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GFX7-NEXT: v_or_b32_e32 v10, v10, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v6
+; GFX7-NEXT: v_max_i32_e32 v6, v32, v6
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v7
+; GFX7-NEXT: v_max_i32_e32 v7, v32, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v6, v32
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v2
+; GFX7-NEXT: v_max_i32_e32 v2, v32, v2
+; GFX7-NEXT: v_sub_i32_e32 v32, vcc, 0, v3
+; GFX7-NEXT: v_max_i32_e32 v3, v32, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v32
+; GFX7-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GFX7-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GFX7-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GFX7-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GFX7-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GFX7-NEXT: v_alignbit_b32 v29, v30, v29, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_abs_v32i16:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index e71bf15384727..e34aaf205cb95 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -136,7 +136,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, 5, v1, vcc
; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GCN-NEXT: v_max_i32_e32 v1, v0, v1
+; GCN-NEXT: v_max_i32_e32 v1, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
; GCN-NEXT: s_mov_b32 s4, 0xf4240
@@ -218,7 +218,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 5, vcc
; GCN-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
-; GCN-NEXT: v_max_i32_e32 v1, v0, v1
+; GCN-NEXT: v_max_i32_e32 v1, v1, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
; GCN-NEXT: s_mov_b32 s4, 0xf4240
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index e27164c2d6d69..948811ea45f77 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -6191,37 +6191,34 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s8, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s8
-; GFX6-NEXT: s_xor_b32 s3, s3, s8
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT: s_sub_i32 s4, 0, s3
-; GFX6-NEXT: s_ashr_i32 s9, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s9
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s2, s2, s9
+; GFX6-NEXT: s_abs_i32 s8, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT: s_sub_i32 s4, 0, s8
+; GFX6-NEXT: s_abs_i32 s9, s2
; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s0, s0, s3
-; GFX6-NEXT: s_sub_i32 s0, s2, s0
-; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_mul_i32 s0, s0, s8
+; GFX6-NEXT: s_sub_i32 s0, s9, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s8
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
; GFX6-NEXT: s_cselect_b32 s0, s1, s0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0
-; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s8
; GFX6-NEXT: s_cselect_b64 vcc, -1, 0
+; GFX6-NEXT: s_xor_b32 s0, s2, s3
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT: s_xor_b32 s0, s9, s8
+; GFX6-NEXT: s_ashr_i32 s0, s0, 31
; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -6233,35 +6230,32 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
-; GFX9-NEXT: s_ashr_i32 s5, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s5
+; GFX9-NEXT: s_abs_i32 s4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s4
+; GFX9-NEXT: s_abs_i32 s5, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s6, s6, s7
; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7
-; GFX9-NEXT: s_mul_i32 s8, s6, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT: s_mul_i32 s8, s6, s4
+; GFX9-NEXT: s_sub_i32 s5, s5, s8
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_sub_i32 s8, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_sub_i32 s8, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s6, s7, s6
-; GFX9-NEXT: s_cselect_b32 s2, s8, s2
+; GFX9-NEXT: s_cselect_b32 s5, s8, s5
; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s7, s6
-; GFX9-NEXT: s_xor_b32 s3, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
+; GFX9-NEXT: s_cselect_b32 s4, s7, s6
; GFX9-NEXT: s_xor_b32 s2, s2, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s4, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -6706,38 +6700,37 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX6-LABEL: srem_i32_pow2_shl_denom:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT: s_ashr_i32 s4, s3, 31
-; GFX6-NEXT: s_add_i32 s3, s3, s4
-; GFX6-NEXT: s_xor_b32 s4, s3, s4
-; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX6-NEXT: s_sub_i32 s3, 0, s4
-; GFX6-NEXT: s_ashr_i32 s5, s2, 31
-; GFX6-NEXT: s_add_i32 s2, s2, s5
+; GFX6-NEXT: s_abs_i32 s3, s3
+; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT: s_sub_i32 s4, 0, s3
+; GFX6-NEXT: s_abs_i32 s8, s2
+; GFX6-NEXT: s_mov_b32 s5, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT: s_xor_b32 s6, s2, s5
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT: s_mov_b32 s4, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_mul_i32 s7, s7, s4
-; GFX6-NEXT: s_sub_i32 s6, s6, s7
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s6, s7, s6
-; GFX6-NEXT: s_sub_i32 s7, s6, s4
-; GFX6-NEXT: s_cmp_ge_u32 s6, s4
-; GFX6-NEXT: s_cselect_b32 s4, s7, s6
-; GFX6-NEXT: s_xor_b32 s4, s4, s5
-; GFX6-NEXT: s_sub_i32 s4, s4, s5
-; GFX6-NEXT: v_mov_b32_e32 v0, s4
-; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s0, s0, s3
+; GFX6-NEXT: s_sub_i32 s0, s8, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_sub_i32 s1, s0, s3
+; GFX6-NEXT: s_cmp_ge_u32 s0, s3
+; GFX6-NEXT: s_cselect_b32 s0, s1, s0
+; GFX6-NEXT: s_ashr_i32 s1, s2, 31
+; GFX6-NEXT: s_xor_b32 s0, s0, s1
+; GFX6-NEXT: s_sub_i32 s0, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
@@ -6746,32 +6739,29 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_i32 s3, s3, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
+; GFX9-NEXT: s_abs_i32 s3, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s5, 0, s3
-; GFX9-NEXT: s_ashr_i32 s4, s2, 31
-; GFX9-NEXT: s_add_i32 s2, s2, s4
+; GFX9-NEXT: s_abs_i32 s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s5, s5, s6
; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
; GFX9-NEXT: s_add_i32 s6, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s4, s6
; GFX9-NEXT: s_mul_i32 s5, s5, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s5
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s2, s3
-; GFX9-NEXT: s_cmp_ge_u32 s2, s3
-; GFX9-NEXT: s_cselect_b32 s2, s5, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s4, s3
+; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: s_cselect_b32 s3, s5, s4
+; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_xor_b32 s3, s3, s2
+; GFX9-NEXT: s_sub_i32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 3cf70c42390c2..d7d697ef85b9f 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -576,11 +576,11 @@ define i32 @sdiv32(i32 %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1
-; GFX9-NEXT: v_max_i32_e32 v2, v1, v2
+; GFX9-NEXT: v_max_i32_e32 v2, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2
; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v5, v0, v5
+; GFX9-NEXT: v_max_i32_e32 v5, v5, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
@@ -640,11 +640,11 @@ define i32 @srem32(i32 %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v2, 0, v1
-; GFX9-NEXT: v_max_i32_e32 v1, v1, v2
+; GFX9-NEXT: v_max_i32_e32 v1, v2, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0
-; GFX9-NEXT: v_max_i32_e32 v4, v0, v4
+; GFX9-NEXT: v_max_i32_e32 v4, v4, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
index 68ae9854bd7d2..f72c164d6ff80 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-abs.ll
@@ -15,8 +15,8 @@ define amdgpu_kernel void @s_abs_i32(ptr addrspace(1) %out, i32 %val) nounwind {
}
; GCN-LABEL: name: v_abs_i32
-; SI: V_SUB_CO_U32_e64
-; GFX900: V_SUB_U32_e64
+; SI: V_SUB_CO_U32_e32
+; GFX900: V_SUB_U32_e32
; GCN: V_MAX_I32_e64
define amdgpu_kernel void @v_abs_i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -47,8 +47,8 @@ define amdgpu_kernel void @s_abs_v2i32(ptr addrspace(1) %out, <2 x i32> %val) no
}
; GCN-LABEL: name: v_abs_v2i32
-; SI: V_SUB_CO_U32_e64
-; GFX900: V_SUB_U32_e64
+; SI: V_SUB_CO_U32_e32
+; GFX900: V_SUB_U32_e32
; GCN: V_MAX_I32_e64
; GCN: V_MAX_I32_e64
define amdgpu_kernel void @v_abs_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 5c0f813c8c829..441509ba01f64 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -391,156 +391,144 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GCN-LABEL: sdiv_v2i32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s10, s2
-; GCN-NEXT: s_mov_b32 s11, s3
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s6
-; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s0, v2
-; GCN-NEXT: s_abs_i32 s1, s0
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GCN-NEXT: s_sub_i32 s6, 0, s1
-; GCN-NEXT: v_readfirstlane_b32 s8, v3
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: v_mul_lo_u32 v4, s6, v2
-; GCN-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NEXT: s_abs_i32 s7, s6
-; GCN-NEXT: s_xor_b32 s0, s6, s0
-; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
-; GCN-NEXT: s_ashr_i32 s6, s0, 31
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4
-; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s0, s0, s1
-; GCN-NEXT: s_sub_i32 s0, s7, s0
-; GCN-NEXT: s_sub_i32 s7, s0, s1
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s1
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: s_cselect_b32 s0, s7, s0
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GCN-NEXT: s_cmp_ge_u32 s0, s1
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_abs_i32 s7, s8
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7
-; GCN-NEXT: s_mov_b32 s0, s4
-; GCN-NEXT: s_sub_i32 s4, 0, s7
-; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, s6, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s4, v3
-; GCN-NEXT: v_readfirstlane_b32 s4, v1
-; GCN-NEXT: s_xor_b32 s5, s4, s8
-; GCN-NEXT: s_abs_i32 s4, s4
-; GCN-NEXT: v_mul_hi_u32 v1, v3, v4
-; GCN-NEXT: s_ashr_i32 s5, s5, 31
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_mul_i32 s6, s6, s7
-; GCN-NEXT: s_sub_i32 s4, s4, s6
-; GCN-NEXT: s_sub_i32 s6, s4, s7
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GCN-NEXT: s_cmp_ge_u32 s4, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT: s_cselect_b32 s4, s6, s4
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1
-; GCN-NEXT: s_cmp_ge_u32 s4, s7
-; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, s5, v1
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GCN-NEXT: v_xor_b32_e32 v4, v0, v2
+; GCN-NEXT: v_xor_b32_e32 v7, v1, v3
+; GCN-NEXT: v_max_i32_e32 v2, v2, v6
+; GCN-NEXT: v_max_i32_e32 v3, v3, v9
+; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2
+; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0
+; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GCN-NEXT: v_max_i32_e32 v0, v0, v5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
+; GCN-NEXT: v_mul_lo_u32 v9, v9, v6
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v5
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1
+; GCN-NEXT: v_mul_hi_u32 v9, v6, v9
+; GCN-NEXT: v_max_i32_e32 v1, v1, v8
+; GCN-NEXT: v_mul_hi_u32 v8, v5, v10
+; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT: v_mul_hi_u32 v6, v0, v6
+; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
+; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GCN-NEXT: v_mul_lo_u32 v8, v6, v2
+; GCN-NEXT: v_mul_lo_u32 v10, v5, v3
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v4
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s3, 0xf000
-; TONGA-NEXT: s_mov_b32 s2, -1
-; TONGA-NEXT: s_mov_b32 s10, s2
-; TONGA-NEXT: s_mov_b32 s11, s3
+; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s6
-; TONGA-NEXT: s_mov_b32 s9, s7
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_readfirstlane_b32 s0, v2
-; TONGA-NEXT: s_abs_i32 s1, s0
-; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1
-; TONGA-NEXT: s_sub_i32 s6, 0, s1
-; TONGA-NEXT: v_readfirstlane_b32 s8, v3
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2
-; TONGA-NEXT: v_readfirstlane_b32 s6, v0
-; TONGA-NEXT: s_abs_i32 s7, s6
-; TONGA-NEXT: s_xor_b32 s0, s6, s0
-; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
-; TONGA-NEXT: s_ashr_i32 s6, s0, 31
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4
-; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0
-; TONGA-NEXT: v_readfirstlane_b32 s0, v0
-; TONGA-NEXT: s_mul_i32 s0, s0, s1
-; TONGA-NEXT: s_sub_i32 s0, s7, s0
-; TONGA-NEXT: s_sub_i32 s7, s0, s1
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
-; TONGA-NEXT: s_cmp_ge_u32 s0, s1
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; TONGA-NEXT: s_cselect_b32 s0, s7, s0
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0
-; TONGA-NEXT: s_cmp_ge_u32 s0, s1
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: s_abs_i32 s7, s8
-; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7
-; TONGA-NEXT: s_mov_b32 s0, s4
-; TONGA-NEXT: s_sub_i32 s4, 0, s7
-; TONGA-NEXT: s_mov_b32 s1, s5
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0
-; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3
-; TONGA-NEXT: v_readfirstlane_b32 s4, v1
-; TONGA-NEXT: s_xor_b32 s5, s4, s8
-; TONGA-NEXT: s_abs_i32 s4, s4
-; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4
-; TONGA-NEXT: s_ashr_i32 s5, s5, 31
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
-; TONGA-NEXT: v_readfirstlane_b32 s6, v1
-; TONGA-NEXT: s_mul_i32 s6, s6, s7
-; TONGA-NEXT: s_sub_i32 s4, s4, s6
-; TONGA-NEXT: s_sub_i32 s6, s4, s7
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
-; TONGA-NEXT: s_cmp_ge_u32 s4, s7
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; TONGA-NEXT: s_cselect_b32 s4, s6, s4
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1
-; TONGA-NEXT: s_cmp_ge_u32 s4, s7
-; TONGA-NEXT: s_cselect_b64 vcc, -1, 0
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
+; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2
+; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v3, v3, v9
+; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2
+; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; TONGA-NEXT: v_max_i32_e32 v0, v0, v5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6
+; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3
+; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1
+; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v8
+; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10
+; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4
+; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9
+; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8
+; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6
+; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
+; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2
+; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10
+; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2
+; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3
+; TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3]
+; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32:
@@ -558,44 +546,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_readfirstlane_b32 s0, v2
; GFX9-NEXT: s_abs_i32 s1, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_xor_b32 s0, s4, s0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_xor_b32 s0, s5, s0
; GFX9-NEXT: s_ashr_i32 s6, s0, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: s_sub_i32 s0, 0, s1
-; GFX9-NEXT: s_abs_i32 s4, s4
-; GFX9-NEXT: v_readfirstlane_b32 s5, v3
+; GFX9-NEXT: s_abs_i32 s5, s5
+; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_mul_i32 s0, s0, s7
; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0
; GFX9-NEXT: s_add_i32 s7, s7, s0
-; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7
; GFX9-NEXT: s_mul_i32 s7, s0, s1
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
+; GFX9-NEXT: s_sub_i32 s5, s5, s7
; GFX9-NEXT: s_add_i32 s10, s0, 1
-; GFX9-NEXT: s_sub_i32 s7, s4, s1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s1
+; GFX9-NEXT: s_sub_i32 s7, s5, s1
+; GFX9-NEXT: s_cmp_ge_u32 s5, s1
; GFX9-NEXT: s_cselect_b32 s0, s10, s0
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_cselect_b32 s5, s7, s5
; GFX9-NEXT: s_add_i32 s7, s0, 1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s1
-; GFX9-NEXT: s_cselect_b32 s4, s7, s0
-; GFX9-NEXT: s_abs_i32 s7, s5
+; GFX9-NEXT: s_cmp_ge_u32 s5, s1
+; GFX9-NEXT: s_cselect_b32 s5, s7, s0
+; GFX9-NEXT: s_abs_i32 s7, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT: s_xor_b32 s4, s4, s6
+; GFX9-NEXT: s_xor_b32 s5, s5, s6
; GFX9-NEXT: s_mov_b32 s1, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s4, s4, s6
+; GFX9-NEXT: s_sub_i32 s5, s5, s6
; GFX9-NEXT: s_mov_b32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s5, s8, s5
+; GFX9-NEXT: s_xor_b32 s4, s8, s4
; GFX9-NEXT: s_abs_i32 s8, s8
-; GFX9-NEXT: s_ashr_i32 s5, s5, 31
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_mul_i32 s9, s9, s6
; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9
@@ -611,10 +599,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_add_i32 s9, s6, 1
; GFX9-NEXT: s_cmp_ge_u32 s8, s7
; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s5
-; GFX9-NEXT: s_sub_i32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_xor_b32 s6, s6, s4
+; GFX9-NEXT: s_sub_i32 s4, s6, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -804,255 +792,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: sdiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s10, -1
-; GCN-NEXT: s_mov_b32 s6, s10
-; GCN-NEXT: s_mov_b32 s7, s11
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s2
-; GCN-NEXT: s_mov_b32 s5, s3
-; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; GCN-NEXT: s_mov_b32 s8, s0
-; GCN-NEXT: s_mov_b32 s9, s1
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s1, v1
-; GCN-NEXT: v_readfirstlane_b32 s2, v2
-; GCN-NEXT: s_abs_i32 s13, s0
-; GCN-NEXT: s_abs_i32 s14, s1
-; GCN-NEXT: s_abs_i32 s15, s2
-; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14
-; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GCN-NEXT: v_xor_b32_e32 v8, v0, v4
+; GCN-NEXT: v_max_i32_e32 v4, v4, v10
+; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
+; GCN-NEXT: v_xor_b32_e32 v11, v1, v5
+; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10
+; GCN-NEXT: v_max_i32_e32 v5, v5, v13
+; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4
+; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v10
+; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
+; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13
+; GCN-NEXT: v_max_i32_e32 v0, v0, v9
+; GCN-NEXT: v_mul_hi_u32 v16, v10, v16
+; GCN-NEXT: v_max_i32_e32 v1, v1, v12
+; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6
+; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16
+; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GCN-NEXT: v_mul_lo_u32 v16, v16, v13
+; GCN-NEXT: v_mul_hi_u32 v10, v0, v10
+; GCN-NEXT: v_xor_b32_e32 v14, v2, v6
+; GCN-NEXT: v_max_i32_e32 v6, v6, v15
+; GCN-NEXT: v_mul_hi_u32 v12, v13, v16
+; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6
+; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GCN-NEXT: v_mul_lo_u32 v13, v10, v4
+; GCN-NEXT: v_mul_hi_u32 v12, v1, v12
+; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; GCN-NEXT: v_mul_lo_u32 v0, v12, v5
+; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9
+; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5
+; GCN-NEXT: v_mul_lo_u32 v4, v4, v9
+; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7
+; GCN-NEXT: v_max_i32_e32 v5, v7, v0
+; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v4
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: s_abs_i32 s17, s6
-; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
+; GCN-NEXT: v_max_i32_e32 v2, v2, v9
+; GCN-NEXT: v_mul_hi_u32 v4, v2, v4
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s3, v4
-; GCN-NEXT: v_readfirstlane_b32 s4, v5
-; GCN-NEXT: v_readfirstlane_b32 s5, v6
-; GCN-NEXT: s_xor_b32 s12, s3, s0
-; GCN-NEXT: s_xor_b32 s0, s4, s1
-; GCN-NEXT: s_xor_b32 s1, s5, s2
-; GCN-NEXT: s_sub_i32 s2, 0, s13
-; GCN-NEXT: s_ashr_i32 s18, s0, 31
-; GCN-NEXT: s_sub_i32 s0, 0, s14
-; GCN-NEXT: s_ashr_i32 s19, s1, 31
-; GCN-NEXT: s_sub_i32 s1, 0, s15
-; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GCN-NEXT: v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT: v_mul_lo_u32 v5, s0, v1
-; GCN-NEXT: v_mul_lo_u32 v6, s1, v2
-; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT: v_mul_hi_u32 v4, v0, v4
-; GCN-NEXT: v_mul_hi_u32 v5, v1, v5
-; GCN-NEXT: v_mul_hi_u32 v6, v2, v6
-; GCN-NEXT: s_sub_i32 s20, 0, s17
-; GCN-NEXT: v_readfirstlane_b32 s7, v7
-; GCN-NEXT: s_abs_i32 s3, s3
-; GCN-NEXT: s_abs_i32 s4, s4
-; GCN-NEXT: s_abs_i32 s5, s5
-; GCN-NEXT: v_mul_lo_u32 v7, s20, v3
-; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GCN-NEXT: v_mul_hi_u32 v0, s3, v0
-; GCN-NEXT: v_mul_hi_u32 v1, s4, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s5, v2
-; GCN-NEXT: v_mul_hi_u32 v7, v3, v7
-; GCN-NEXT: v_mul_lo_u32 v4, v0, s13
-; GCN-NEXT: v_mul_lo_u32 v6, v1, s14
-; GCN-NEXT: v_mul_lo_u32 v8, v2, s15
-; GCN-NEXT: s_abs_i32 s16, s7
-; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT: v_mul_hi_u32 v3, s16, v3
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4
-; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6
-; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2
-; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
-; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
-; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
-; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4
-; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8
-; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1
-; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
-; GCN-NEXT: v_mul_lo_u32 v4, v3, s17
-; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
-; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
-; GCN-NEXT: s_ashr_i32 s12, s12, 31
-; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; GCN-NEXT: v_xor_b32_e32 v0, s12, v0
-; GCN-NEXT: v_xor_b32_e32 v1, s18, v1
-; GCN-NEXT: v_xor_b32_e32 v2, s19, v2
-; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0
-; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1
-; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT: s_xor_b32 s0, s7, s6
-; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; GCN-NEXT: s_ashr_i32 s0, s0, 31
-; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GCN-NEXT: v_xor_b32_e32 v3, s0, v3
-; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0
+; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
+; GCN-NEXT: v_xor_b32_e32 v0, v0, v8
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v6
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1
+; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; GCN-NEXT: v_mul_lo_u32 v10, v10, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; GCN-NEXT: v_xor_b32_e32 v1, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11
+; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; GCN-NEXT: v_mul_hi_u32 v4, v9, v10
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3
+; GCN-NEXT: v_max_i32_e32 v6, v3, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT: v_mul_hi_u32 v4, v6, v4
+; GCN-NEXT: v_xor_b32_e32 v2, v2, v14
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; GCN-NEXT: v_mul_lo_u32 v8, v4, v5
+; GCN-NEXT: v_xor_b32_e32 v3, v3, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4
+; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_xor_b32_e32 v4, v4, v3
+; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; TONGA-NEXT: s_mov_b32 s11, 0xf000
-; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s7, 0xf000
+; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_mov_b32 s10, s6
+; TONGA-NEXT: s_mov_b32 s11, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s8, s2
+; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s4, s0
+; TONGA-NEXT: s_mov_b32 s5, s1
; TONGA-NEXT: s_waitcnt vmcnt(1)
-; TONGA-NEXT: v_readfirstlane_b32 s0, v0
-; TONGA-NEXT: v_readfirstlane_b32 s1, v1
-; TONGA-NEXT: v_readfirstlane_b32 s2, v2
-; TONGA-NEXT: s_abs_i32 s13, s0
-; TONGA-NEXT: s_abs_i32 s14, s1
-; TONGA-NEXT: s_abs_i32 s15, s2
-; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13
-; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14
-; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15
-; TONGA-NEXT: v_readfirstlane_b32 s6, v3
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0
+; TONGA-NEXT: s_waitcnt vmcnt(0)
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4
+; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4
+; TONGA-NEXT: v_max_i32_e32 v4, v4, v10
+; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5
+; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT: v_max_i32_e32 v5, v5, v13
+; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4
+; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10
+; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13
+; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13
+; TONGA-NEXT: v_max_i32_e32 v0, v0, v9
+; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16
+; TONGA-NEXT: v_max_i32_e32 v1, v1, v12
+; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6
+; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16
+; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5
+; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13
+; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10
+; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6
+; TONGA-NEXT: v_max_i32_e32 v6, v6, v15
+; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16
+; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6
+; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12
+; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4
+; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12
+; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15
+; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5
+; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9
+; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5
+; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3]
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7
+; TONGA-NEXT: v_max_i32_e32 v5, v7, v0
+; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4
+; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10
; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT: s_abs_i32 s17, s6
-; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2
+; TONGA-NEXT: v_max_i32_e32 v2, v2, v9
+; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4
; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0
-; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1
-; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_readfirstlane_b32 s3, v4
-; TONGA-NEXT: v_readfirstlane_b32 s4, v5
-; TONGA-NEXT: v_readfirstlane_b32 s5, v6
-; TONGA-NEXT: s_xor_b32 s12, s3, s0
-; TONGA-NEXT: s_xor_b32 s0, s4, s1
-; TONGA-NEXT: s_xor_b32 s1, s5, s2
-; TONGA-NEXT: s_sub_i32 s2, 0, s13
-; TONGA-NEXT: s_ashr_i32 s18, s0, 31
-; TONGA-NEXT: s_sub_i32 s0, 0, s14
-; TONGA-NEXT: s_ashr_i32 s19, s1, 31
-; TONGA-NEXT: s_sub_i32 s1, 0, s15
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0
-; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1
-; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2
-; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3
-; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4
-; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5
-; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6
-; TONGA-NEXT: s_sub_i32 s20, 0, s17
-; TONGA-NEXT: v_readfirstlane_b32 s7, v7
-; TONGA-NEXT: s_abs_i32 s3, s3
-; TONGA-NEXT: s_abs_i32 s4, s4
-; TONGA-NEXT: s_abs_i32 s5, s5
-; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3
-; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
-; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0
-; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1
-; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2
-; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7
-; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13
-; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14
-; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15
-; TONGA-NEXT: s_abs_i32 s16, s7
-; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4
-; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6
-; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0
-; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2
-; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4
-; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6
-; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8
-; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4
-; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6
-; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3]
-; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8
-; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0
-; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3]
-; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1
-; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5]
-; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4
-; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17
-; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5
-; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7
-; TONGA-NEXT: s_ashr_i32 s12, s12, 31
-; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0
-; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1
-; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2
-; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0
-; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1
-; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT: s_xor_b32 s0, s7, s6
-; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4
-; TONGA-NEXT: s_ashr_i32 s0, s0, 31
-; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3
-; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0
+; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1]
+; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6
+; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1
+; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8
+; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3]
+; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6
+; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6
+; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11
+; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1]
+; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc
+; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3
+; TONGA-NEXT: v_max_i32_e32 v6, v3, v6
+; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4
+; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14
+; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14
+; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5
+; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8
+; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5
+; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3
+; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3
+; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v4i32:
@@ -2006,7 +1994,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_mul_lo_u32 v1, v3, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GCN-NEXT: v_sub_i32_e32 v1, vcc, v5, v1
-; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v2, v1
+; GCN-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2014,7 +2002,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GCN-NEXT: v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -2053,7 +2041,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_mul_lo_u32 v1, v3, v2
; TONGA-NEXT: v_add_u32_e32 v4, vcc, 1, v3
; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v5, v1
-; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, v2, v1
+; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v2
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -2061,7 +2049,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i
; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0
-; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25
; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll
index dbcb4b75e7818..002efac8039d5 100644
--- a/llvm/test/CodeGen/AMDGPU/sminmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll
@@ -21,7 +21,7 @@ define amdgpu_kernel void @s_abs_i32(ptr addrspace(1) %out, i32 %val) nounwind {
; SIVI: v_sub_{{i|u}}32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
; GFX9: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[SRC]], [[NEG]]
+; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG]], [[SRC]]
; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc
; GFX9: v_add_u32_e32 v{{[0-9]+}}, 2
@@ -42,7 +42,7 @@ define amdgpu_kernel void @v_abs_i32(ptr addrspace(1) %out, ptr addrspace(1) %sr
; GCN-LABEL: {{^}}v_abs_i32_repeat_user:
; SIVI: v_sub_{{i|u}}32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
; GFX9: v_sub_u32_e32 [[NEG:v[0-9]+]], 0, [[SRC:v[0-9]+]]
-; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[SRC]], [[NEG]]
+; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
; GCN: v_mul_lo_u32 v{{[0-9]+}}, [[MAX]], [[MAX]]
define amdgpu_kernel void @v_abs_i32_repeat_user(ptr addrspace(1) %out, ptr addrspace(1) %src) nounwind {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -84,8 +84,8 @@ define amdgpu_kernel void @s_abs_v2i32(ptr addrspace(1) %out, <2 x i32> %val) no
; GFX9-DAG: v_sub_u32_e32 [[NEG0:v[0-9]+]], 0, [[SRC0:v[0-9]+]]
; GFX9-DAG: v_sub_u32_e32 [[NEG1:v[0-9]+]], 0, [[SRC1:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc
; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc
@@ -156,10 +156,10 @@ define amdgpu_kernel void @s_abs_v4i32(ptr addrspace(1) %out, <4 x i32> %val) no
; GFX9-DAG: v_sub_u32_e32 [[NEG2:v[0-9]+]], 0, [[SRC2:v[0-9]+]]
; GFX9-DAG: v_sub_u32_e32 [[NEG3:v[0-9]+]], 0, [[SRC3:v[0-9]+]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC0]], [[NEG0]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC1]], [[NEG1]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC2]], [[NEG2]]
-; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[SRC3]], [[NEG3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc,
; SIVI: v_add_{{i|u}}32_e32 v{{[0-9]+}}, vcc,
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index 5944342b2642a..bbd179364374c 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -467,28 +467,28 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-NEXT: v_readfirstlane_b32 s2, v2
; GCN-NEXT: s_abs_i32 s2, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_sub_i32 s6, 0, s2
-; GCN-NEXT: s_ashr_i32 s5, s3, 31
+; GCN-NEXT: s_ashr_i32 s5, s4, 31
; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GCN-NEXT: s_abs_i32 s3, s3
-; GCN-NEXT: v_readfirstlane_b32 s4, v3
+; GCN-NEXT: s_abs_i32 s4, s4
+; GCN-NEXT: v_readfirstlane_b32 s3, v3
; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_readfirstlane_b32 s7, v0
; GCN-NEXT: s_mul_i32 s6, s6, s7
; GCN-NEXT: s_mul_hi_u32 s6, s7, s6
; GCN-NEXT: s_add_i32 s7, s7, s6
-; GCN-NEXT: s_mul_hi_u32 s6, s3, s7
+; GCN-NEXT: s_mul_hi_u32 s6, s4, s7
; GCN-NEXT: s_mul_i32 s6, s6, s2
-; GCN-NEXT: s_sub_i32 s3, s3, s6
-; GCN-NEXT: s_sub_i32 s6, s3, s2
-; GCN-NEXT: s_cmp_ge_u32 s3, s2
-; GCN-NEXT: s_cselect_b32 s3, s6, s3
-; GCN-NEXT: s_sub_i32 s6, s3, s2
-; GCN-NEXT: s_cmp_ge_u32 s3, s2
-; GCN-NEXT: s_cselect_b32 s2, s6, s3
-; GCN-NEXT: s_abs_i32 s3, s4
+; GCN-NEXT: s_sub_i32 s4, s4, s6
+; GCN-NEXT: s_sub_i32 s6, s4, s2
+; GCN-NEXT: s_cmp_ge_u32 s4, s2
+; GCN-NEXT: s_cselect_b32 s4, s6, s4
+; GCN-NEXT: s_sub_i32 s6, s4, s2
+; GCN-NEXT: s_cmp_ge_u32 s4, s2
+; GCN-NEXT: s_cselect_b32 s2, s6, s4
+; GCN-NEXT: s_abs_i32 s3, s3
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_xor_b32 s2, s2, s5
; GCN-NEXT: s_sub_i32 s7, 0, s3
More information about the llvm-commits
mailing list