[llvm] [CodeGen] MachineVerifier to check early-clobber constraint (PR #151421)
Abhay Kanhere via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 7 13:52:30 PDT 2025
https://github.com/AbhayKanhere updated https://github.com/llvm/llvm-project/pull/151421
>From ec793aaafc6973291b3763c35e11004e2bd261e8 Mon Sep 17 00:00:00 2001
From: Abhay Kanhere <abhay at kanhere.net>
Date: Wed, 30 Jul 2025 16:23:35 -0700
Subject: [PATCH 1/4] [CodeGen] MachineVerifier to check early-clobber
constraint
Currently, MachineVerifier does not verify the early-clobber operand
constraint. The only other machine operand constraint, TiedTo, is
already verified.
---
llvm/lib/CodeGen/MachineVerifier.cpp | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 01703fe09b79a..ebef1c9034f4a 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2325,6 +2325,13 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
report("Missing mayStore flag", MI);
}
+ // Verify earlyClobber def operand
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ if (!MI->getOperand(0).isReg())
+ report("Early clobber must be a register", MI);
+ if (!MI->getOperand(0).isEarlyClobber())
+ report("Missing earlyClobber flag", MI);
+ }
// Debug values must not have a slot index.
// Other instructions must have one, unless they are inside a bundle.
if (LiveInts) {
>From f70f3baf0010fc22680593945b75cb2e88c8c903 Mon Sep 17 00:00:00 2001
From: Abhay Kanhere <abhay at kanhere.net>
Date: Mon, 4 Aug 2025 10:35:16 -0700
Subject: [PATCH 2/4] Fix the test failure caused by this PR by updating AMDGPU
code that did not set the early-clobber flag
---
.../AMDGPU/AMDGPUInstructionSelector.cpp | 14 +-
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 4 +
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 10 +-
.../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 16 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 902 ++---
.../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 2638 +++++++-------
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 1050 +++---
.../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 3164 ++++++++---------
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 926 ++---
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 123 +-
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 266 +-
.../CodeGen/AMDGPU/integer-mad-patterns.ll | 518 +--
.../AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll | 74 +-
llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 466 ++-
14 files changed, 5118 insertions(+), 5053 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b0d3b12471a38..5f3fe38ae011b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -116,8 +116,13 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
if (!DstRC || DstRC != SrcRC)
return false;
- return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
- RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+ auto result = RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
+ RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
+ return result;
}
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
@@ -593,6 +598,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
I.setDesc(TII.get(Opc));
I.addOperand(*MF, MachineOperand::CreateImm(0));
I.addImplicitDefUseOperands(*MF);
+ I.getOperand(0).setIsEarlyClobber(true);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
@@ -3795,6 +3801,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
MI.removeOperand(1); // Intrinsic ID
MI.addOperand(VDst_In); // Readd VDst_In to the end
MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI.getOperand(0).setIsEarlyClobber(true);
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index b77da4d612dd4..f0d9aa2d0a112 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -687,6 +687,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
if (!TII->isOperandLegal(*MI, OpNo, &New))
return false;
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
+ MI->getOperand(0).setIsEarlyClobber(true);
+ }
Old.ChangeToImmediate(*ImmVal);
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index e411c23c77bbe..7b5621ff3b5a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: v_mov_b32_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
%tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, v1
+; GCN-NEXT: v_mov_b32_e32 v2, v0
+; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
%tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1cd9c0bfeb7e6..2351c969d5e49 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v3, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v3, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3daae98961bff..59824917592fa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, v0
-; GCN-NEXT: v_mov_b32_e32 v5, v1
-; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GCN-NEXT: v_mov_b32_e32 v5, v2
+; GCN-NEXT: v_mov_b32_e32 v6, v1
+; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
@@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
-; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
-; GCN-NEXT: v_mov_b32_e32 v2, v8
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
-; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9]
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2]
+; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
@@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
; GFX10-NEXT: v_mov_b32_e32 v2, v8
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
@@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
; GFX11-NEXT: v_mov_b32_e32 v2, v9
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i96:
@@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
; GFX12-NEXT: v_mov_b32_e32 v2, v8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i96:
@@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX7-NEXT: v_mov_b32_e32 v10, v2
+; GFX7-NEXT: v_mov_b32_e32 v11, v3
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v12, v4
+; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX7-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX7-NEXT: v_mov_b32_e32 v2, v11
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX7-NEXT: v_mov_b32_e32 v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i128:
@@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX8-NEXT: v_mov_b32_e32 v10, v2
+; GFX8-NEXT: v_mov_b32_e32 v11, v3
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v12, v4
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX8-NEXT: v_mov_b32_e32 v2, v11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i128:
@@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
; GFX9-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v12, v4
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3]
; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6
-; GFX9-NEXT: v_mov_b32_e32 v2, v11
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
-; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
+; GFX9-NEXT: v_mov_b32_e32 v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
@@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
-; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v2, v11
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
+; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12]
+; GFX10-NEXT: v_mov_b32_e32 v2, v13
+; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7]
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6]
@@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4
; GFX11-NEXT: v_mov_b32_e32 v12, v3
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
-; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6
-; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7
+; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7
+; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v2, v11
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
+; GFX12-NEXT: v_mov_b32_e32 v2, v13
+; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
+; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v14, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
@@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX7-NEXT: v_mov_b32_e32 v22, v18
-; GFX7-NEXT: v_mov_b32_e32 v18, v19
-; GFX7-NEXT: v_mov_b32_e32 v19, v16
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc
+; GFX7-NEXT: v_mov_b32_e32 v21, v22
+; GFX7-NEXT: v_mov_b32_e32 v22, v23
+; GFX7-NEXT: v_mov_b32_e32 v23, v18
+; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX7-NEXT: v_mov_b32_e32 v20, v23
+; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX7-NEXT: v_mov_b32_e32 v21, v20
-; GFX7-NEXT: v_mov_b32_e32 v20, v11
-; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX7-NEXT: v_mov_b32_e32 v12, v22
+; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc
+; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-NEXT: v_mov_b32_e32 v1, v13
+; GFX7-NEXT: v_mov_b32_e32 v2, v14
+; GFX7-NEXT: v_mov_b32_e32 v7, v11
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i256:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc
-; GFX8-NEXT: v_mov_b32_e32 v22, v18
-; GFX8-NEXT: v_mov_b32_e32 v18, v19
-; GFX8-NEXT: v_mov_b32_e32 v19, v16
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc
+; GFX8-NEXT: v_mov_b32_e32 v21, v22
+; GFX8-NEXT: v_mov_b32_e32 v22, v23
+; GFX8-NEXT: v_mov_b32_e32 v23, v18
+; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX8-NEXT: v_mov_b32_e32 v20, v23
+; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX8-NEXT: v_mov_b32_e32 v21, v20
-; GFX8-NEXT: v_mov_b32_e32 v20, v11
-; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX8-NEXT: v_mov_b32_e32 v12, v22
+; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-NEXT: v_mov_b32_e32 v1, v13
+; GFX8-NEXT: v_mov_b32_e32 v2, v14
+; GFX8-NEXT: v_mov_b32_e32 v7, v11
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i256:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12
+; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21]
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17]
-; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
-; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc
-; GFX9-NEXT: v_mov_b32_e32 v22, v18
-; GFX9-NEXT: v_mov_b32_e32 v18, v19
-; GFX9-NEXT: v_mov_b32_e32 v19, v16
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19]
+; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23]
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19]
+; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21]
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc
+; GFX9-NEXT: v_mov_b32_e32 v21, v22
+; GFX9-NEXT: v_mov_b32_e32 v22, v23
+; GFX9-NEXT: v_mov_b32_e32 v23, v18
+; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23]
+; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19]
-; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11
-; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12
-; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22]
-; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10
-; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13
-; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12]
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25]
+; GFX9-NEXT: v_mov_b32_e32 v20, v23
+; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11
+; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24]
+; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12]
; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
-; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19]
-; GFX9-NEXT: v_mov_b32_e32 v21, v20
-; GFX9-NEXT: v_mov_b32_e32 v20, v11
-; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19]
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11]
+; GFX9-NEXT: v_mov_b32_e32 v12, v22
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13
+; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12]
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11]
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17]
+; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11]
; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21]
-; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17]
+; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23]
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11]
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11]
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7]
-; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5]
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7]
+; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-NEXT: v_mov_b32_e32 v2, v14
+; GFX9-NEXT: v_mov_b32_e32 v7, v11
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
@@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v16, v0
; GFX10-NEXT: v_mov_b32_e32 v17, v1
-; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10
+; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11
+; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0
-; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
-; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
-; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
-; GFX10-NEXT: v_mov_b32_e32 v20, v22
-; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
-; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
-; GFX10-NEXT: v_mov_b32_e32 v20, v18
+; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1]
+; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21]
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v22, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19]
+; GFX10-NEXT: v_mov_b32_e32 v18, v23
+; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
+; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX10-NEXT: v_mov_b32_e32 v19, v24
+; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1]
+; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v19, v22
-; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
-; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
+; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15
+; GFX10-NEXT: v_mov_b32_e32 v18, v21
+; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0
-; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
-; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12
+; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22]
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13
-; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
; GFX10-NEXT: v_mov_b32_e32 v13, v1
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
-; GFX10-NEXT: v_mov_b32_e32 v14, v21
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
-; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
-; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
-; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
-; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
-; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6
+; GFX10-NEXT: v_mov_b32_e32 v14, v20
+; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19]
+; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6
+; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22]
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8
+; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8
+; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11]
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5
+; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo
+; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4
; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7
; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0
+; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15
; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14
-; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8]
-; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v20, v8
-; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0
-; GFX11-NEXT: v_mov_b32_e32 v21, v22
-; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9
-; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12
+; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21]
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0
+; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1]
; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
-; GFX11-NEXT: v_mov_b32_e32 v6, v25
-; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15
-; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7]
+; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21]
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8]
+; GFX11-NEXT: v_mov_b32_e32 v7, v23
+; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0
+; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX11-NEXT: v_mov_b32_e32 v8, v24
+; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8]
+; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21
+; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2
-; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2
-; GFX11-NEXT: v_mov_b32_e32 v11, v1
-; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
-; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12
-; GFX11-NEXT: v_mov_b32_e32 v12, v24
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2
-; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
-; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
-; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4
-; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
-; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22]
+; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15]
+; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2
+; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7]
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2
+; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22]
+; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s1
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
-; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9
-; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11
+; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
-; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
+; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21]
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
-; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
-; GFX12-NEXT: v_mov_b32_e32 v20, v22
+; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21]
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_mov_b32_e32 v18, v23
+; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0
+; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9
+; GFX12-NEXT: v_mov_b32_e32 v19, v24
+; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10
+; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1]
+; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19]
; GFX12-NEXT: v_mov_b32_e32 v19, v22
-; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15
+; GFX12-NEXT: v_mov_b32_e32 v18, v21
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1]
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
-; GFX12-NEXT: v_mov_b32_e32 v20, v18
-; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
-; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11
-; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
-; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13
+; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22]
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
-; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
+; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
+; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s2, v2, v9, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
-; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
-; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2
+; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
+; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4
-; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
-; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2]
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11]
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
@@ -2948,52 +2967,52 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
-; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX8-NEXT: flat_load_dword v4, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: global_load_dword v2, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x50
+; GFX9-NEXT: global_load_dword v4, v[2:3], off
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: global_load_dword v2, v[2:3], off
+; GFX10-NEXT: global_load_dword v4, v[2:3], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_b32 v2, v[2:3], off
+; GFX11-NEXT: global_load_b32 v4, v[2:3], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
@@ -3129,33 +3148,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x50
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
-; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
+; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
+; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_dword v4, v[2:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x50
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v3, v4
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dword v4, v[2:3], off
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x50
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
-; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v3, v4
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
;
@@ -3182,9 +3204,9 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
;
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: global_load_b32 v2, v[2:3], off
+; GFX12-NEXT: global_load_b32 v4, v[2:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
+; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX12-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 4031fe0be2823..1a6bf974ff875 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3
; CHECK-NEXT: v_trunc_f32_e32 v8, v6
; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8
-; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7
-; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6
+; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0
; CHECK-NEXT: v_mov_b32_e32 v3, v7
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10
-; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc
-; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10
-; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6
-; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7
-; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10
-; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6
+; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7
+; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7
+; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3
-; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -223,65 +223,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1
; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0
-; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1
+; CHECK-NEXT: v_mov_b32_e32 v7, s13
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -294,39 +294,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v5, s13
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s11
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v1, s11
+; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1]
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: s_mov_b32 s0, 0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
@@ -384,263 +384,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6]
; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v9, v14
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_mov_b32_e32 v5, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9
+; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
+; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12
+; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v12, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13]
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
+; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v15, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7
; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10
+; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12
+; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4
; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc
; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10
; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11
+; GISEL-NEXT: v_xor_b32_e32 v17, v2, v15
; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0
+; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13
+; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4]
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v17, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -669,100 +669,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v3, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1
-; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3
; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; CGP-NEXT: v_trunc_f32_e32 v5, v4
; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3
-; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v18, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13]
+; CGP-NEXT: v_mul_lo_u32 v5, v17, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v14, v3
+; CGP-NEXT: v_mul_lo_u32 v13, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v3, v17, v3
+; CGP-NEXT: v_mul_lo_u32 v18, v17, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v14, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v16, v12, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5]
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
-; CGP-NEXT: v_mul_lo_u32 v5, v15, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v3, v15, v3
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v13
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13]
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v15
+; CGP-NEXT: v_mul_lo_u32 v5, v17, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v14, v4
+; CGP-NEXT: v_xor_b32_e32 v13, v10, v15
+; CGP-NEXT: v_mul_hi_u32 v10, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v17, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v15, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_mul_hi_u32 v14, v12, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v17, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v10, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v11, v4
-; CGP-NEXT: v_mul_hi_u32 v14, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v10, v3
-; CGP-NEXT: v_mul_hi_u32 v15, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_mul_hi_u32 v12, v11, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v11, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v11, v13, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v11, v13, v4
; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5]
-; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc
-; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
@@ -773,13 +773,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5]
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10
; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
@@ -787,8 +787,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v3, v13, v0
-; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v3, v15, v0
+; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v3
; CGP-NEXT: v_xor_b32_e32 v1, v2, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -842,126 +842,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v5, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CGP-NEXT: v_trunc_f32_e32 v7, v6
; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v10, v6
-; CGP-NEXT: v_mul_lo_u32 v16, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7]
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11]
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v5
+; CGP-NEXT: v_mul_lo_u32 v11, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v16, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v12, v6
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_mul_hi_u32 v6, v15, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, v13, v6, vcc
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v7, v11
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v11
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5
+; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11]
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v9, v12, v6
+; CGP-NEXT: v_xor_b32_e32 v14, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v14, v13, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CGP-NEXT: v_mul_hi_u32 v12, v10, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v15, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v6, v15, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v8, v5
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v6
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT: v_mul_hi_u32 v13, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v8, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_mul_hi_u32 v10, v9, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v14, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v11, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v14, v6
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v14, v6
; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v11, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v13, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -1051,82 +1051,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1135,40 +1135,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2]
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %num, 1235195
ret i64 %result
@@ -1194,116 +1194,116 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; GISEL-NEXT: v_mov_b32_e32 v9, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v9, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v9, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v17, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v16, v17, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v17, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v18, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc
+; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v16
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[13:14]
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v17
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v0
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1320,72 +1320,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v16, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v7, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v18, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v7, v2, v13
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v8, v3, v13
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v3
+; GISEL-NEXT: v_mul_lo_u32 v6, v7, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v9
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v7, v3
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v8, v3
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -1393,24 +1393,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
@@ -1429,112 +1429,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v16, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0
-; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc
-; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc
+; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v17, v13
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc
+; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc
+; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -1552,72 +1552,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -1625,24 +1625,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -1678,126 +1678,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2
; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5
; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
; CHECK-NEXT: v_trunc_f32_e32 v7, v6
; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
+; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5
+; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7]
+; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc
+; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12
+; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5
+; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9
+; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12
+; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3
-; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4
-; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3
-; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
+; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4
+; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4
+; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4
+; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5]
-; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc
-; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6]
+; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc
+; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8
-; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
@@ -1849,8 +1849,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc
; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10
@@ -1858,182 +1858,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
; GISEL-NEXT: v_trunc_f32_e32 v13, v11
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
-; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
+; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7
-; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v19, v14
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0
; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8]
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7
-; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11
-; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12
-; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14
+; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7
+; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0
+; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1
+; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
+; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2]
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6
; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13]
; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12
-; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13
+; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc
+; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v16, v11
-; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16
+; GISEL-NEXT: v_trunc_f32_e32 v15, v11
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15
; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1
; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10
; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
; GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13]
-; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12
+; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7]
+; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11
; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5
; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5]
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
+; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc
; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1]
-; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc
+; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15
+; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11
; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12
+; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15
; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1
; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
@@ -2041,25 +2042,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc
+; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc
; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1
; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
@@ -2073,39 +2074,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v3
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11
; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
-; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6
; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
@@ -2137,126 +2137,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v10, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11
; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10
; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10
; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10
; CGP-NEXT: v_trunc_f32_e32 v12, v11
; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_lo_u32 v12, v16, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v18, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v19, v16, v11
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17
-; CGP-NEXT: v_mul_hi_u32 v17, v13, v11
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
+; CGP-NEXT: v_mul_lo_u32 v11, v19, v10
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; CGP-NEXT: v_mul_hi_u32 v12, v16, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_mul_lo_u32 v13, v16, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v19, v14
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v16, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v8, v14
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v15, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v17, v9, v14
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v19, v14
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12]
+; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13]
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc
+; CGP-NEXT: v_xor_b32_e32 v15, v8, v17
+; CGP-NEXT: v_mul_lo_u32 v8, v19, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v16, v14
+; CGP-NEXT: v_xor_b32_e32 v18, v9, v17
+; CGP-NEXT: v_mul_hi_u32 v9, v16, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v19, v10
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v11
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v9, v19, v14
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v16, v14
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v19, v14
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v17, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v12, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v17, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v17, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v18, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v9
+; CGP-NEXT: v_mul_hi_u32 v12, v15, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v18, v8
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v9
+; CGP-NEXT: v_mul_lo_u32 v12, v18, v9
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v9
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10
+; CGP-NEXT: v_mul_hi_u32 v12, v18, v9
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10]
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10]
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11]
+; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc
+; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1
-; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1
+; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1
+; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v8, v14, v0
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v8, v17, v0
+; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v0, v1, v8
; CGP-NEXT: v_xor_b32_e32 v1, v4, v8
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
@@ -2312,128 +2312,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v6, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8
; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6
; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
; CGP-NEXT: v_trunc_f32_e32 v10, v8
; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v15, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v16, v14, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
+; CGP-NEXT: v_mul_lo_u32 v6, v16, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
+; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v16, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v14, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6
+; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v6, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v10, v5, v12
-; CGP-NEXT: v_mul_lo_u32 v5, v14, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v6, v12
-; CGP-NEXT: v_mul_hi_u32 v6, v11, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
+; CGP-NEXT: v_mul_lo_u32 v5, v16, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
+; CGP-NEXT: v_xor_b32_e32 v15, v6, v14
+; CGP-NEXT: v_mul_hi_u32 v6, v13, v8
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v8
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v14, v9
+; CGP-NEXT: v_mul_lo_u32 v6, v16, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v11, v9
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v14, v9
+; CGP-NEXT: v_mul_hi_u32 v8, v16, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v5
-; CGP-NEXT: v_mul_lo_u32 v8, v10, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v10, v5
-; CGP-NEXT: v_mul_hi_u32 v5, v13, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v13, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v15, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v15, v5
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v6
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v6
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_mul_hi_u32 v8, v10, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v15, v6
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v8, v[6:7]
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7]
-; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7]
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8]
+; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
-; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9
-; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v5, v12, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v5, v14, v2
+; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
; CGP-NEXT: v_xor_b32_e32 v2, v3, v5
; CGP-NEXT: v_xor_b32_e32 v3, v4, v5
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
@@ -2503,15 +2503,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
@@ -2537,204 +2537,204 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, 0, v1
; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v5
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3
; GISEL-NEXT: v_trunc_f32_e32 v7, v4
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v14, v3
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v14, v3
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v9
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v3
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v14, v9
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v3
-; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3
-; GISEL-NEXT: v_mul_hi_u32 v3, v12, v3
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v3
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v14, v3
+; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v0
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8]
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v3
+; GISEL-NEXT: v_mul_hi_u32 v3, v14, v3
+; GISEL-NEXT: v_mul_lo_u32 v7, v11, v9
+; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
+; GISEL-NEXT: v_mul_lo_u32 v4, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT: v_mul_hi_u32 v7, v11, v9
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v14, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v12, v4
; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_mul_lo_u32 v8, 0, v4
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v4
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v0, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v3
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v4, v[7:8]
-; GISEL-NEXT: v_mac_f32_e32 v9, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v9
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v0, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v6
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[7:8]
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], 0, v0, v[8:9]
+; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v7
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v12, v6
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v10, vcc
; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], 0, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v7
; GISEL-NEXT: v_trunc_f32_e32 v7, v5
; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 0, v10
+; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[2:3]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v16, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v16, v[2:3]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v9, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v15, v12, v[6:7]
+; GISEL-NEXT: v_mul_lo_u32 v6, v16, v5
; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v10, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v15, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v12, v6
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v8, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT: v_mul_hi_u32 v2, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8
+; GISEL-NEXT: v_mul_hi_u32 v10, v12, v5
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v11, v1
+; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v10, v16, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v16, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v12, v2
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v5
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v16, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v8, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1
; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v13, v10, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v14, v7, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v10, v5
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v1
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v1
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v8, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v8, v1
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1
; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v7, v1
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v12, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v1
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11
; GISEL-NEXT: v_mul_hi_u32 v1, v10, v1
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v7
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], v10, v1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v13, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v13, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v1
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v1
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v5, v6
; GISEL-NEXT: v_mul_hi_u32 v1, 0, v1
; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v7
; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[1:2]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v11, v[1:2]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7]
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v10, v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
@@ -2743,7 +2743,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
@@ -2755,7 +2755,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_24bit:
@@ -2775,27 +2775,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3
; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3
-; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3
+; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0
+; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7
-; CGP-NEXT: v_mul_lo_u32 v5, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v2, v4
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 1441591a5fcce..f4489c2239fda 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
-; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0
+; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
-; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
+; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
@@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s2, s17, 31
; GFX9-NEXT: s_ashr_i32 s4, s19, 31
@@ -335,64 +336,63 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3]
+; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
-; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0
+; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4
+; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
-; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
-; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3
+; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6
-; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2
-; GFX9-NEXT: v_mov_b32_e32 v6, s2
+; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4
+; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15]
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc
+; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
@@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX8-NEXT: s_ashr_i32 s6, s19, 31
; GFX8-NEXT: s_mov_b32 s7, s6
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: s_ashr_i32 s10, s3, 31
-; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1]
-; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc
+; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1
-; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3
+; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1]
; GFX8-NEXT: s_add_u32 s0, s18, s6
; GFX8-NEXT: s_addc_u32 s1, s19, s6
; GFX8-NEXT: s_add_u32 s2, s2, s10
@@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: s_addc_u32 s3, s3, s10
; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4
; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc
; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
@@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0
; GFX8-NEXT: s_sub_u32 s5, 0, s2
-; GFX8-NEXT: s_subb_u32 s20, 0, s3
-; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2]
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2]
+; GFX8-NEXT: s_subb_u32 s20, 0, s3
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0
-; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
-; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0
+; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc
+; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6
; GFX8-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s17
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1]
+; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7
+; GFX8-NEXT: v_mov_b32_e32 v6, s17
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
-; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc
-; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2
-; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3
-; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2
-; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2
-; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11
+; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4]
+; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9
+; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5
+; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2
+; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2
+; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9
-; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2
-; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3
-; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc
-; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2
+; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7
+; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7
; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6
-; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
+; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6
-; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4]
-; GFX8-NEXT: v_mov_b32_e32 v10, s9
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3
+; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v12, s9
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2
-; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7]
+; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc
-; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2
-; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2
+; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
-; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9
; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2
-; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
-; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9
-; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6
-; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc
+; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9
+; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s6
+; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7
+; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
@@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7]
; GFX9-NEXT: s_ashr_i32 s6, s19, 31
; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_ashr_i32 s10, s3, 31
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc
+; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6
-; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1]
-; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1]
+; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2
+; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1]
; GFX9-NEXT: s_add_u32 s0, s18, s6
; GFX9-NEXT: s_addc_u32 s1, s19, s6
; GFX9-NEXT: s_add_u32 s2, s2, s10
; GFX9-NEXT: s_mov_b32 s11, s10
; GFX9-NEXT: s_addc_u32 s3, s3, s10
; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2
-; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc
-; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
+; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc
+; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14
; GFX9-NEXT: v_add_f32_e32 v1, v1, v15
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
@@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0
; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX9-NEXT: s_sub_u32 s5, 0, s2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc
-; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc
+; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc
; GFX9-NEXT: s_subb_u32 s20, 0, s3
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc
-; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3]
+; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4
; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1
+; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4
+; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, v17, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
@@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0
-; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc
+; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1]
-; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v9, s17
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5
-; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7
-; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3
-; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1]
+; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5
+; GFX9-NEXT: v_mov_b32_e32 v12, s17
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4]
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1
+; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2
+; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc
; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5
; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3
+; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
+; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT: v_add_u32_e32 v6, v8, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2
-; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3
+; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3
+; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9
; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2
; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2
-; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3
-; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6
+; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3
+; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0
-; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v6, v9, v6
+; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7
+; GFX9-NEXT: v_mov_b32_e32 v8, s4
; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc
-; GFX9-NEXT: v_add_u32_e32 v6, v9, v7
-; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4]
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12
+; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4]
+; GFX9-NEXT: v_mov_b32_e32 v12, s9
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2
-; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7]
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v14
; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9
; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11]
-; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6
+; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7
; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8
-; GFX9-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-NEXT: v_mov_b32_e32 v7, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
-; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9
+; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9
; GFX9-NEXT: v_mov_b32_e32 v13, 0
-; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7
-; GFX9-NEXT: v_mov_b32_e32 v8, s6
-; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6
-; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc
+; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6
+; GFX9-NEXT: v_mov_b32_e32 v9, s6
+; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc
; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13]
; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15]
; GFX9-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 1a10f5fb7a5ce..a914f89e4a0b7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v6, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v6
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8
+; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11
+; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; CHECK-NEXT: ; implicit-def: $vgpr2
; CHECK-NEXT: ; implicit-def: $vgpr4
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -217,109 +217,109 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; CHECK-NEXT: v_trunc_f32_e32 v2, v1
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4
+; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2]
+; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3]
+; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0
+; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2]
-; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2]
-; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0
; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1
-; CHECK-NEXT: v_mul_hi_u32 v5, s10, v0
+; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0
; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0
-; CHECK-NEXT: v_mul_hi_u32 v6, s11, v1
+; CHECK-NEXT: v_mov_b32_e32 v6, s9
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v5, s11, v1
+; CHECK-NEXT: v_mul_lo_u32 v4, s11, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, s10, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v0, v2
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v7, 0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v3, s11
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2]
+; CHECK-NEXT: v_mov_b32_e32 v1, s11
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
-; CHECK-NEXT: v_mov_b32_e32 v4, s9
-; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v3, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1
-; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v7, v[2:3]
+; CHECK-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v4, vcc
+; CHECK-NEXT: v_sub_i32_e64 v2, s[0:1], s11, v4
+; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v3, v5, s[0:1]
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v1
+; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v1, v3, v4, s[0:1]
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s9, v1
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s9, v2
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s8, v3
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; CHECK-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_xor_b32_e32 v0, s6, v0
; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0
; CHECK-NEXT: s_branch .LBB1_3
@@ -374,84 +374,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v11, v9
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9
-; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11]
+; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v10
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9
+; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12
+; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0
; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1
; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
@@ -459,148 +459,148 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10
-; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9
; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2]
+; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
+; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2]
; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10]
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10]
; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
-; GISEL-NEXT: v_trunc_f32_e32 v12, v10
+; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v12, v9
; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12
-; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_mov_b32_e32 v0, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1]
-; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1]
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1]
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9
+; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7]
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5
+; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8
+; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5
; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2]
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9]
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT: v_mul_hi_u32 v9, v15, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -653,128 +653,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v1, v2, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CGP-NEXT: v_trunc_f32_e32 v4, v3
; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v16, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v17, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v3
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2
+; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4]
+; CGP-NEXT: v_mul_lo_u32 v3, v17, v2
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v12
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11
-; CGP-NEXT: v_mul_hi_u32 v15, v5, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4]
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12
-; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v4, v12
-; CGP-NEXT: v_mul_lo_u32 v4, v14, v2
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v14, v2
-; CGP-NEXT: v_xor_b32_e32 v10, v10, v12
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v14, v3
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v5, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mul_hi_u32 v3, v14, v3
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v12
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2
+; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4]
+; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15
+; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5]
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v3, v15
+; CGP-NEXT: v_mul_lo_u32 v3, v17, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v14, v12
+; CGP-NEXT: v_xor_b32_e32 v16, v4, v15
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v17, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc
-; CGP-NEXT: v_mul_lo_u32 v4, v10, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v11, v3
-; CGP-NEXT: v_mul_hi_u32 v13, v11, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v10, v2
-; CGP-NEXT: v_mul_hi_u32 v14, v10, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v17, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_mul_hi_u32 v5, v14, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v17, v12
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v16, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
+; CGP-NEXT: v_mul_hi_u32 v10, v13, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v16, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v16, v3
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4
+; CGP-NEXT: v_mul_hi_u32 v10, v16, v3
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4]
-; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5]
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0
-; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v12
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v12
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v15
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v15
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc
; CGP-NEXT: ; implicit-def: $vgpr4
; CGP-NEXT: ; implicit-def: $vgpr10
; CGP-NEXT: .LBB2_2: ; %Flow1
@@ -822,128 +822,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
-; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v5
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v15, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v5
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6]
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v10
+; CGP-NEXT: v_mul_lo_u32 v11, v15, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4
-; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9
-; CGP-NEXT: v_mul_hi_u32 v13, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v9, v6, v10
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
-; CGP-NEXT: v_xor_b32_e32 v8, v8, v10
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v12, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT: v_mul_hi_u32 v5, v12, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v10
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4
+; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6]
+; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7]
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v11, v5, v13
+; CGP-NEXT: v_mul_lo_u32 v5, v15, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v12, v10
+; CGP-NEXT: v_xor_b32_e32 v14, v6, v13
+; CGP-NEXT: v_mul_hi_u32 v6, v12, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v5
-; CGP-NEXT: v_mul_hi_u32 v11, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v15, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v7, v12, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v7, v15, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc
+; CGP-NEXT: v_mul_lo_u32 v6, v14, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v11, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v11, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v14, v4
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v8, v14, v5
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v7, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v7, v11, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v14, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v13
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v13
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; CGP-NEXT: ; implicit-def: $vgpr6
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -979,82 +979,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1062,39 +1062,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, vcc, v5, v2, s[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e64 v2, s[4:5], 0, v2, s[4:5]
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x1000, v4
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x1000, v4
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 4096
ret i64 %result
@@ -1120,114 +1120,114 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; GISEL-NEXT: v_mov_b32_e32 v9, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v9, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v9, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v17, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v16, v17, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v17, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1244,72 +1244,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v7, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v7, v2, v13
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v8, v3, v13
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v3
+; GISEL-NEXT: v_mul_lo_u32 v6, v7, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v9
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v7, v3
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v8, v3
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1329,10 +1329,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1351,110 +1351,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -1472,72 +1472,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1557,10 +1557,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -1572,82 +1572,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705
+; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v4, v3
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6
+; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
+; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4]
+; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9
+; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6
+; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9
+; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
+; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
@@ -1655,39 +1655,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2]
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, vcc, v9, v1, s[4:5]
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
+; CHECK-NEXT: v_subb_u32_e64 v1, vcc, v5, v2, s[4:5]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5
-; CHECK-NEXT: v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT: v_subbrev_u32_e64 v2, s[4:5], 0, v2, s[4:5]
+; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; CHECK-NEXT: v_subrev_i32_e32 v7, vcc, 0x12d8fb, v4
-; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_subrev_i32_e32 v6, vcc, 0x12d8fb, v4
+; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = srem i64 %num, 1235195
ret i64 %result
@@ -1713,114 +1713,114 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; GISEL-NEXT: v_mov_b32_e32 v9, v5
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v7, v[9:10]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], s6, v7, v[10:11]
; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v13, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v4, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; GISEL-NEXT: v_mov_b32_e32 v4, v14
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15]
; GISEL-NEXT: s_mov_b32 s6, 1
; GISEL-NEXT: s_cmp_lg_u32 s6, 0
-; GISEL-NEXT: v_mul_lo_u32 v9, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v9, v18, v16
; GISEL-NEXT: s_subb_u32 s6, 0, 0
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v16, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v18, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v17, v13
-; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14
+; GISEL-NEXT: v_mul_hi_u32 v9, v19, v13
+; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v18, v0, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14
-; GISEL-NEXT: v_xor_b32_e32 v19, v1, v9
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16
+; GISEL-NEXT: v_xor_b32_e32 v17, v1, v9
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0
-; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1
-; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v17, v0
+; GISEL-NEXT: v_mul_lo_u32 v14, v15, v1
+; GISEL-NEXT: v_mul_hi_u32 v16, v15, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1
+; GISEL-NEXT: v_mul_lo_u32 v16, v17, v1
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; GISEL-NEXT: v_mul_hi_u32 v18, v17, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v7, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v7, v[13:14]
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v0
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1837,72 +1837,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v11, v9
-; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v7, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc
-; GISEL-NEXT: v_xor_b32_e32 v12, v2, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v8, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v7, v5
-; GISEL-NEXT: v_xor_b32_e32 v13, v3, v11
-; GISEL-NEXT: v_mul_hi_u32 v3, v7, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v15, v14, vcc
+; GISEL-NEXT: v_xor_b32_e32 v12, v7, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v5, v9
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[7:8]
+; GISEL-NEXT: v_ashrrev_i32_e32 v13, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v13, vcc
+; GISEL-NEXT: v_xor_b32_e32 v7, v2, v13
+; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
+; GISEL-NEXT: v_xor_b32_e32 v8, v3, v13
+; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v2
-; GISEL-NEXT: v_xor_b32_e32 v10, v10, v9
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v8, v3
+; GISEL-NEXT: v_mul_lo_u32 v6, v7, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v9
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v7, v3
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v2
-; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3
+; GISEL-NEXT: v_mul_lo_u32 v9, v8, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v8, v3
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v7, v2
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v10, v8, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -1922,10 +1922,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v11
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v11
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v13
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v13
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v13
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -1944,110 +1944,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0
; CGP-NEXT: v_mov_b32_e32 v9, v5
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10]
; CGP-NEXT: v_mul_hi_u32 v12, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10]
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11]
; CGP-NEXT: v_mul_lo_u32 v10, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v7, v9
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v14, v7, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v7, v4
+; CGP-NEXT: v_mul_lo_u32 v4, v7, v13
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v14, v7, v13
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
+; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0
+; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc
; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_mul_lo_u32 v9, v16, v14
+; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v19, v13
+; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15]
+; CGP-NEXT: v_mul_lo_u32 v9, v18, v16
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT: v_mul_hi_u32 v9, v16, v13
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v18, v13
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_hi_u32 v9, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v16, v14
+; CGP-NEXT: v_mul_hi_u32 v9, v19, v13
+; CGP-NEXT: v_mul_lo_u32 v13, v19, v16
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v14, v18, v16
; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9
; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v18, v0, v9
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v17, v14
-; CGP-NEXT: v_xor_b32_e32 v19, v1, v9
+; CGP-NEXT: v_xor_b32_e32 v15, v0, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v19, v16
+; CGP-NEXT: v_xor_b32_e32 v17, v1, v9
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v19, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v18, v1
-; CGP-NEXT: v_mul_hi_u32 v15, v18, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v19, v0
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc
+; CGP-NEXT: v_mul_lo_u32 v13, v17, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v17, v0
; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v15, v19, v1
+; CGP-NEXT: v_mul_lo_u32 v16, v17, v1
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v18, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v19, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0
-; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13
-; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13
+; CGP-NEXT: v_mul_hi_u32 v18, v17, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2]
+; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0
+; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13
+; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4
-; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4
+; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
-; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4
+; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1]
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1]
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4
-; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1]
+; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14]
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4
+; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CGP-NEXT: v_mul_lo_u32 v5, v7, v0
-; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
@@ -2065,72 +2065,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v9
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1
+; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc
+; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2]
+; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc
+; CGP-NEXT: v_xor_b32_e32 v13, v5, v9
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8]
+; CGP-NEXT: v_xor_b32_e32 v1, v12, v9
+; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_xor_b32_e32 v7, v2, v12
+; CGP-NEXT: v_mul_lo_u32 v2, v11, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v10, v5
+; CGP-NEXT: v_xor_b32_e32 v8, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v3, v10, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v8, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v5
; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v10, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v8, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v11, v5
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0
+; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CGP-NEXT: v_mul_hi_u32 v9, v7, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT: v_mul_lo_u32 v9, v8, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v8, v3
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v2
+; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5
+; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4]
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
+; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4
; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -2150,10 +2150,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -2189,130 +2189,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0
; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1
-; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0
-; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
+; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0
+; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
; CHECK-NEXT: v_trunc_f32_e32 v7, v5
; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7
-; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6
-; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3]
+; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5
+; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7
-; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc
+; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0
; CHECK-NEXT: v_mov_b32_e32 v2, v6
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3]
-; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9
-; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7]
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc
-; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9
-; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5
-; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6
-; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9
-; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3]
+; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11
+; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7]
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc
+; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11
+; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5
+; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8
+; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11
+; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6
+; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6
+; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6
+; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc
-; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2
-; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc
+; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3
+; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3
+; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3
+; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4
+; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4]
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4]
-; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4
+; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4]
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5]
+; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1
-; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
+; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc
; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
-; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc
+; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
+; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc
; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6
; CHECK-NEXT: ; implicit-def: $vgpr3
; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -2357,85 +2357,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5
; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7
-; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5
-; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc
; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; GISEL-NEXT: v_trunc_f32_e32 v12, v10
; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10
-; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; GISEL-NEXT: v_mov_b32_e32 v4, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11
-; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11
+; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10
; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11
; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6
; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10
-; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11
+; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
@@ -2444,127 +2444,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11
+; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6
; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0
; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8
; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8
-; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8
; GISEL-NEXT: v_mov_b32_e32 v0, v10
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1]
-; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6
-; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10
+; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1]
+; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11]
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1
; GISEL-NEXT: v_trunc_f32_e32 v13, v10
; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1
-; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13
; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0
+; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc
; GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10
-; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0
-; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10
+; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0
+; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
-; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12
+; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5
-; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc
+; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7
; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9]
; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7
; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11
+; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2]
; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4
; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10]
-; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10]
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11
+; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5
+; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9
+; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0
; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2
; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
@@ -2573,19 +2573,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2
+; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1]
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc
; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -2641,103 +2641,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v1, v4, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0
; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1
-; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0
-; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc
+; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
+; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v12, v10
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12
-; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v17, v13, v11
-; CGP-NEXT: v_mul_lo_u32 v18, v16, v11
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v10
+; CGP-NEXT: v_mul_lo_u32 v12, v15, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v13, v11
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_mul_hi_u32 v11, v15, v13
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v18, v10
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT: v_mul_hi_u32 v11, v16, v11
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v12, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4
+; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc
+; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0
; CGP-NEXT: v_mov_b32_e32 v4, v11
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14
-; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v4, v14
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v10
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v11
-; CGP-NEXT: v_xor_b32_e32 v15, v8, v14
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v10
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16
+; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12]
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc
+; CGP-NEXT: v_xor_b32_e32 v14, v4, v16
+; CGP-NEXT: v_mul_lo_u32 v4, v18, v10
+; CGP-NEXT: v_mul_lo_u32 v9, v15, v13
+; CGP-NEXT: v_xor_b32_e32 v17, v8, v16
+; CGP-NEXT: v_mul_hi_u32 v8, v15, v10
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v10
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v16, v11
+; CGP-NEXT: v_mul_lo_u32 v8, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
+; CGP-NEXT: v_mul_hi_u32 v9, v15, v13
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v16, v11
+; CGP-NEXT: v_mul_hi_u32 v10, v18, v13
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v15, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v11, v12, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v15, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v15, v8
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc
+; CGP-NEXT: v_mul_lo_u32 v9, v17, v4
+; CGP-NEXT: v_mul_lo_u32 v10, v14, v8
+; CGP-NEXT: v_mul_hi_u32 v11, v14, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v17, v4
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v15, v8
+; CGP-NEXT: v_mul_lo_u32 v11, v17, v8
; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_mul_hi_u32 v10, v12, v8
+; CGP-NEXT: v_mul_hi_u32 v10, v14, v8
; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v11, v17, v8
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10]
-; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc
-; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10]
+; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc
+; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0
@@ -2750,11 +2750,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1
; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0
-; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5]
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
@@ -2762,10 +2762,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v14
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v14
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v16
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v16
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
+; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc
; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12
; CGP-NEXT: ; implicit-def: $vgpr8
; CGP-NEXT: .LBB8_2: ; %Flow1
@@ -2815,117 +2815,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_xor_b32_e32 v3, v4, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6
; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
; CGP-NEXT: v_trunc_f32_e32 v6, v6
; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5]
; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v8
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v9
-; CGP-NEXT: v_mul_lo_u32 v15, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v10, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9
+; CGP-NEXT: v_mul_hi_u32 v9, v13, v11
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT: v_mul_hi_u32 v10, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0
; CGP-NEXT: v_mov_b32_e32 v4, v9
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12
-; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10]
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc
-; CGP-NEXT: v_xor_b32_e32 v7, v4, v12
+; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5]
+; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14
+; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10]
+; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc
+; CGP-NEXT: v_xor_b32_e32 v10, v4, v14
; CGP-NEXT: v_mul_lo_u32 v4, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v10, v11, v9
-; CGP-NEXT: v_xor_b32_e32 v13, v5, v12
-; CGP-NEXT: v_mul_hi_u32 v5, v11, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v13, v11
+; CGP-NEXT: v_xor_b32_e32 v12, v5, v14
+; CGP-NEXT: v_mul_hi_u32 v5, v13, v8
; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v6, v9
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v11, v9
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v11
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT: v_mul_hi_u32 v7, v13, v11
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT: v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v8, v6, v11
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; CGP-NEXT: v_mul_lo_u32 v6, v13, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v7, v5
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v13, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v8, v10, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v12, v4
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v9, v13, v5
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_mul_hi_u32 v8, v7, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_mul_lo_u32 v8, v12, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_mul_hi_u32 v7, v10, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v12, v5
+; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6]
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6]
-; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6
+; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6]
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7]
+; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc
+; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc
; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
-; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
@@ -2934,11 +2934,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v12
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v12
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v14
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v14
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc
; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10
; CGP-NEXT: ; implicit-def: $vgpr5
; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7]
@@ -3000,15 +3000,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3
-; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v1, v1
; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1
; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v4
+; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0
+; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0
; CGP-NEXT: v_mul_lo_u32 v0, v1, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3
@@ -3032,71 +3032,71 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
-; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v1
; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3
; GISEL-NEXT: v_trunc_f32_e32 v5, v5
; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v3
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
; GISEL-NEXT: v_mov_b32_e32 v3, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v5, v[3:4]
; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v7
; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v10
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v13, v3
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v5, v10
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v3
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0
; GISEL-NEXT: v_mov_b32_e32 v3, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v5, v[3:4]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v5, v[3:4]
; GISEL-NEXT: v_mul_lo_u32 v3, v5, v7
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[8:9]
; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v0, v12, v7
+; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10
; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v3, v5, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v10
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v3, vcc
; GISEL-NEXT: v_mul_lo_u32 v7, 0, v0
; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
@@ -3111,126 +3111,126 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, 0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v8, 0
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v6
+; GISEL-NEXT: v_mul_hi_u32 v8, 0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[0:1]
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v10, v[6:7]
; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v5
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v11, v5
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v8, vcc
; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v9, v4
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v0
-; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v6
+; GISEL-NEXT: v_trunc_f32_e32 v6, v4
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v0
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6
+; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v8
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v14, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v9, v4
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT: v_mul_hi_u32 v15, v10, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v15
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[5:6]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v16, -1, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v0, v14, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v15, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v14, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v14, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v7
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, 0
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v7, v1
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v0
+; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v14, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v11, 0
+; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v9, v1
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v9, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v10, v[5:6]
+; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v6, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v14, v[0:1]
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v15, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v11, v[5:6]
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v7
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v9, v4
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v11, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v14, v4
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v11, v9, v5
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v12, v10, v5
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v14, v7
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v8, v0
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v7
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v7
; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v4, v0
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v11, v4
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v10, v0
-; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v9, v4, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v5, 0, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v13, v6, vcc
-; GISEL-NEXT: v_mul_hi_u32 v6, v2, v0
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v11, v0
+; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v14, v4, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v6, 0, v0
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v17, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v2, v0
; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v4
; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v11, 0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, 0, v4
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v11, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v0
; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v11, v[5:6]
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3
; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
@@ -3270,15 +3270,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0
; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4
-; CGP-NEXT: v_mul_lo_u32 v0, v0, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v1, v3
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v3
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1
+; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0
; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; CGP-NEXT: v_mul_lo_u32 v6, v1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index ba5a8e9c68a1f..4952e3aa8f0ce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -132,65 +132,65 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s9
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
@@ -203,51 +203,51 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3]
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0
-; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX8-NEXT: v_subb_u32_e64 v3, s[0:1], v8, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v3
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
-; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
-; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
+; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6
+; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1]
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7
-; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5
+; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7
+; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v5
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
-; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v14, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v15, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -258,6 +258,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-LABEL: udivrem_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s19
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s18
@@ -271,64 +272,63 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s19
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1
+; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
@@ -336,40 +336,40 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1
+; GFX9-NEXT: v_mov_b32_e32 v7, s19
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s17
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2]
+; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v8, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v1, s17
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3]
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v6
-; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1
+; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v1, v4, vcc
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v3
+; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v3
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s18, v2
-; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
-; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
-; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2
+; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v0, vcc
+; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6
+; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s18, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s18, v4
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -378,14 +378,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v7, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v10, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13]
+; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
@@ -1005,67 +1005,68 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
+; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0
; GFX8-NEXT: s_sub_u32 s2, 0, s14
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX8-NEXT: s_subb_u32 s3, 0, s15
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
-; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
+; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
@@ -1082,121 +1083,120 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v4, s13
-; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v8, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_sub_u32_e32 v10, vcc, s8, v0
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v7, v[2:3]
+; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v1, v4, vcc
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v4
; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v2, v3, s[0:1]
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15
; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s14
-; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc
+; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v1, v9, vcc
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f800000, v2
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
-; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
+; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v10
+; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc
; GFX8-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GFX8-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
; GFX8-NEXT: v_trunc_f32_e32 v3, v2
; GFX8-NEXT: v_mul_f32_e32 v2, 0xcf800000, v3
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
-; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v1
-; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v6
-; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v7, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v3
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v12, v[2:3]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX8-NEXT: v_mul_lo_u32 v3, v15, v1
-; GFX8-NEXT: v_mul_lo_u32 v17, v12, v2
-; GFX8-NEXT: v_mul_hi_u32 v5, v12, v1
-; GFX8-NEXT: v_mul_hi_u32 v1, v15, v1
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1
+; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v7
+; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v8, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
+; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v3
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14
+; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v18, v[2:3]
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13
+; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v15, v[3:4]
+; GFX8-NEXT: v_mul_lo_u32 v3, v18, v1
+; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v12, v9, vcc
+; GFX8-NEXT: v_mul_lo_u32 v4, v15, v5
+; GFX8-NEXT: v_mul_hi_u32 v9, v15, v1
+; GFX8-NEXT: v_mul_hi_u32 v1, v18, v1
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3
-; GFX8-NEXT: v_mul_hi_u32 v17, v12, v2
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1
-; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v17
-; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v17
-; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13
-; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
-; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v4, vcc
+; GFX8-NEXT: v_mul_lo_u32 v9, v18, v5
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT: v_mul_hi_u32 v4, v15, v5
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v9, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 1, v16
+; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v19, v2, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13
+; GFX8-NEXT: v_mul_hi_u32 v5, v18, v5
+; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v1
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v12, 0
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v2, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v13, v17, vcc
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v1
+; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v3, vcc
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, 0
+; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v9, vcc
; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[1:2]
-; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v12, v[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v6, v2, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v19, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, v15, v3
-; GFX8-NEXT: v_mul_lo_u32 v9, v12, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1]
-; GFX8-NEXT: v_mul_hi_u32 v8, v12, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v11, v20, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9
+; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[1:2]
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v12, vcc
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v2, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[2:3], s3, v15, v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v13, v19, vcc
+; GFX8-NEXT: v_mul_lo_u32 v8, v18, v3
+; GFX8-NEXT: v_mul_lo_u32 v9, v15, v6
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v10, v4, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v4, v15, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v20, vcc
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v8, v15, v4
-; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v8, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_mul_lo_u32 v8, v18, v6
+; GFX8-NEXT: v_mul_hi_u32 v3, v18, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4
+; GFX8-NEXT: v_mul_hi_u32 v9, v15, v6
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v9
; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9
-; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, v12, v3
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
+; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v8, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v18, v4, vcc
+; GFX8-NEXT: v_mul_lo_u32 v8, s11, v3
+; GFX8-NEXT: v_mul_lo_u32 v9, s10, v4
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v7, s[0:1]
; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3
; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v9
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1208,54 +1208,54 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v3, v0
; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v11, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1]
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v12, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v4, s11
; GFX8-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3
-; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc
-; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[0:1], s15, v11, v[7:8]
+; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v3
+; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v4, v9, vcc
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v9
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8
-; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1]
+; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s14, v7
+; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v3, vcc
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10
; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v9
; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10
; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
-; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v9
+; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v11
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
-; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v14
; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v7
+; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s14, v9
; GFX8-NEXT: v_subbrev_u32_e64 v0, s[0:1], 0, v0, s[0:1]
+; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v14, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v14, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v9, s4
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[0:1]
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
@@ -1274,65 +1274,66 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT: v_mov_b32_e32 v10, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1
-; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
-; GFX9-NEXT: v_add_u32_e32 v2, v6, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4
+; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4
+; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX9-NEXT: v_add_u32_e32 v1, v3, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
-; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2]
+; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0
; GFX9-NEXT: s_sub_u32 s2, 0, s6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
-; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
-; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3]
+; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4
; GFX9-NEXT: s_subb_u32 s3, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1
-; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
-; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1
-; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT: v_add_u32_e32 v5, v6, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1
; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0
@@ -1345,129 +1346,128 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mul_lo_u32 v4, s17, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_hi_u32 v3, s16, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, s17
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v4, s17
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2]
+; GFX9-NEXT: v_add3_u32 v9, v3, v2, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v9, v[1:2]
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0
-; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v4, v2, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v8, v[2:3]
+; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v6, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1
+; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0
-; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6
-; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2
-; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc
+; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v10, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3
; GFX9-NEXT: v_add_f32_e32 v2, v2, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1
-; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v6, vcc
+; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1
+; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v4, v3
; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4
; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2
-; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7
-; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11
-; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2
-; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v6, v12, v2
-; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2
+; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v8
+; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v9, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0
+; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14
+; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4]
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13
+; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5]
+; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v10, vcc
+; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6
+; GFX9-NEXT: v_mul_hi_u32 v10, v15, v2
+; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
-; GFX9-NEXT: v_add_u32_e32 v4, v17, v4
-; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3
-; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17
-; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GFX9-NEXT: v_add_u32_e32 v6, v6, v17
-; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
-; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10
-; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v5, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v18, v6
+; GFX9-NEXT: v_add_u32_e32 v4, v5, v4
+; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1]
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT: v_add_u32_e32 v5, v10, v5
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 1, v16
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc
+; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13
+; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2
-; GFX9-NEXT: v_add3_u32 v3, v6, v4, v3
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc
+; GFX9-NEXT: v_add3_u32 v4, v5, v4, v6
+; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, v5
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3]
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6]
-; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1]
-; GFX9-NEXT: v_mul_lo_u32 v7, v15, v4
-; GFX9-NEXT: v_mul_lo_u32 v8, v12, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc
-; GFX9-NEXT: v_mul_hi_u32 v10, v12, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v20, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3]
+; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6]
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc
+; GFX9-NEXT: v_mul_lo_u32 v6, v18, v4
+; GFX9-NEXT: v_mul_lo_u32 v8, v15, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1]
+; GFX9-NEXT: v_mul_hi_u32 v10, v15, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v14, v20, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v10, v15, v5
-; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4
-; GFX9-NEXT: v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT: v_mul_hi_u32 v8, v12, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_mul_lo_u32 v10, v18, v7
+; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4
+; GFX9-NEXT: v_add_u32_e32 v6, v8, v6
+; GFX9-NEXT: v_mul_hi_u32 v8, v15, v7
+; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
; GFX9-NEXT: v_add_u32_e32 v8, v10, v8
-; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT: v_add3_u32 v5, v8, v7, v5
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc
-; GFX9-NEXT: v_mul_lo_u32 v7, s19, v4
-; GFX9-NEXT: v_mul_lo_u32 v8, s18, v5
-; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT: v_add3_u32 v6, v8, v6, v7
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v15, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v18, v6, vcc
+; GFX9-NEXT: v_mul_lo_u32 v8, s19, v4
+; GFX9-NEXT: v_mul_lo_u32 v10, s18, v7
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1]
; GFX9-NEXT: v_mul_hi_u32 v1, s18, v4
; GFX9-NEXT: v_mul_hi_u32 v4, s19, v4
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v10
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v7, v1
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v5, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v7, s19, v5
+; GFX9-NEXT: v_mul_lo_u32 v5, s19, v7
; GFX9-NEXT: v_add_u32_e32 v1, v8, v1
-; GFX9-NEXT: v_mul_hi_u32 v8, s18, v5
-; GFX9-NEXT: v_mul_hi_u32 v12, s19, v5
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4
+; GFX9-NEXT: v_mul_hi_u32 v8, s18, v7
+; GFX9-NEXT: v_mul_hi_u32 v12, s19, v7
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1476,45 +1476,45 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v7, v0, v9, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v0, v10, v8
-; GFX9-NEXT: v_add3_u32 v8, v0, v1, v12
+; GFX9-NEXT: v_add3_u32 v10, v0, v1, v12
; GFX9-NEXT: v_mov_b32_e32 v0, v5
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v8, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v9, s19
+; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s6, v10, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v12, s19
; GFX9-NEXT: v_mov_b32_e32 v5, s7
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s7, v11, v[8:9]
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s18, v4
-; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v9, v0, vcc
+; GFX9-NEXT: v_subb_co_u32_e64 v9, s[0:1], v12, v0, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
; GFX9-NEXT: v_sub_u32_e32 v0, s19, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s6, v1
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1]
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v1
; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v0, vcc
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v10
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1]
; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v11
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
-; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1]
+; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v10, s[0:1]
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 1, v14
; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc
-; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v10
+; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s6, v8
; GFX9-NEXT: v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1]
; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v11, v5, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v14, s[0:1]
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, v14, s[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v8, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 7ea98a16e3b84..8fe19c85ade91 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
-; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19
-; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18
+; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19
+; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18
; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0
-; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3
-; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2
+; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3
+; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2
; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15]
; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23]
; GISEL-NEXT: v_mov_b32_e32 v22, v19
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3]
-; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15]
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2]
-; GISEL-NEXT: v_mov_b32_e32 v23, v14
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23]
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2]
-; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23]
-; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3]
+; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15]
+; GISEL-NEXT: v_mov_b32_e32 v2, v23
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2]
+; GISEL-NEXT: v_mov_b32_e32 v23, v25
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23]
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15]
+; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2]
+; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc
-; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28
-; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18
-; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5]
-; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4]
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1]
-; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc
+; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28
+; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2]
+; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28
; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4]
-; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33
-; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28
-; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33
+; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9]
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc
+; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc
; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33
; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28
-; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33
-; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7]
+; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33
+; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7]
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc
; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9]
; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc
@@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
-; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21
-; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20
+; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21
+; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20
; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0
; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0
-; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19
-; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18
+; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19
+; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18
; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23]
; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27]
-; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19]
-; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23]
-; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18]
-; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22]
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18]
-; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22]
-; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19]
+; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v18, v26
+; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18]
+; GISEL-NEXT: v_mov_b32_e32 v22, v28
+; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22]
+; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31]
+; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18]
+; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc
+; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18]
-; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17]
-; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19]
-; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
-; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5]
-; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, v[16:17]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = urem <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 189b897793381..11047ce5ae279 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -234,31 +234,31 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr8
@@ -272,17 +272,18 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
-; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: .LBB0_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB0_7: ; %Flow2
@@ -598,31 +599,31 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr9
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr8
@@ -636,17 +637,18 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
-; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v4
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc
; GISEL-NEXT: .LBB1_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
; GISEL-NEXT: .LBB1_7: ; %Flow2
@@ -955,31 +957,31 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr9
@@ -992,12 +994,14 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: .LBB2_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: .LBB2_7: ; %Flow2
@@ -1306,31 +1310,31 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0
; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr8
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6
; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr9
@@ -1343,12 +1347,14 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: .LBB3_6: ; %Flow1
; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
; GISEL-NEXT: .LBB3_7: ; %Flow2
@@ -1693,31 +1699,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB6_4: ; %Flow
@@ -2040,31 +2046,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0
; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5
; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10
; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7]
; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1]
; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7]
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
-; GISEL-NEXT: v_mov_b32_e32 v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
-; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9
; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
-; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11]
; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr5
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: .LBB7_4: ; %Flow
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 742d87f099ce4..ed3e38ddb1fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -5777,28 +5777,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX7-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5833,28 +5833,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX8-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5885,28 +5885,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX900-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX900-GISEL: ; %bb.0: ; %entry
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, vcc
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5]
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64:
@@ -5937,29 +5937,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) {
; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64:
; GFX90A-GISEL: ; %bb.0: ; %entry
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7]
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7]
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5]
; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6410,52 +6410,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
-; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc
+; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13
+; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6515,52 +6515,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc
; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc
-; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc
+; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13
+; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6612,52 +6612,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX900-GISEL: ; %bb.0: ; %entry
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11]
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc
; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1]
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1]
-; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0
-; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2]
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1]
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc
+; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13
+; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5]
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9]
+; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12]
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -6709,54 +6709,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) {
; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64:
; GFX90A-GISEL: ; %bb.0: ; %entry
; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11]
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0
-; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1]
-; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3]
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9]
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0
+; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1]
+; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5]
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0
; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0
-; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5]
-; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0
+; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5]
+; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6
; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
index 1ab4cb0f00192..d82d6bcb437cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll
@@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39
; GISEL12-NEXT: s_wait_kmcnt 0x0
; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1
-; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3
-; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5
-; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7
-; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
-; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
-; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13
-; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15
+; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3
+; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5
+; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7
+; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9
+; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11
+; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13
+; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15
; GISEL12-NEXT: s_mov_b32 exec_lo, s9
-; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41
+; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43
+; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45
+; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47
+; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49
+; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51
+; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53
+; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55
; GISEL12-NEXT: .LBB5_2: ; %tail
; GISEL12-NEXT: s_wait_alu 0xfffe
; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4
@@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr
; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51]
; GISEL10-NEXT: s_waitcnt lgkmcnt(0)
; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13]
-; GISEL10-NEXT: v_mov_b32_e32 v24, v0
-; GISEL10-NEXT: v_mov_b32_e32 v25, v1
-; GISEL10-NEXT: v_mov_b32_e32 v26, v2
-; GISEL10-NEXT: v_mov_b32_e32 v27, v3
-; GISEL10-NEXT: v_mov_b32_e32 v28, v4
-; GISEL10-NEXT: v_mov_b32_e32 v29, v5
-; GISEL10-NEXT: v_mov_b32_e32 v30, v6
-; GISEL10-NEXT: v_mov_b32_e32 v31, v7
-; GISEL10-NEXT: v_mov_b32_e32 v32, v8
-; GISEL10-NEXT: v_mov_b32_e32 v33, v9
-; GISEL10-NEXT: v_mov_b32_e32 v34, v10
-; GISEL10-NEXT: v_mov_b32_e32 v35, v11
-; GISEL10-NEXT: v_mov_b32_e32 v36, v12
-; GISEL10-NEXT: v_mov_b32_e32 v37, v13
-; GISEL10-NEXT: v_mov_b32_e32 v38, v14
-; GISEL10-NEXT: v_mov_b32_e32 v39, v15
+; GISEL10-NEXT: v_mov_b32_e32 v40, v0
+; GISEL10-NEXT: v_mov_b32_e32 v41, v1
+; GISEL10-NEXT: v_mov_b32_e32 v42, v2
+; GISEL10-NEXT: v_mov_b32_e32 v43, v3
+; GISEL10-NEXT: v_mov_b32_e32 v44, v4
+; GISEL10-NEXT: v_mov_b32_e32 v45, v5
+; GISEL10-NEXT: v_mov_b32_e32 v46, v6
+; GISEL10-NEXT: v_mov_b32_e32 v47, v7
+; GISEL10-NEXT: v_mov_b32_e32 v48, v8
+; GISEL10-NEXT: v_mov_b32_e32 v49, v9
+; GISEL10-NEXT: v_mov_b32_e32 v50, v10
+; GISEL10-NEXT: v_mov_b32_e32 v51, v11
+; GISEL10-NEXT: v_mov_b32_e32 v52, v12
+; GISEL10-NEXT: v_mov_b32_e32 v53, v13
+; GISEL10-NEXT: v_mov_b32_e32 v54, v14
+; GISEL10-NEXT: v_mov_b32_e32 v55, v15
; GISEL10-NEXT: s_mov_b32 exec_lo, s9
-; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec
+; GISEL10-NEXT: v_mov_b32_e32 v24, v40
+; GISEL10-NEXT: v_mov_b32_e32 v25, v41
+; GISEL10-NEXT: v_mov_b32_e32 v26, v42
+; GISEL10-NEXT: v_mov_b32_e32 v27, v43
+; GISEL10-NEXT: v_mov_b32_e32 v28, v44
+; GISEL10-NEXT: v_mov_b32_e32 v29, v45
+; GISEL10-NEXT: v_mov_b32_e32 v30, v46
+; GISEL10-NEXT: v_mov_b32_e32 v31, v47
+; GISEL10-NEXT: v_mov_b32_e32 v32, v48
+; GISEL10-NEXT: v_mov_b32_e32 v33, v49
+; GISEL10-NEXT: v_mov_b32_e32 v34, v50
+; GISEL10-NEXT: v_mov_b32_e32 v35, v51
+; GISEL10-NEXT: v_mov_b32_e32 v36, v52
+; GISEL10-NEXT: v_mov_b32_e32 v37, v53
+; GISEL10-NEXT: v_mov_b32_e32 v38, v54
+; GISEL10-NEXT: v_mov_b32_e32 v39, v55
; GISEL10-NEXT: .LBB5_2: ; %tail
; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GISEL10-NEXT: v_mov_b32_e32 v8, v24
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
index 98919f565d902..8dcd89956460e 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll
@@ -2473,10 +2473,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2494,10 +2495,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2516,8 +2518,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) {
; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64:
@@ -2618,9 +2620,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2646,9 +2648,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2669,12 +2671,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64:
@@ -2808,10 +2810,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -2845,10 +2847,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -2873,16 +2875,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64:
@@ -3060,31 +3062,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6]
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3131,31 +3131,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6]
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3196,32 +3194,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9]
; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64:
@@ -3542,63 +3540,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX7-GISEL: ; %bb.0: ; %entry
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v24, v25
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2]
-; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2]
+; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26]
; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0
; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10
; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6]
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2]
-; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1]
+; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2]
+; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5]
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64:
@@ -3687,63 +3685,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2]
-; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2]
+; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26]
; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6]
-; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2]
-; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1]
+; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2]
+; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5]
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64:
@@ -3819,65 +3817,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) {
; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4
+; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6
+; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8
+; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0
-; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14
+; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0
+; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18
+; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9]
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0
; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v20, v12, v[4:5]
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9]
; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9]
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10
+; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0
-; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3]
-; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2
+; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3]
+; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64:
>From d306cadcbf36c2b23d0bd5de232e082271151278 Mon Sep 17 00:00:00 2001
From: Abhay Kanhere <abhay at kanhere.net>
Date: Thu, 7 Aug 2025 13:51:17 -0700
Subject: [PATCH 3/4] Extend MachineVerifier early-clobber check to every operand (multiple defs).
---
llvm/lib/CodeGen/MachineVerifier.cpp | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index ebef1c9034f4a..867892a25d7c1 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2326,12 +2326,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
// Verify earlyClobber def operand
- if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) {
- if (!MI->getOperand(0).isReg())
- report("Early clobber must be a register", MI);
- if (!MI->getOperand(0).isEarlyClobber())
- report("Missing earlyClobber flag", MI);
- }
+ for (unsigned i = 0; i < MI->getNumOperands(); i++)
+ if (MCID.getOperandConstraint(i, MCOI::EARLY_CLOBBER) != -1) {
+ const MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg())
+ report("Early clobber must be a register", MI);
+ if (!Op.isEarlyClobber())
+ report("Missing earlyClobber flag", MI);
+ }
// Debug values must not have a slot index.
// Other instructions must have one, unless they are inside a bundle.
if (LiveInts) {
>From 562410743942b10b8e8421e9525a1a55ddad9660 Mon Sep 17 00:00:00 2001
From: Abhay Kanhere <abhay at kanhere.net>
Date: Thu, 7 Aug 2025 13:51:51 -0700
Subject: [PATCH 4/4] Update AMDGPU test expectations (GlobalISel/mul.ll, llvm.amdgcn.mfma.ll)
---
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 143 ++++++++++---------
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 38 ++---
2 files changed, 92 insertions(+), 89 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 59824917592fa..08aebcf0ff74c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -809,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
+; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
@@ -1218,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11]
-; GFX1250-NEXT: v_mov_b32_e32 v12, v1
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11]
+; GFX1250-NEXT: v_mov_b32_e32 v10, v1
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mov_b32_e32 v13, v10
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13]
+; GFX1250-NEXT: v_mov_b32_e32 v11, v12
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11]
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0
+; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
@@ -2874,86 +2874,87 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v14, 0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v12, 0
-; GFX1250-NEXT: v_mul_lo_u32 v27, v5, v10
-; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v1, v13, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[18:19]
+; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
+; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
+; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
+; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v1, v13, v[16:17]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v12, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v1, v11, v[16:17]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v2, v12, v[16:17]
+; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v0, v10, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v3, v11, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[16:17]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], vcc_lo, v3, v9, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v4, v10, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v5, v9, v[16:17]
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[16:17]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[20:21]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
-; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19]
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v10, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[16:17]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
+; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[16:17]
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[20:21]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v0, v13, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[16:17]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_mad_co_u64_u32 v[16:17], s0, v1, v12, v[20:21]
-; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18
-; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v0, v11, v[20:21]
-; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
+; GFX1250-NEXT: v_mov_b32_e32 v13, v18
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v0, v11, v[20:21]
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[16:17]
+; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
; GFX1250-NEXT: v_mad_nc_u64_u32 v[16:17], v0, v8, 0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v1, v10, v[18:19]
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21]
-; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19]
-; GFX1250-NEXT: v_dual_mov_b32 v18, v17 :: v_dual_mov_b32 v19, v24
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11]
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v21, s2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v0, v9, v[18:19]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v1, v10, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
+; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mov_b32_e32 v12, v17
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v0, v9, v[12:13]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v33, s2
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v15
-; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13]
-; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_mul_lo_u32 v9, v1, v14
-; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
+; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
; GFX1250-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, s2
-; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
+; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v1, v8, v[20:21]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v2, v10, s2
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v23, v0, s2
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v25, v0, s2
; GFX1250-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v1, v14
; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v9, s5
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v20, s4
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s3
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v32, s4
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v31, s3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v25, s1
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v27, s0
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v30, s1
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v29, s0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v22, vcc_lo
+; GFX1250-NEXT: v_add_co_ci_u32_e64 v0, null, v0, v24, vcc_lo
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v0
; GFX1250-NEXT: v_mov_b32_e32 v0, v16
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -3018,9 +3019,9 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
;
; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
@@ -3212,9 +3213,9 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
;
; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: global_load_b32 v2, v[2:3], off
+; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0
+; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 78be949baabac..1380bd927127b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -3238,14 +3238,15 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4604,14 +4605,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4760,16 +4761,17 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 1
+; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4984,15 +4986,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
-; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
More information about the llvm-commits
mailing list